How to Design Complex Deep Learning Tensor Pipelines Using Einops with Vision, Attention, and Multimodal Examples

section("6) pack unpack")
B, Cemb = 2, 128

class_token = torch.randn(B, 1, Cemb, device=device)
image_tokens = torch.randn(B, 196, Cemb, device=device)
text_tokens = torch.randn(B, 32, Cemb, device=device)
show_shape("class_token", class_token)
show_shape("image_tokens", image_tokens)
show_shape("text_tokens", text_tokens)

# Pack heterogeneous token sets along the "*" axis; ps records each input's
# extent there so unpack can split the sequence back apart later.
packed, ps = pack([class_token, image_tokens, text_tokens], "b * c")
show_shape("packed", packed)
print("packed_shapes (ps):", ps)
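To make the pattern concrete: with "b * c", pack concatenates along the middle axis while recording each input's extent there. A minimal sketch of that equivalence, using only the tensors defined above:

manual = torch.cat([class_token, image_tokens, text_tokens], dim=1)
assert torch.equal(packed, manual)
assert [tuple(s) for s in ps] == [(1,), (196,), (32,)]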

mixer = nn.Sequential(
    nn.LayerNorm(Cemb),
    nn.Linear(Cemb, 4 * Cemb),
    nn.GELU(),
    nn.Linear(4 * Cemb, Cemb),
).to(device)

mixed = mixer(packed)
show_shape("mixed", mixed)

class_out, image_out, text_out = unpack(mixed, ps, "b * c")
show_shape("class_out", class_out)
show_shape("image_out", image_out)
show_shape("text_out", text_out)
assert class_out.shape == class_token.shape
assert image_out.shape == image_tokens.shape
assert text_out.shape == text_tokens.shape
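The round trip preserves values as well as shapes: unpack simply slices the packed axis at the recorded boundaries, so each modality stays aligned with its slice of the mixed sequence. A quick sketch using the 1/196/32 split from above:

assert torch.equal(image_out, mixed[:, 1:1 + 196])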

section("7) layers")
class PatchEmbed(nn.Module):
    def __init__(self, in_channels=3, emb_dim=192, patch=8):
        super().__init__()
        self.patch = patch
        # Cut the image into non-overlapping patch x patch tiles and flatten each tile.
        self.to_patches = Rearrange("b c (h p1) (w p2) -> b (h w) (p1 p2 c)", p1=patch, p2=patch)
        self.proj = nn.Linear(in_channels * patch * patch, emb_dim)

    def forward(self, x):
        x = self.to_patches(x)
        return self.proj(x)

class SimpleVisionHead(nn.Module):
    def __init__(self, emb_dim=192, num_classes=10):
        super().__init__()
        # Mean-pool over the token axis, then classify the pooled embedding.
        self.pool = Reduce("b t c -> b c", reduction="mean")
        self.classifier = nn.Linear(emb_dim, num_classes)

    def forward(self, tokens):
        x = self.pool(tokens)
        return self.classifier(x)

patch_embed = PatchEmbed(in_channels=3, emb_dim=192, patch=8).to(device)
head = SimpleVisionHead(emb_dim=192, num_classes=10).to(device)

imgs = torch.randn(4, 3, 32, 32, device=device)
tokens = patch_embed(imgs)
logits = head(tokens)
show_shape("tokens", tokens)
show_shape("logits", logits)
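For intuition, this Rearrange + Linear patch embedding is shape-equivalent to a strided convolution with kernel size and stride equal to the patch size (the weights differ, so values won't match). A minimal sketch using the tensors above:

conv_embed = nn.Conv2d(3, 192, kernel_size=8, stride=8).to(device)
conv_tokens = rearrange(conv_embed(imgs), "b e h w -> b (h w) e")
assert conv_tokens.shape == tokens.shape  # (4, 16, 192): same token layout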

section("8) practical")
x = torch.randn(2, 32, 16, 16, device=device)
g = 8
# Split the 32 channels into g groups (cg = 4 each), folding groups into the batch axis.
xg = rearrange(x, "b (g cg) h w -> (b g) cg h w", g=g)
show_shape("x", x)
show_shape("xg", xg)

# Manual GroupNorm: per-(batch, group) statistics over channel and spatial dims.
mean = reduce(xg, "bg cg h w -> bg 1 1 1", "mean")
var = reduce((xg - mean) ** 2, "bg cg h w -> bg 1 1 1", "mean")
xg_norm = (xg - mean) / torch.sqrt(var + 1e-5)
x_norm = rearrange(xg_norm, "(b g) cg h w -> b (g cg) h w", b=2, g=g)
show_shape("x_norm", x_norm)
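As a cross-check, this should agree with PyTorch's built-in nn.GroupNorm when the affine parameters are disabled and eps matches; a sketch under those assumptions:

gn = nn.GroupNorm(num_groups=g, num_channels=32, eps=1e-5, affine=False).to(device)
assert torch.allclose(x_norm, gn(x), atol=1e-5)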

z = torch.randn(3, 64, 20, 30, device=device)
z_flat = rearrange(z, "b c h w -> b c (h w)")
z_unflat = rearrange(z_flat, "b c (h w) -> b c h w", h=20, w=30)
assert (z - z_unflat).abs().max().item() < 1e-6
show_shape("z_flat", z_flat)
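einops can infer one unknown axis per decomposition, so passing just h (or just w) is enough here; a quick sketch:

z_unflat2 = rearrange(z_flat, "b c (h w) -> b c h w", h=20)  # w = 600 / 20, inferred
assert z_unflat2.shape == z.shape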

section("9) views")
a = torch.randn(2, 3, 4, 5, device=device)
b = rearrange(a, "b c h w -> b h w c")
print("a.is_contiguous():", a.is_contiguous())
print("b.is_contiguous():", b.is_contiguous())
print("b._base is a:", getattr(b, "_base", None) is a)
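A pure axis permutation like the one above comes back as a non-contiguous view that shares a's storage (hence b._base is a), while a pattern that merges permuted axes forces a contiguous copy; a minimal sketch:

c = rearrange(a, "b c h w -> b (h w) c")  # permute, then reshape of a non-contiguous view -> copy
print("c shares a's storage:", c.data_ptr() == a.data_ptr())  # False here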

section("Done ✅ You now have reusable einops patterns for vision, attention, and multimodal token packing")


