Complete Model

Now we stack everything together: embeddings → transformer blocks → output layer.

```python
class CalculatorLLM(nn.Module):
    """A tiny transformer LLM for solving English math problems."""

    def __init__(
        self,
        vocab_size: int,
        embed_dim: int,
        num_heads: int,
        num_layers: int,
        ff_dim: int,
        max_seq_len: int,
        dropout: float = 0.1,
    ):
        super().__init__()
        self.max_seq_len = max_seq_len
        self.embedding = TokenEmbedding(vocab_size, embed_dim, max_seq_len, dropout)
        self.layers = nn.ModuleList(
            [
                TransformerBlock(embed_dim, num_heads, ff_dim, dropout)
                for _ in range(num_layers)
            ]
        )
        self.norm = nn.LayerNorm(embed_dim)
        self.output_proj = nn.Linear(embed_dim, vocab_size)

    def forward(
        self, x: torch.Tensor, mask: torch.Tensor | None = None
    ) -> torch.Tensor:
        if mask is None:
            seq_len = x.size(1)
            mask = create_causal_mask(seq_len).to(x.device)

        x = self.embedding(x)
        for layer in self.layers:
            x, _ = layer(x, mask)
        x = self.norm(x)
        return self.output_proj(x)

    def count_parameters(self) -> int:
        """Count total trainable parameters."""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)
```
For a sense of scale, here is our toy configuration next to GPT-2 Small:

| Component    | Our Model | GPT-2 Small |
|--------------|-----------|-------------|
| Vocab size   | 36        | 50,257      |
| Embed dim    | 64        | 768         |
| Num heads    | 4         | 12          |
| Num layers   | 2         | 12          |
| Total params | ~50K      | ~124M       |
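To sanity-check the stack, here is a minimal sketch that builds the model with the hyperparameters from the table and runs one dummy forward pass. Note that `ff_dim=128` and `max_seq_len=32` are assumed values (they aren't listed above), so your exact parameter count will differ a bit depending on what you chose.

```python
# Minimal sketch: instantiate the model and do a dummy forward pass.
# ff_dim and max_seq_len are assumed values, not taken from the table.
model = CalculatorLLM(
    vocab_size=36,
    embed_dim=64,
    num_heads=4,
    num_layers=2,
    ff_dim=128,      # assumed feed-forward width
    max_seq_len=32,  # assumed; long enough for a short word problem
)
print(f"{model.count_parameters():,} trainable parameters")

# Batch of 2 sequences, 16 tokens each, filled with random token IDs.
tokens = torch.randint(0, 36, (2, 16))
logits = model(tokens)
print(logits.shape)  # torch.Size([2, 16, 36]): one logit per vocab token at each position
```

Whatever you pick for `ff_dim` and `max_seq_len`, the count stays in the tens of thousands, thousands of times smaller than GPT-2 Small.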