rubenaghayan committed on
Commit
ddb0136
·
1 Parent(s): 84f0b80

initial calculator

Browse files
Files changed (3) hide show
  1. calculator.py +0 -0
  2. defaults.py +5 -5
  3. state.py +17 -3
calculator.py ADDED
File without changes
defaults.py CHANGED
@@ -1,10 +1,10 @@
1
  from state import ModelState
2
 
3
- GEMMA3_270M = ModelState(vocab_size=256000, num_layers=9, hidden_size=1152, intermediate_size=4608)
4
- GEMMA3_1B = ModelState(vocab_size=262208, num_layers=26, hidden_size=2304, intermediate_size=9216)
5
- GEMMA3_4B = ModelState(vocab_size=262208, num_layers=28, hidden_size=3072, intermediate_size=12288)
6
- GEMMA3_12B = ModelState(vocab_size=262208, num_layers=42, hidden_size=4608, intermediate_size=18432)
7
- GEMMA3_27B = ModelState(vocab_size=262208, num_layers=46, hidden_size=6144, intermediate_size=24576)
8
 
9
  DEFAULTS = {
10
  "Gemma3 270M": GEMMA3_270M,
 
1
  from state import ModelState
2
 
3
+ GEMMA3_270M = ModelState(vocab_size=256000, num_layers=9, hidden_dim=1152, intermediate_size=4608)
4
+ GEMMA3_1B = ModelState(vocab_size=262208, num_layers=26, hidden_dim=2304, intermediate_size=9216)
5
+ GEMMA3_4B = ModelState(vocab_size=262208, num_layers=28, hidden_dim=3072, intermediate_size=12288)
6
+ GEMMA3_12B = ModelState(vocab_size=262208, num_layers=42, hidden_dim=4608, intermediate_size=18432)
7
+ GEMMA3_27B = ModelState(vocab_size=262208, num_layers=46, hidden_dim=6144, intermediate_size=24576)
8
 
9
  DEFAULTS = {
10
  "Gemma3 270M": GEMMA3_270M,
state.py CHANGED
@@ -1,8 +1,22 @@
1
  from dataclasses import dataclass
2
 
3
  @dataclass
4
- class ModelState:
5
  vocab_size: int
6
  num_layers: int
7
- hidden_size: int
8
- intermediate_size: int
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from dataclasses import dataclass
2
 
3
  @dataclass
4
+ class Model:
5
  vocab_size: int
6
  num_layers: int
7
+ hidden_dim: int
8
+ intermediate_size: int
9
+ weight_tied_embeddings: bool
10
+
11
+
12
+ @dataclass
13
+ class Parallelism:
14
+ tensor_parallelism: int
15
+ pipeline_parallelism: int
16
+ context_parallelism: int
17
+ expert_parallelism: int
18
+
19
+ @dataclass
20
+ class Training:
21
+ sequence_length: int
22
+ batch_size: int