ikaganacar committed · Commit 2120bf6 · 1 Parent(s): 326b359
Model_Architecture/config.json CHANGED
@@ -4,7 +4,7 @@
     "max_seq_len": 2048,
     "dtype": "bf16",
     "scale_fmt": null,
-    "vocab_size": 102400,
+    "vocab_size": 32768,
     "dim": 1024,
     "inter_dim": 4096,
     "moe_inter_dim": 1024,
@@ -27,7 +27,7 @@
     "beta_fast": 32,
     "beta_slow": 1,
     "mscale": 1.0,
-    "tokenizer_name": "gpt2"
+    "tokenizer_name": "turkish"
   },
   "training": {
     "learning_rate": 3e-4,
Model_Architecture/generation.py CHANGED
@@ -164,7 +164,7 @@ if __name__ == "__main__":
     from pathlib import Path
 
     # Configuration: Set to True to use Turkish tokenizer, False for tiktoken
-    USE_TURKISH_TOKENIZER = False  # Change this to True for Turkish text generation
+    USE_TURKISH_TOKENIZER = True  # Change this to False for English text generation
 
     # Example configuration - smaller model for testing
     config_path = Path("config.json")
@@ -179,15 +179,20 @@ if __name__ == "__main__":
 
     # Initialize tokenizer
     tokenizer_name = getattr(args, "tokenizer_name", "gpt2")
+    # Auto-detect Turkish tokenizer from config
+    use_turkish = (tokenizer_name.lower() == "turkish") or USE_TURKISH_TOKENIZER
+
     tokenizer = get_tokenizer(
-        use_turkish=USE_TURKISH_TOKENIZER,
-        tokenizer_name=tokenizer_name
+        use_turkish=use_turkish,
+        tokenizer_name="gpt2" if use_turkish else tokenizer_name
     )
 
     # Update vocab size if using Turkish tokenizer
-    if USE_TURKISH_TOKENIZER and isinstance(tokenizer, TurkishTokenizerWrapper):
+    if use_turkish and isinstance(tokenizer, TurkishTokenizerWrapper):
+        if args.vocab_size != tokenizer.n_vocab:
+            print(f"⚠️ Config vocab_size ({args.vocab_size:,}) doesn't match tokenizer ({tokenizer.n_vocab:,})")
         args.vocab_size = tokenizer.n_vocab
         print(f"📊 Updated vocab_size to {args.vocab_size:,} for Turkish tokenizer")
 
     # Initialize model
     print("Initializing model...")
Model_Architecture/model.py CHANGED
@@ -190,7 +190,8 @@ class RMSNorm(nn.Module):
         self.weight = nn.Parameter(torch.ones(dim))
 
     def forward(self, x: torch.Tensor):
-        return F.rms_norm(x, (self.dim,), self.weight, self.eps)
+        output = F.rms_norm(x, (self.dim,), self.weight, self.eps)
+        return output.to(x.dtype)
 
 
 #####################################
@@ -228,8 +229,8 @@ class MultiHeadLatentAttention(nn.Module):
             self.softmax_scale = self.softmax_scale * mscale * mscale
 
 
-        self.register_buffer("kv_cache", torch.zeros(args.max_batch_size, args.max_seq_len, self.kv_lora_rank), persistent=False)
-        self.register_buffer("pe_cache", torch.zeros(args.max_batch_size, args.max_seq_len, self.qk_rope_head_dim), persistent=False)
+        self.register_buffer("kv_cache", torch.zeros(args.max_batch_size, args.max_seq_len, self.kv_lora_rank, dtype=Linear.dtype), persistent=False)
+        self.register_buffer("pe_cache", torch.zeros(args.max_batch_size, args.max_seq_len, self.qk_rope_head_dim, dtype=Linear.dtype), persistent=False)
 
     def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor]):
 
@@ -280,7 +281,7 @@ class Gate(nn.Module):
         self.route_scale = args.route_scale
 
         # Gate weight
-        self.weight = nn.Parameter(torch.empty(args.n_routed_experts, args.dim))
+        self.weight = nn.Parameter(torch.empty(args.n_routed_experts, args.dim, dtype=Linear.dtype))
 
         # Optional routing bias for fine-tuning expert selection
         if args.use_routing_bias:
@@ -509,7 +510,7 @@ class ismail(nn.Module):
 
     def forward(self, tokens: torch.Tensor, start_pos: int = 0) -> torch.Tensor:
         bsz, seqlen = tokens.shape
-        h = self.tok_embeddings(tokens)
+        h = self.tok_embeddings(tokens).to(Linear.dtype)
         freqs_cis = self.freqs_cis[start_pos:start_pos + seqlen]
 
         # Create causal mask
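All four model.py edits attack the same bug class: bf16 parameters mixed with fp32 buffers and activations. `F.rms_norm` follows PyTorch type promotion, so a bf16 input normalized against the fp32 `weight` parameter comes back as fp32 unless it is cast back to `x.dtype`; the attention caches, the gate weight, and the embedding output are likewise pinned to `Linear.dtype`. That attribute is not shown in this diff, but the usage matches the DeepSeek-V3-style convention of a class-level default dtype on a custom Linear layer. A sketch of that convention, with the bf16 default matching `"dtype": "bf16"` in config.json (an assumption about this repo's actual Linear):

import torch
from torch import nn
import torch.nn.functional as F

class Linear(nn.Module):
    """Custom linear layer whose class attribute fixes the model-wide dtype.

    Sketch only: every parameter and cache buffer is created with
    dtype=Linear.dtype, so flipping this one attribute (e.g. to
    torch.float32 for debugging) keeps the whole model dtype-consistent.
    """
    dtype = torch.bfloat16  # model-wide default; matches config.json

    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        self.weight = nn.Parameter(
            torch.empty(out_features, in_features, dtype=Linear.dtype)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Cast the input to the layer dtype so mixed-precision callers
        # cannot reintroduce the fp32/bf16 mismatch these commits fix.
        return F.linear(x.to(Linear.dtype), self.weight)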