Commit 2120bf6 · 1 Parent(s): 326b359
Fixes

Model_Architecture/config.json CHANGED
@@ -4,7 +4,7 @@
     "max_seq_len": 2048,
     "dtype": "bf16",
     "scale_fmt": null,
-    "vocab_size":
+    "vocab_size": 32768,
     "dim": 1024,
     "inter_dim": 4096,
     "moe_inter_dim": 1024,
@@ -27,7 +27,7 @@
     "beta_fast": 32,
     "beta_slow": 1,
     "mscale": 1.0,
-    "tokenizer_name": "
+    "tokenizer_name": "turkish"
   },
   "training": {
     "learning_rate": 3e-4,
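
Note: generation.py (below) reads these keys off an args object, so the script presumably flattens this JSON into attributes. A minimal sketch of reading the values touched by this commit; the name of the block that holds the model keys is an assumption, only "training" is visible in the diff:

import json
from pathlib import Path

# Hypothetical sketch: the model keys sit in one nested block (name
# assumed here), with "training" as a sibling block per the diff.
cfg = json.loads(Path("config.json").read_text())
model_cfg = cfg.get("model", cfg)           # fall back to top level
print(model_cfg["vocab_size"])              # 32768 after this commit
print(model_cfg["tokenizer_name"])          # "turkish"
print(cfg["training"]["learning_rate"])     # 3e-4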
Model_Architecture/generation.py CHANGED
@@ -164,7 +164,7 @@ if __name__ == "__main__":
     from pathlib import Path
 
     # Configuration: Set to True to use Turkish tokenizer, False for tiktoken
-    USE_TURKISH_TOKENIZER =
+    USE_TURKISH_TOKENIZER = True  # Change this to False for English text generation
 
     # Example configuration - smaller model for testing
     config_path = Path("config.json")
@@ -179,15 +179,20 @@ if __name__ == "__main__":
 
     # Initialize tokenizer
     tokenizer_name = getattr(args, "tokenizer_name", "gpt2")
+    # Auto-detect Turkish tokenizer from config
+    use_turkish = (tokenizer_name.lower() == "turkish") or USE_TURKISH_TOKENIZER
+
     tokenizer = get_tokenizer(
-        use_turkish=
-        tokenizer_name=tokenizer_name
+        use_turkish=use_turkish,
+        tokenizer_name="gpt2" if use_turkish else tokenizer_name
     )
 
     # Update vocab size if using Turkish tokenizer
-    if
-        args.vocab_size
-
+    if use_turkish and isinstance(tokenizer, TurkishTokenizerWrapper):
+        if args.vocab_size != tokenizer.n_vocab:
+            print(f"⚠️ Config vocab_size ({args.vocab_size:,}) doesn't match tokenizer ({tokenizer.n_vocab:,})")
+            args.vocab_size = tokenizer.n_vocab
+            print(f"📊 Updated vocab_size to {args.vocab_size:,} for Turkish tokenizer")
 
     # Initialize model
     print("Initializing model...")
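
Note: get_tokenizer and TurkishTokenizerWrapper come from elsewhere in the repo; the diff only shows how they are called. A minimal sketch of an interface consistent with that call site, so the vocab-sync logic above is readable in isolation. The wrapper internals, the Hugging Face backend, and the model name are assumptions; only .n_vocab and the two keyword arguments are taken from the diff:

import tiktoken

class TurkishTokenizerWrapper:
    """Illustrative stand-in for the repo's wrapper, not its actual code."""

    def __init__(self, tokenizer):
        self._tok = tokenizer                   # e.g. a HF tokenizer (assumed)
        self.n_vocab = tokenizer.vocab_size     # the attribute generation.py reads

    def encode(self, text: str) -> list[int]:
        return self._tok.encode(text)

    def decode(self, ids: list[int]) -> str:
        return self._tok.decode(ids)

def get_tokenizer(use_turkish: bool, tokenizer_name: str):
    if use_turkish:
        from transformers import AutoTokenizer  # assumed backend
        return TurkishTokenizerWrapper(
            AutoTokenizer.from_pretrained("some-turkish-tokenizer"))  # placeholder name
    # The in-file comment says "False for tiktoken"; "gpt2" is a valid encoding name
    return tiktoken.get_encoding(tokenizer_name)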
Model_Architecture/model.py CHANGED
@@ -190,7 +190,8 @@ class RMSNorm(nn.Module):
         self.weight = nn.Parameter(torch.ones(dim))
 
     def forward(self, x: torch.Tensor):
-
+        output = F.rms_norm(x, (self.dim,), self.weight, self.eps)
+        return output.to(x.dtype)
 
 
 #####################################
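
Note: the rewritten forward delegates to torch's built-in F.rms_norm (added in recent PyTorch releases) and casts back to the input dtype. A quick equivalence check against the usual manual RMSNorm formulation, assuming the standard definition:

import torch
import torch.nn.functional as F

dim, eps = 1024, 1e-6
x = torch.randn(2, 8, dim)
weight = torch.ones(dim)

# Manual RMSNorm: x / sqrt(mean(x^2) + eps) * weight
manual = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps) * weight

# Built-in: same formula, eps inside the sqrt as well
builtin = F.rms_norm(x, (dim,), weight, eps)

print(torch.allclose(manual, builtin, atol=1e-5))  # True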
@@ -228,8 +229,8 @@ class MultiHeadLatentAttention(nn.Module):
         self.softmax_scale = self.softmax_scale * mscale * mscale
 
 
-        self.register_buffer("kv_cache", torch.zeros(args.max_batch_size, args.max_seq_len, self.kv_lora_rank), persistent=False)
-        self.register_buffer("pe_cache", torch.zeros(args.max_batch_size, args.max_seq_len, self.qk_rope_head_dim), persistent=False)
+        self.register_buffer("kv_cache", torch.zeros(args.max_batch_size, args.max_seq_len, self.kv_lora_rank, dtype=Linear.dtype), persistent=False)
+        self.register_buffer("pe_cache", torch.zeros(args.max_batch_size, args.max_seq_len, self.qk_rope_head_dim, dtype=Linear.dtype), persistent=False)
 
     def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor]):
 
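
Note: pinning the caches to Linear.dtype matters because forward presumably writes projected activations into these buffers and then matmuls queries against them; slicing an fp32 cache against bf16 activations raises a dtype-mismatch RuntimeError. A toy illustration of the pattern, with illustrative shapes and names:

import torch

bsz, seqlen, rank = 2, 16, 64
compute_dtype = torch.bfloat16  # stand-in for Linear.dtype

# Cache allocated in the compute dtype, mirroring the diff
kv_cache = torch.zeros(bsz, 128, rank, dtype=compute_dtype)

new_kv = torch.randn(bsz, seqlen, rank, dtype=compute_dtype)
kv_cache[:, :seqlen] = new_kv  # dtype-consistent write, no silent upcast

q = torch.randn(bsz, seqlen, rank, dtype=compute_dtype)
scores = torch.einsum("bsr,btr->bst", q, kv_cache[:, :seqlen])  # ok: same dtype

# Had kv_cache been fp32, this einsum would raise
# "expected scalar type BFloat16 but found Float".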
@@ -280,7 +281,7 @@ class Gate(nn.Module):
         self.route_scale = args.route_scale
 
         # Gate weight
-        self.weight = nn.Parameter(torch.empty(args.n_routed_experts, args.dim))
+        self.weight = nn.Parameter(torch.empty(args.n_routed_experts, args.dim, dtype=Linear.dtype))
 
         # Optional routing bias for fine-tuning expert selection
         if args.use_routing_bias:
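
Note: this weight scores each token against the routed experts, so moving it to Linear.dtype keeps the scoring matmul in the same precision as the rest of the model. A generic top-k routing sketch using such a weight; this is the standard MoE gating pattern, not Gate's exact forward, which the diff does not show:

import torch
import torch.nn.functional as F

dim, n_routed_experts, top_k = 1024, 8, 2
compute_dtype = torch.bfloat16  # stand-in for Linear.dtype

weight = torch.randn(n_routed_experts, dim, dtype=compute_dtype)
x = torch.randn(4, dim, dtype=compute_dtype)  # 4 tokens

scores = F.linear(x, weight)                          # (tokens, experts), same dtype
probs = scores.softmax(dim=-1, dtype=torch.float32)   # softmax in fp32 for stability
topk_w, topk_idx = probs.topk(top_k, dim=-1)          # chosen experts per token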
@@ -509,7 +510,7 @@ class ismail(nn.Module):
 
     def forward(self, tokens: torch.Tensor, start_pos: int = 0) -> torch.Tensor:
         bsz, seqlen = tokens.shape
-        h = self.tok_embeddings(tokens)
+        h = self.tok_embeddings(tokens).to(Linear.dtype)
         freqs_cis = self.freqs_cis[start_pos:start_pos + seqlen]
 
         # Create causal mask