FaiziRBLX committed on
Commit
110b8ce
·
verified ·
1 Parent(s): 8df10a1

Update best.py

Browse files
Files changed (1) hide show
  1. best.py +48 -17
best.py CHANGED
@@ -50,11 +50,10 @@ class ModelConfig:
50
  num_layers: int = 12
51
  num_attention_heads: int = 6
52
  num_key_value_heads: int = 2 # GQA
53
- # intermediate_size is a real stored field so old checkpoints load correctly.
54
- # Default -1 means: derive from hidden_size at __post_init__ time.
55
- # load_model() always overwrites this from the checkpoint's state dict shapes,
56
- # so a checkpoint trained with intermediate=960 will always load as 960.
57
- intermediate_size: int = -1
58
  max_position_embeddings: int = 2048
59
  rms_norm_eps: float = 1e-6
60
  rope_theta: float = 10000.0
@@ -68,10 +67,8 @@ class ModelConfig:
68
  label_smoothing: float = 0.1
69
 
70
  def __post_init__(self):
71
- # Derive intermediate_size when not explicitly set
72
- if self.intermediate_size == -1:
73
- # Original code used hidden * 3; keep that as default so new
74
- # training runs match the formula the user originally chose.
75
  self.intermediate_size = self.hidden_size * 3
76
  assert self.hidden_size % self.num_attention_heads == 0, \
77
  f"hidden_size {self.hidden_size} not divisible by num_heads {self.num_attention_heads}"
@@ -294,11 +291,16 @@ class GroupedQueryAttention(nn.Module):
294
  class SwiGLUMLP(nn.Module):
295
  def __init__(self, config: ModelConfig):
296
  super().__init__()
297
- self.hidden_size = config.hidden_size
298
- self.intermediate_size = config.intermediate_size # now a @property
299
- self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
300
- self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
301
- self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
 
 
 
 
 
302
 
303
  def forward(self, x):
304
  return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))
@@ -1068,6 +1070,15 @@ def generate_text(
1068
  """
1069
  model.eval()
1070
 
 
 
 
 
 
 
 
 
 
1071
  eos_id = tokenizer.eos_token_id or tokenizer.sep_token_id or 2
1072
  pad_id = tokenizer.pad_token_id or 0
1073
 
@@ -1281,7 +1292,14 @@ def run_benchmark(model, tokenizer, device, dataset_path: str = None, n: int = 2
1281
  print("No valid samples.")
1282
  return
1283
 
1284
- random.seed(42)
 
 
 
 
 
 
 
1285
  samples = random.sample(all_samples, min(n, len(all_samples)))
1286
  model.eval()
1287
 
@@ -1296,7 +1314,7 @@ def run_benchmark(model, tokenizer, device, dataset_path: str = None, n: int = 2
1296
  prompt = f"{inp} <cot>"
1297
 
1298
  full = generate_text(model, tokenizer, prompt=prompt, max_new_tokens=150,
1299
- temperature=0.3, top_k=20, top_p=0.9, device=device)
1300
  raw = full[len(prompt):].strip()
1301
  _, answer = _extract_thinking(raw)
1302
  answer_lower = answer.lower()
@@ -1448,7 +1466,20 @@ def main():
1448
  save_fp16 = not args.save_fp32
1449
  use_cot_training = not args.no_cot
1450
 
1451
- set_seed(args.seed)
 
 
 
 
 
 
 
 
 
 
 
 
 
1452
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
1453
  print(f"\nDevice: {device}")
1454
  if torch.cuda.is_available():
 
50
  num_layers: int = 12
51
  num_attention_heads: int = 6
52
  num_key_value_heads: int = 2 # GQA
53
+ # Stored as a plain int field, NEVER a @property, so pickle round-trips work.
54
+ # 0 = unset (load_model will fill it from checkpoint weight shapes).
55
+ # New training always passes this explicitly from len(tokenizer) / hidden_size.
56
+ intermediate_size: int = 0
 
57
  max_position_embeddings: int = 2048
58
  rms_norm_eps: float = 1e-6
59
  rope_theta: float = 10000.0
 
67
  label_smoothing: float = 0.1
68
 
69
  def __post_init__(self):
70
+ # Set intermediate_size only when not already provided
71
+ if self.intermediate_size <= 0:
 
 
72
  self.intermediate_size = self.hidden_size * 3
73
  assert self.hidden_size % self.num_attention_heads == 0, \
74
  f"hidden_size {self.hidden_size} not divisible by num_heads {self.num_attention_heads}"
 
291
  class SwiGLUMLP(nn.Module):
292
  def __init__(self, config: ModelConfig):
293
  super().__init__()
294
+ self.hidden_size = config.hidden_size
295
+ # Read intermediate_size defensively: if somehow 0 or negative (e.g. old
296
+ # unpickled config that missed __post_init__), fall back to hidden * 3.
297
+ inter = getattr(config, 'intermediate_size', 0)
298
+ if not isinstance(inter, int) or inter <= 0:
299
+ inter = self.hidden_size * 3
300
+ self.intermediate_size = inter
301
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
302
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
303
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
304
 
305
  def forward(self, x):
306
  return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))
 
1070
  """
1071
  model.eval()
1072
 
1073
+ # Reseed from OS entropy so repeated calls with the same prompt diverge.
1074
+ # This is the core fix: torch.multinomial outcome depends on torch RNG state,
1075
+ # which was frozen to seed=42 at startup. Each call now starts from a fresh, OS-seeded state.
1076
+ import os as _os
1077
+ _entropy = int.from_bytes(_os.urandom(4), 'little')
1078
+ torch.manual_seed(_entropy)
1079
+ if torch.cuda.is_available():
1080
+ torch.cuda.manual_seed_all(_entropy)
1081
+
1082
  eos_id = tokenizer.eos_token_id or tokenizer.sep_token_id or 2
1083
  pad_id = tokenizer.pad_token_id or 0
1084
 
 
1292
  print("No valid samples.")
1293
  return
1294
 
1295
+ # Time-based seed: different sample selection AND different generation each run
1296
+ import time
1297
+ live_seed = int(time.time() * 1000) % (2**31)
1298
+ random.seed(live_seed)
1299
+ torch.manual_seed(live_seed)
1300
+ if torch.cuda.is_available():
1301
+ torch.cuda.manual_seed_all(live_seed)
1302
+
1303
  samples = random.sample(all_samples, min(n, len(all_samples)))
1304
  model.eval()
1305
 
 
1314
  prompt = f"{inp} <cot>"
1315
 
1316
  full = generate_text(model, tokenizer, prompt=prompt, max_new_tokens=150,
1317
+ temperature=0.7, top_k=40, top_p=0.92, device=device)
1318
  raw = full[len(prompt):].strip()
1319
  _, answer = _extract_thinking(raw)
1320
  answer_lower = answer.lower()
 
1466
  save_fp16 = not args.save_fp32
1467
  use_cot_training = not args.no_cot
1468
 
1469
+ # Only fix the seed for training (reproducibility).
1470
+ # Chat and benchmark must NOT use a fixed seed — identical seeds produce identical
1471
+ # outputs every run, making the model feel like a lookup table.
1472
+ if args.train or args.finetune or args.continue_train:
1473
+ set_seed(args.seed)
1474
+ else:
1475
+ # Use a time-based seed so every run is different
1476
+ import time
1477
+ live_seed = int(time.time() * 1000) % (2**31)
1478
+ random.seed(live_seed)
1479
+ np.random.seed(live_seed)
1480
+ torch.manual_seed(live_seed)
1481
+ if torch.cuda.is_available():
1482
+ torch.cuda.manual_seed_all(live_seed)
1483
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
1484
  print(f"\nDevice: {device}")
1485
  if torch.cuda.is_available():