Update best.py
Browse files
best.py
CHANGED
|
@@ -50,11 +50,10 @@ class ModelConfig:
|
|
| 50 |
num_layers: int = 12
|
| 51 |
num_attention_heads: int = 6
|
| 52 |
num_key_value_heads: int = 2 # GQA
|
| 53 |
-
#
|
| 54 |
-
#
|
| 55 |
-
#
|
| 56 |
-
|
| 57 |
-
intermediate_size: int = -1
|
| 58 |
max_position_embeddings: int = 2048
|
| 59 |
rms_norm_eps: float = 1e-6
|
| 60 |
rope_theta: float = 10000.0
|
|
@@ -68,10 +67,8 @@ class ModelConfig:
|
|
| 68 |
label_smoothing: float = 0.1
|
| 69 |
|
| 70 |
def __post_init__(self):
|
| 71 |
-
#
|
| 72 |
-
if self.intermediate_size =
|
| 73 |
-
# Original code used hidden * 3; keep that as default so new
|
| 74 |
-
# training runs match the formula the user originally chose.
|
| 75 |
self.intermediate_size = self.hidden_size * 3
|
| 76 |
assert self.hidden_size % self.num_attention_heads == 0, \
|
| 77 |
f"hidden_size {self.hidden_size} not divisible by num_heads {self.num_attention_heads}"
|
|
@@ -294,11 +291,16 @@ class GroupedQueryAttention(nn.Module):
|
|
| 294 |
class SwiGLUMLP(nn.Module):
|
| 295 |
def __init__(self, config: ModelConfig):
|
| 296 |
super().__init__()
|
| 297 |
-
self.hidden_size
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
|
| 303 |
def forward(self, x):
|
| 304 |
return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))
|
|
@@ -1068,6 +1070,15 @@ def generate_text(
|
|
| 1068 |
"""
|
| 1069 |
model.eval()
|
| 1070 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1071 |
eos_id = tokenizer.eos_token_id or tokenizer.sep_token_id or 2
|
| 1072 |
pad_id = tokenizer.pad_token_id or 0
|
| 1073 |
|
|
@@ -1281,7 +1292,14 @@ def run_benchmark(model, tokenizer, device, dataset_path: str = None, n: int = 2
|
|
| 1281 |
print("No valid samples.")
|
| 1282 |
return
|
| 1283 |
|
| 1284 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1285 |
samples = random.sample(all_samples, min(n, len(all_samples)))
|
| 1286 |
model.eval()
|
| 1287 |
|
|
@@ -1296,7 +1314,7 @@ def run_benchmark(model, tokenizer, device, dataset_path: str = None, n: int = 2
|
|
| 1296 |
prompt = f"{inp} <cot>"
|
| 1297 |
|
| 1298 |
full = generate_text(model, tokenizer, prompt=prompt, max_new_tokens=150,
|
| 1299 |
-
temperature=0.
|
| 1300 |
raw = full[len(prompt):].strip()
|
| 1301 |
_, answer = _extract_thinking(raw)
|
| 1302 |
answer_lower = answer.lower()
|
|
@@ -1448,7 +1466,20 @@ def main():
|
|
| 1448 |
save_fp16 = not args.save_fp32
|
| 1449 |
use_cot_training = not args.no_cot
|
| 1450 |
|
| 1451 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1452 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 1453 |
print(f"\nDevice: {device}")
|
| 1454 |
if torch.cuda.is_available():
|
|
|
|
| 50 |
num_layers: int = 12
|
| 51 |
num_attention_heads: int = 6
|
| 52 |
num_key_value_heads: int = 2 # GQA
|
| 53 |
+
# Stored as a plain int field — NEVER a @property — so pickle round-trips work.
|
| 54 |
+
# 0 = unset: __post_init__ then defaults it to hidden_size * 3 (load_model is expected to override it from checkpoint weight shapes).
|
| 55 |
+
# New training always passes this explicitly from len(tokenizer) / hidden_size.
|
| 56 |
+
intermediate_size: int = 0
|
|
|
|
| 57 |
max_position_embeddings: int = 2048
|
| 58 |
rms_norm_eps: float = 1e-6
|
| 59 |
rope_theta: float = 10000.0
|
|
|
|
| 67 |
label_smoothing: float = 0.1
|
| 68 |
|
| 69 |
def __post_init__(self):
|
| 70 |
+
# Set intermediate_size only when not already provided
|
| 71 |
+
if self.intermediate_size <= 0:
|
|
|
|
|
|
|
| 72 |
self.intermediate_size = self.hidden_size * 3
|
| 73 |
assert self.hidden_size % self.num_attention_heads == 0, \
|
| 74 |
f"hidden_size {self.hidden_size} not divisible by num_heads {self.num_attention_heads}"
|
|
|
|
| 291 |
class SwiGLUMLP(nn.Module):
    """SwiGLU feed-forward block.

    Computes ``down_proj(silu(gate_proj(x)) * up_proj(x))`` using three
    bias-free linear projections.

    Args:
        config: Object exposing ``hidden_size`` and, optionally,
            ``intermediate_size``. A missing, non-integral, or non-positive
            ``intermediate_size`` falls back to ``hidden_size * 3``.
    """

    def __init__(self, config: "ModelConfig"):
        # Function-scope import, matching the local-import style used elsewhere in this file.
        import numbers

        super().__init__()
        self.hidden_size = config.hidden_size
        # Read intermediate_size defensively: if somehow 0 or negative (e.g. old
        # unpickled config that missed __post_init__), fall back to hidden * 3.
        inter = getattr(config, 'intermediate_size', 0)
        if isinstance(inter, numbers.Integral) and inter > 0:
            # Accept any integral type (e.g. numpy.int64 restored from checkpoint
            # metadata) instead of silently discarding it, and store a plain int.
            inter = int(inter)
        else:
            inter = self.hidden_size * 3
        self.intermediate_size = inter
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)

    def forward(self, x):
        """Apply the gated MLP to ``x``, whose last dimension must be ``hidden_size``.

        Returns a tensor with the same shape as ``x``.
        """
        return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))
|
|
|
|
| 1070 |
"""
|
| 1071 |
model.eval()
|
| 1072 |
|
| 1073 |
+
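# Reseed the global torch RNG from OS entropy so repeated calls with the same prompt diverge.
# NOTE: this overwrites any seed set earlier in the process, so a caller that needs
# reproducible randomness after this call must reseed afterwards.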
|
| 1074 |
+
# This is the core fix: torch.multinomial outcome depends on torch RNG state,
|
| 1075 |
+
# which was frozen to seed=42 at startup. Each call now starts from a unique state.
|
| 1076 |
+
import os as _os
|
| 1077 |
+
_entropy = int.from_bytes(_os.urandom(4), 'little')
|
| 1078 |
+
torch.manual_seed(_entropy)
|
| 1079 |
+
if torch.cuda.is_available():
|
| 1080 |
+
torch.cuda.manual_seed_all(_entropy)
|
| 1081 |
+
|
| 1082 |
eos_id = tokenizer.eos_token_id or tokenizer.sep_token_id or 2
|
| 1083 |
pad_id = tokenizer.pad_token_id or 0
|
| 1084 |
|
|
|
|
| 1292 |
print("No valid samples.")
|
| 1293 |
return
|
| 1294 |
|
| 1295 |
+
# Time-based seed: different sample selection AND different generation each run
|
| 1296 |
+
import time
|
| 1297 |
+
live_seed = int(time.time() * 1000) % (2**31)
|
| 1298 |
+
random.seed(live_seed)
|
| 1299 |
+
torch.manual_seed(live_seed)
|
| 1300 |
+
if torch.cuda.is_available():
|
| 1301 |
+
torch.cuda.manual_seed_all(live_seed)
|
| 1302 |
+
|
| 1303 |
samples = random.sample(all_samples, min(n, len(all_samples)))
|
| 1304 |
model.eval()
|
| 1305 |
|
|
|
|
| 1314 |
prompt = f"{inp} <cot>"
|
| 1315 |
|
| 1316 |
full = generate_text(model, tokenizer, prompt=prompt, max_new_tokens=150,
|
| 1317 |
+
temperature=0.7, top_k=40, top_p=0.92, device=device)
|
| 1318 |
raw = full[len(prompt):].strip()
|
| 1319 |
_, answer = _extract_thinking(raw)
|
| 1320 |
answer_lower = answer.lower()
|
|
|
|
| 1466 |
save_fp16 = not args.save_fp32
|
| 1467 |
use_cot_training = not args.no_cot
|
| 1468 |
|
| 1469 |
+
# Only fix the seed for training (reproducibility).
|
| 1470 |
+
# Chat and benchmark must NOT use the fixed seed — identical seeds produce identical
|
| 1471 |
+
# outputs every run, making the model feel like a lookup table.
|
| 1472 |
+
if args.train or args.finetune or args.continue_train:
|
| 1473 |
+
set_seed(args.seed)
|
| 1474 |
+
else:
|
| 1475 |
+
# Use a time-based seed so every run is different
|
| 1476 |
+
import time
|
| 1477 |
+
live_seed = int(time.time() * 1000) % (2**31)
|
| 1478 |
+
random.seed(live_seed)
|
| 1479 |
+
np.random.seed(live_seed)
|
| 1480 |
+
torch.manual_seed(live_seed)
|
| 1481 |
+
if torch.cuda.is_available():
|
| 1482 |
+
torch.cuda.manual_seed_all(live_seed)
|
| 1483 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 1484 |
print(f"\nDevice: {device}")
|
| 1485 |
if torch.cuda.is_available():
|