FirstChat / patch_train_nlp_h100_max2h_v8.diff
*** Begin Patch
*** Update hyperparameters and dataset sources for a 2h wall-clock budget
@@
-NUM_EPOCHS = 3
+MAX_TRAIN_HOURS = 2.0
+TIME_SAFETY_MARGIN_S = 180  # leave 3 min to save cleanly
+
+NUM_EPOCHS = 50  # the wall-clock limit is now the real safeguard
LEARNING_RATE = 3e-4
MIN_LR = 3e-5
WEIGHT_DECAY = 0.1
WARMUP_STEPS = 500
@@
-# Time target:
-# - from scratch: ~70_000 steps ≈ ~10–12 h depending on actual throughput
-# - from a checkpoint already around ~12k steps: ~85_000 steps ≈ ~10–12 h remaining
-MAX_STEPS = 85_000
-EVAL_EVERY = 1_000
-SAVE_EVERY = 2_000
+# Time target: short run, 2 h wall-clock max.
+# MAX_STEPS remains a secondary safety net; the primary stop is clock-driven.
+MAX_STEPS = 20_000
+EVAL_EVERY = 750
+SAVE_EVERY = 1_000
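+# Cap eval batches so periodic evals stay cheap within the 2 h budget.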
+EVAL_MAX_BATCHES = 64
@@
-USE_COMPILE = True
+USE_COMPILE = False  # for a short run, avoid the compile latency
@@
-TOKENIZER_SAMPLE_DOCS_PER_SOURCE = 15_000
-TOKENIZER_CHAR_LIMIT = 2_000
-TEXT_CHAR_LIMIT = 4_000
+TOKENIZER_SAMPLE_DOCS_PER_SOURCE = 4_000
+TOKENIZER_CHAR_LIMIT = 1_500
+TEXT_CHAR_LIMIT = 3_000
@@
-DATA_SOURCES = [
-    # 1. FineWeb (English, very high quality)
-    {
-        "name": "HuggingFaceFW/fineweb",
-        "config": None,
-        "split": "train",
-        "text_column": "text",
-        "dev_docs": 10_000,
-        "train_docs_per_epoch": 1_200_000,  # ~48 GB over 10 epochs
-        "language_filter": None,
-    },
-    # 2. C4 multilingual → French
-    {
-        "name": "allenai/c4",
-        "config": "multilingual",
-        "split": "train",
-        "text_column": "text",
-        "dev_docs": 5_000,
-        "train_docs_per_epoch": 400_000,  # ~16 GB over 10 epochs
-        "language_filter": "fr",
-    },
-    # 3. C4 multilingual → Arabic
-    {
-        "name": "allenai/c4",
-        "config": "multilingual",
-        "split": "train",
-        "text_column": "text",
-        "dev_docs": 5_000,
-        "train_docs_per_epoch": 300_000,  # ~12 GB over 10 epochs
-        "language_filter": "ar",
-    },
-]
+DATA_SOURCES = [
+    # Good-quality web text; a public sample that streams quickly
+    {
+        "name": "HuggingFaceFW/fineweb-edu",
+        "config": "sample-10BT",
+        "split": "train",
+        "text_column": "text",
+        "dev_docs": 1_000,
+        "train_docs_per_epoch": 60_000,
+        "language_filter": "en",
+    },
+    # Wikipedia EN / FR / AR
+    {
+        "name": "wikimedia/wikipedia",
+        "config": "20231101.en",
+        "split": "train",
+        "text_column": "text",
+        "dev_docs": 500,
+        "train_docs_per_epoch": 12_000,
+        "language_filter": None,
+    },
+    {
+        "name": "wikimedia/wikipedia",
+        "config": "20231101.fr",
+        "split": "train",
+        "text_column": "text",
+        "dev_docs": 500,
+        "train_docs_per_epoch": 8_000,
+        "language_filter": None,
+    },
+    {
+        "name": "wikimedia/wikipedia",
+        "config": "20231101.ar",
+        "split": "train",
+        "text_column": "text",
+        "dev_docs": 500,
+        "train_docs_per_epoch": 8_000,
+        "language_filter": None,
+    },
+    # Multilingual C4 to diversify the FR / AR web mix
+    {
+        "name": "allenai/c4",
+        "config": "multilingual",
+        "split": "train",
+        "text_column": "text",
+        "dev_docs": 1_000,
+        "train_docs_per_epoch": 18_000,
+        "language_filter": "fr",
+    },
+    {
+        "name": "allenai/c4",
+        "config": "multilingual",
+        "split": "train",
+        "text_column": "text",
+        "dev_docs": 1_000,
+        "train_docs_per_epoch": 14_000,
+        "language_filter": "ar",
+    },
+    # Optional, if you want a text+code mix:
+    # {
+    #     "name": "codeparrot/github-code",
+    #     "config": None,
+    #     "split": "train",
+    #     "text_column": "code",
+    #     "dev_docs": 300,
+    #     "train_docs_per_epoch": 5_000,
+    #     "language_filter": None,
+    # },
+]
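+# NOTE (assumption, not enforced in this patch): each entry is expected to be
+# streamed via datasets.load_dataset(name, config, split=split, streaming=True);
+# the loader code applies language_filter and the per-source document caps.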
@@
def safe_str(x) -> str:
    return x if isinstance(x, str) else ("" if x is None else str(x))
+
+def seconds_remaining(start_time: float) -> float:
+    # Wall-clock seconds left in the training budget (time.monotonic is
+    # immune to system clock changes; the module must import time).
+    return max(0.0, MAX_TRAIN_HOURS * 3600 - (time.monotonic() - start_time))
+
+def time_budget_exceeded(start_time: float) -> bool:
+    # True once only the safety margin is left, i.e. time to stop and save.
+    return seconds_remaining(start_time) <= TIME_SAFETY_MARGIN_S
@@
def main() -> None:
+    job_start = time.monotonic()
    ddp_device = init_distributed()
    set_seed(SEED + get_rank())
@@
    if is_main():
        print("=" * 72)
        print(" GPT ~1B | H100 80 GB | QLoRA + BF16 + TF32 | MAX 100 GB (public)")
        print("=" * 72)
+        print(f"Budget : {MAX_TRAIN_HOURS:.2f} h max | save margin: {TIME_SAFETY_MARGIN_S//60} min")
        print(f"Device : {device} | World: {get_world_size()} GPU(s)")
@@
-        val_loss = evaluate(model, eval_loader, device)
-        print(f"[eval] step {global_step:5d} | val_loss={val_loss:.4f}")
-        if val_loss < best_eval:
+        if seconds_remaining(job_start) > (TIME_SAFETY_MARGIN_S + 60):
+            val_loss = evaluate(model, eval_loader, device, max_batches=EVAL_MAX_BATCHES)
+        else:
+            val_loss = None
+            print(f"[eval] step {global_step:5d} | skipped (time budget too short)")
+        if val_loss is not None:
+            print(f"[eval] step {global_step:5d} | val_loss={val_loss:.4f}")
+        if val_loss is not None and val_loss < best_eval:
            best_eval = val_loss
            save_checkpoint(model, optimizer, epoch, global_step, best_eval, BEST_MODEL_FILE)
            print(f"✓ Best model → {BEST_MODEL_FILE}")
@@
    for micro_step, batch in enumerate(train_loader):
+        if time_budget_exceeded(job_start):
+            if is_main():
+                print(f"✓ Time budget reached (~{MAX_TRAIN_HOURS:.2f} h), stopping cleanly")
+            stop_training = True
+            break
+
        inp = batch["input_ids"].to(device, non_blocking=True)
        lbl = batch["labels"].to(device, non_blocking=True)
*** End Patch
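
Reviewer note: two pieces this patch relies on are not shown in the hunks
above. evaluate() must accept a max_batches cap, and the epoch loop must honor
the stop_training flag set inside the micro-step loop. Below is a minimal,
hypothetical sketch of both, assuming an HF-style model output and reusing the
constant names from the patch; it is not the repo's actual code.

    import time
    import torch

    MAX_TRAIN_HOURS = 2.0
    TIME_SAFETY_MARGIN_S = 180
    EVAL_MAX_BATCHES = 64

    def time_budget_exceeded(start_time: float) -> bool:
        # Same shape as the helper the patch adds to the script.
        return MAX_TRAIN_HOURS * 3600 - (time.monotonic() - start_time) <= TIME_SAFETY_MARGIN_S

    @torch.no_grad()
    def evaluate(model, eval_loader, device, max_batches=None):
        # Mean loss over at most max_batches batches (all batches when None).
        model.eval()
        total, n = 0.0, 0
        for i, batch in enumerate(eval_loader):
            if max_batches is not None and i >= max_batches:
                break
            inp = batch["input_ids"].to(device, non_blocking=True)
            lbl = batch["labels"].to(device, non_blocking=True)
            total += model(inp, labels=lbl).loss.item()  # assumes HF-style outputs
            n += 1
        model.train()
        return total / max(n, 1)

    def run_epochs(model, train_loader, num_epochs):
        # The inner break only leaves the micro-step loop; stop_training must
        # also break the epoch loop so the final checkpoint save can run.
        job_start = time.monotonic()
        stop_training = False
        for epoch in range(num_epochs):
            for micro_step, batch in enumerate(train_loader):
                if time_budget_exceeded(job_start):
                    stop_training = True
                    break
                ...  # forward / backward / optimizer step as in the script
            if stop_training:
                break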