*** Begin Patch
*** Update hyperparameters and dataset sources for a 2h wall-clock budget
@@
-NUM_EPOCHS       = 3
+MAX_TRAIN_HOURS      = 2.0
+TIME_SAFETY_MARGIN_S = 180   # leave 3 min for a clean final save
+
+NUM_EPOCHS       = 50        # the wall clock, not epochs, is the real guard now
 LEARNING_RATE    = 3e-4
 MIN_LR           = 3e-5
 WEIGHT_DECAY     = 0.1
 WARMUP_STEPS     = 500
@@
-# Objectif temps :
-# - depuis zéro : ~70_000 steps ≈ ~10–12 h selon le débit réel
-# - depuis un checkpoint déjà vers ~12k steps : ~85_000 steps ≈ ~10–12 h restantes
-MAX_STEPS        = 85_000
-EVAL_EVERY       = 1_000
-SAVE_EVERY       = 2_000
+# Time target: a short run, capped at 2 h wall-clock.
+# MAX_STEPS is only a secondary safeguard; the primary stop is clock-driven.
+MAX_STEPS        = 20_000
+EVAL_EVERY       = 750
+SAVE_EVERY       = 1_000
+EVAL_MAX_BATCHES = 64
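+# Sanity check: 20_000 steps in 2 h would need ~2.8 steps/s sustained; below
+# that throughput the wall clock, not MAX_STEPS, is what ends the run.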
@@
-USE_COMPILE       = True
+USE_COMPILE       = False   # skip compile latency on a short run
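+# torch.compile front-loads graph capture and kernel compilation, which can
+# take minutes on the first iterations; on a 2 h budget that warm-up rarely
+# pays for itself.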
@@
-TOKENIZER_SAMPLE_DOCS_PER_SOURCE = 15_000
-TOKENIZER_CHAR_LIMIT             = 2_000
-TEXT_CHAR_LIMIT                  = 4_000
+TOKENIZER_SAMPLE_DOCS_PER_SOURCE = 4_000
+TOKENIZER_CHAR_LIMIT             = 1_500
+TEXT_CHAR_LIMIT                  = 3_000
@@
-DATA_SOURCES = [
-    # 1. FineWeb (anglais – très haute qualité)
-    {
-        "name": "HuggingFaceFW/fineweb",
-        "config": None,
-        "split": "train",
-        "text_column": "text",
-        "dev_docs": 10_000,
-        "train_docs_per_epoch": 1_200_000,   # ~48 GB sur 10 epochs
-        "language_filter": None,
-    },
-    # 2. C4 multilingual → français
-    {
-        "name": "allenai/c4",
-        "config": "multilingual",
-        "split": "train",
-        "text_column": "text",
-        "dev_docs": 5_000,
-        "train_docs_per_epoch": 400_000,     # ~16 GB sur 10 epochs
-        "language_filter": "fr",
-    },
-    # 3. C4 multilingual → arabe
-    {
-        "name": "allenai/c4",
-        "config": "multilingual",
-        "split": "train",
-        "text_column": "text",
-        "dev_docs": 5_000,
-        "train_docs_per_epoch": 300_000,     # ~12 GB sur 10 epochs
-        "language_filter": "ar",
-    },
-]
+DATA_SOURCES = [
+    # Good-quality web text; a public sample that is fast to stream
+    {
+        "name": "HuggingFaceFW/fineweb-edu",
+        "config": "sample-10BT",
+        "split": "train",
+        "text_column": "text",
+        "dev_docs": 1_000,
+        "train_docs_per_epoch": 60_000,
+        "language_filter": "en",
+    },
+    # Wikipedia EN / FR / AR
+    {
+        "name": "wikimedia/wikipedia",
+        "config": "20231101.en",
+        "split": "train",
+        "text_column": "text",
+        "dev_docs": 500,
+        "train_docs_per_epoch": 12_000,
+        "language_filter": None,
+    },
+    {
+        "name": "wikimedia/wikipedia",
+        "config": "20231101.fr",
+        "split": "train",
+        "text_column": "text",
+        "dev_docs": 500,
+        "train_docs_per_epoch": 8_000,
+        "language_filter": None,
+    },
+    {
+        "name": "wikimedia/wikipedia",
+        "config": "20231101.ar",
+        "split": "train",
+        "text_column": "text",
+        "dev_docs": 500,
+        "train_docs_per_epoch": 8_000,
+        "language_filter": None,
+    },
+    # Multilingual C4 to diversify the FR / AR web mix
+    {
+        "name": "allenai/c4",
+        "config": "multilingual",
+        "split": "train",
+        "text_column": "text",
+        "dev_docs": 1_000,
+        "train_docs_per_epoch": 18_000,
+        "language_filter": "fr",
+    },
+    {
+        "name": "allenai/c4",
+        "config": "multilingual",
+        "split": "train",
+        "text_column": "text",
+        "dev_docs": 1_000,
+        "train_docs_per_epoch": 14_000,
+        "language_filter": "ar",
+    },
+    # Optional, if you want a text+code mix:
+    # {
+    #     "name": "codeparrot/github-code",
+    #     "config": None,
+    #     "split": "train",
+    #     "text_column": "code",
+    #     "dev_docs": 300,
+    #     "train_docs_per_epoch": 5_000,
+    #     "language_filter": None,
+    # },
+]
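+# The six active sources above total ~120_000 training docs per epoch.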
@@
 def safe_str(x) -> str:
     return x if isinstance(x, str) else ("" if x is None else str(x))
+
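+# Assumes `time` is already imported at the top of the script; the helpers
+# below use time.monotonic(), which is immune to system clock adjustments.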
+def seconds_remaining(start_time: float) -> float:
+    return max(0.0, MAX_TRAIN_HOURS * 3600 - (time.monotonic() - start_time))
+
+def time_budget_exceeded(start_time: float) -> bool:
+    return seconds_remaining(start_time) <= TIME_SAFETY_MARGIN_S
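+# Example: with MAX_TRAIN_HOURS = 2.0 and TIME_SAFETY_MARGIN_S = 180,
+# time_budget_exceeded() flips to True ~1 h 57 min after job_start.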
@@
 def main() -> None:
+    job_start = time.monotonic()
     ddp_device = init_distributed()
     set_seed(SEED + get_rank())
@@
     if is_main():
         print("=" * 72)
         print(" GPT ~1B | H100 80 Go | QLoRA + BF16 + TF32 | MAX 100 GB (public)")
         print("=" * 72)
+        print(f"Budget  : {MAX_TRAIN_HOURS:.2f} h max | marge save: {TIME_SAFETY_MARGIN_S//60} min")
         print(f"Device  : {device} | World: {get_world_size()} GPU(s)")
@@
-                val_loss = evaluate(model, eval_loader, device)
-                print(f"[eval] step {global_step:5d} | val_loss={val_loss:.4f}")
-                if val_loss < best_eval:
+                # Keep 60 s of slack beyond the save margin before paying for eval.
+                if seconds_remaining(job_start) > (TIME_SAFETY_MARGIN_S + 60):
+                    val_loss = evaluate(model, eval_loader, device, max_batches=EVAL_MAX_BATCHES)
+                else:
+                    val_loss = None
+                    print(f"[eval] step {global_step:5d} | skip (time budget too short)")
+                if val_loss is not None:
+                    print(f"[eval] step {global_step:5d} | val_loss={val_loss:.4f}")
+                if val_loss is not None and val_loss < best_eval:
                     best_eval = val_loss
                     save_checkpoint(model, optimizer, epoch, global_step, best_eval, BEST_MODEL_FILE)
                     print(f"✓ Meilleur modèle → {BEST_MODEL_FILE}")
@@
         for micro_step, batch in enumerate(train_loader):
+            if time_budget_exceeded(job_start):
+                if is_main():
+                    print(f"✓ Budget temps atteint (~{MAX_TRAIN_HOURS:.2f} h) — arrêt propre")
+                stop_training = True
+                break
+
             inp = batch["input_ids"].to(device, non_blocking=True)
             lbl = batch["labels"].to(device, non_blocking=True)
*** End Patch
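
Notes for whoever applies this patch (commentary, not part of the diff):

1. The eval hunk now calls evaluate(..., max_batches=EVAL_MAX_BATCHES), but no
   hunk changes evaluate() itself. If your copy still has the old
   evaluate(model, eval_loader, device) signature, it needs a matching update.
   A minimal sketch, assuming an HF-style forward that returns an object with
   a .loss attribute (adapt that line to your model's forward):

       import torch  # already imported by the script; repeated so the sketch stands alone

       @torch.no_grad()
       def evaluate(model, eval_loader, device, max_batches=None):
           model.eval()
           total, count = 0.0, 0
           for i, batch in enumerate(eval_loader):
               if max_batches is not None and i >= max_batches:
                   break  # cap eval cost to stay inside the wall-clock budget
               inp = batch["input_ids"].to(device, non_blocking=True)
               lbl = batch["labels"].to(device, non_blocking=True)
               total += model(inp, labels=lbl).loss.item()
               count += 1
           model.train()
           return total / max(count, 1)

2. The budget check sets stop_training = True before breaking out of the batch
   loop, so the enclosing epoch loop must initialize that flag and honor it,
   then save once more on the way out (that is what the 3 min safety margin is
   for). A sketch of the assumed outer structure; only stop_training and the
   final save are new, and FINAL_MODEL_FILE is a hypothetical name to replace
   with the script's actual last-checkpoint path:

       stop_training = False
       for epoch in range(NUM_EPOCHS):
           for micro_step, batch in enumerate(train_loader):
               ...  # training step, including the budget check added above
           if stop_training:
               break  # propagate the clean stop past the epoch loop

       if is_main():
           # FINAL_MODEL_FILE is hypothetical; reuse your existing save path.
           save_checkpoint(model, optimizer, epoch, global_step, best_eval, FINAL_MODEL_FILE)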