*** Begin Patch
*** Update hyperparameters and dataset sources for a 2h wall-clock budget
@@
-NUM_EPOCHS = 3
+MAX_TRAIN_HOURS = 2.0
+TIME_SAFETY_MARGIN_S = 180  # leave 3 min for a clean save
+
+NUM_EPOCHS = 50  # the real safeguard is now the wall clock
 LEARNING_RATE = 3e-4
 MIN_LR = 3e-5
 WEIGHT_DECAY = 0.1
 WARMUP_STEPS = 500
@@
-# Time target:
-# - from scratch: ~70_000 steps ≈ ~10–12 h depending on actual throughput
-# - from a checkpoint already around ~12k steps: ~85_000 steps ≈ ~10–12 h remaining
-MAX_STEPS = 85_000
-EVAL_EVERY = 1_000
-SAVE_EVERY = 2_000
+# Time target: short run, 2 h wall-clock max.
+# MAX_STEPS remains a secondary safety net; the primary stop is clock-driven.
+MAX_STEPS = 20_000
+EVAL_EVERY = 750
+SAVE_EVERY = 1_000
+EVAL_MAX_BATCHES = 64
@@
-USE_COMPILE = True
+USE_COMPILE = False  # for a short run, skip the compile latency
@@
-TOKENIZER_SAMPLE_DOCS_PER_SOURCE = 15_000
-TOKENIZER_CHAR_LIMIT = 2_000
-TEXT_CHAR_LIMIT = 4_000
+TOKENIZER_SAMPLE_DOCS_PER_SOURCE = 4_000
+TOKENIZER_CHAR_LIMIT = 1_500
+TEXT_CHAR_LIMIT = 3_000
@@
-DATA_SOURCES = [
-    # 1. FineWeb (English – very high quality)
-    {
-        "name": "HuggingFaceFW/fineweb",
-        "config": None,
-        "split": "train",
-        "text_column": "text",
-        "dev_docs": 10_000,
-        "train_docs_per_epoch": 1_200_000,  # ~48 GB over 10 epochs
-        "language_filter": None,
-    },
-    # 2. C4 multilingual → French
-    {
-        "name": "allenai/c4",
-        "config": "multilingual",
-        "split": "train",
-        "text_column": "text",
-        "dev_docs": 5_000,
-        "train_docs_per_epoch": 400_000,  # ~16 GB over 10 epochs
-        "language_filter": "fr",
-    },
-    # 3. C4 multilingual → Arabic
-    {
-        "name": "allenai/c4",
-        "config": "multilingual",
-        "split": "train",
-        "text_column": "text",
-        "dev_docs": 5_000,
-        "train_docs_per_epoch": 300_000,  # ~12 GB over 10 epochs
-        "language_filter": "ar",
-    },
-]
+DATA_SOURCES = [
+    # Good-quality web text; public sample that is fast to stream
+    {
+        "name": "HuggingFaceFW/fineweb-edu",
+        "config": "sample-10BT",
+        "split": "train",
+        "text_column": "text",
+        "dev_docs": 1_000,
+        "train_docs_per_epoch": 60_000,
+        "language_filter": "en",
+    },
+    # Wikipedia EN / FR / AR
+    {
+        "name": "wikimedia/wikipedia",
+        "config": "20231101.en",
+        "split": "train",
+        "text_column": "text",
+        "dev_docs": 500,
+        "train_docs_per_epoch": 12_000,
+        "language_filter": None,
+    },
+    {
+        "name": "wikimedia/wikipedia",
+        "config": "20231101.fr",
+        "split": "train",
+        "text_column": "text",
+        "dev_docs": 500,
+        "train_docs_per_epoch": 8_000,
+        "language_filter": None,
+    },
+    {
+        "name": "wikimedia/wikipedia",
+        "config": "20231101.ar",
+        "split": "train",
+        "text_column": "text",
+        "dev_docs": 500,
+        "train_docs_per_epoch": 8_000,
+        "language_filter": None,
+    },
+    # Multilingual C4 to diversify the FR / AR web mix
+    {
+        "name": "allenai/c4",
+        "config": "multilingual",
+        "split": "train",
+        "text_column": "text",
+        "dev_docs": 1_000,
+        "train_docs_per_epoch": 18_000,
+        "language_filter": "fr",
+    },
+    {
+        "name": "allenai/c4",
+        "config": "multilingual",
+        "split": "train",
+        "text_column": "text",
+        "dev_docs": 1_000,
+        "train_docs_per_epoch": 14_000,
+        "language_filter": "ar",
+    },
+    # Optional if you want a text+code mix:
+    # {
+    #     "name": "codeparrot/github-code",
+    #     "config": None,
+    #     "split": "train",
+    #     "text_column": "code",
+    #     "dev_docs": 300,
+    #     "train_docs_per_epoch": 5_000,
+    #     "language_filter": None,
+    # },
+]
@@ def safe_str(x) -> str:
     return x if isinstance(x, str) else ("" if x is None else str(x))
+
+def seconds_remaining(start_time: float) -> float:
+    return max(0.0, MAX_TRAIN_HOURS * 3600 - (time.monotonic() - start_time))
+
+def time_budget_exceeded(start_time: float) -> bool:
+    return seconds_remaining(start_time) <= TIME_SAFETY_MARGIN_S
@@ def main() -> None:
+    job_start = time.monotonic()
     ddp_device = init_distributed()
     set_seed(SEED + get_rank())
@@ if is_main():
         print("=" * 72)
         print(" GPT ~1B | H100 80 GB | QLoRA + BF16 + TF32 | MAX 100 GB (public)")
         print("=" * 72)
+        print(f"Budget: {MAX_TRAIN_HOURS:.2f} h max | save margin: {TIME_SAFETY_MARGIN_S//60} min")
         print(f"Device : {device} | World: {get_world_size()} GPU(s)")
@@
-        val_loss = evaluate(model, eval_loader, device)
+        if seconds_remaining(job_start) > (TIME_SAFETY_MARGIN_S + 60):
+            val_loss = evaluate(model, eval_loader, device, max_batches=EVAL_MAX_BATCHES)
+        else:
+            val_loss = None
+            print(f"[eval] step {global_step:5d} | skipped (time budget too short)")
-        print(f"[eval] step {global_step:5d} | val_loss={val_loss:.4f}")
-        if val_loss < best_eval:
+        if val_loss is not None:
+            print(f"[eval] step {global_step:5d} | val_loss={val_loss:.4f}")
+        if val_loss is not None and val_loss < best_eval:
             best_eval = val_loss
             save_checkpoint(model, optimizer, epoch, global_step, best_eval, BEST_MODEL_FILE)
             print(f"✓ Best model → {BEST_MODEL_FILE}")
@@ for micro_step, batch in enumerate(train_loader):
+        if time_budget_exceeded(job_start):
+            if is_main():
+                print(f"✓ Time budget reached (~{MAX_TRAIN_HOURS:.2f} h), stopping cleanly")
+            stop_training = True
+            break
+
         inp = batch["input_ids"].to(device, non_blocking=True)
         lbl = batch["labels"].to(device, non_blocking=True)
*** End Patch
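
The time-budget logic above is self-contained enough to dry-run outside the training script. Below is a minimal sketch of the same two helpers driving a toy loop: MAX_TRAIN_HOURS and TIME_SAFETY_MARGIN_S are shrunk so the demo finishes in seconds, and train_step() is a hypothetical stand-in for the real forward/backward/optimizer step.

import time

MAX_TRAIN_HOURS = 0.002    # shrunk from 2.0 so the demo ends in ~7 s
TIME_SAFETY_MARGIN_S = 2   # shrunk from 180 for the same reason

def seconds_remaining(start_time: float) -> float:
    # time.monotonic() cannot jump backwards, unlike time.time()
    return max(0.0, MAX_TRAIN_HOURS * 3600 - (time.monotonic() - start_time))

def time_budget_exceeded(start_time: float) -> bool:
    return seconds_remaining(start_time) <= TIME_SAFETY_MARGIN_S

def train_step() -> None:
    time.sleep(0.1)  # hypothetical stand-in for one optimizer step

job_start = time.monotonic()
steps = 0
while not time_budget_exceeded(job_start):
    train_step()
    steps += 1
print(f"stopped after {steps} steps, {seconds_remaining(job_start):.1f}s left for checkpointing")

Checking the clock once per micro-batch, as the patch does, keeps the overhead negligible; note the 3-minute margin only holds if one step plus the final checkpoint actually fits inside it.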
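
Each DATA_SOURCES entry can also be smoke-tested on its own before committing to the 2 h run. A minimal sketch, assuming the Hugging Face datasets library and network access: it peeks at a few streamed documents from the 20231101.fr Wikipedia entry rather than consuming the full train_docs_per_epoch quota, and it does not reproduce whatever language_filter handling the script applies to the C4 entries.

from itertools import islice

from datasets import load_dataset  # pip install datasets

source = {  # copied from the patched DATA_SOURCES list
    "name": "wikimedia/wikipedia",
    "config": "20231101.fr",
    "split": "train",
    "text_column": "text",
}

# streaming=True avoids downloading the whole dump; records arrive lazily
ds = load_dataset(source["name"], source["config"],
                  split=source["split"], streaming=True)

for doc in islice(ds, 3):  # peek at 3 documents, not the 8_000-doc quota
    print(doc[source["text_column"]][:120].replace("\n", " "))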