| |
| |
| @@ |
| -NUM_EPOCHS = 3 |
| +# Wall-clock budget for the job, in hours (measured from the start of main()). |
| +MAX_TRAIN_HOURS = 2.0 |
| +TIME_SAFETY_MARGIN_S = 180 # seconds held back (3 min) so the final save can finish cleanly |
| + |
| +NUM_EPOCHS = 50 # upper bound only; wall-clock time is now the real stopping guard |
| LEARNING_RATE = 3e-4 |
| MIN_LR = 3e-5 |
| WEIGHT_DECAY = 0.1 |
| WARMUP_STEPS = 500 |
| @@ |
| -# Objectif temps : |
| -# - depuis zéro : ~70_000 steps ≈ ~10–12 h selon le débit réel |
| -# - depuis un checkpoint déjà vers ~12k steps : ~85_000 steps ≈ ~10–12 h restantes |
| -MAX_STEPS = 85_000 |
| -EVAL_EVERY = 1_000 |
| -SAVE_EVERY = 2_000 |
| +# Time target: short run, at most 2 h of wall-clock time. |
| +# MAX_STEPS stays as a secondary safety limit; the primary stop is driven by the clock. |
| +MAX_STEPS = 20_000 |
| +EVAL_EVERY = 750 |
| +SAVE_EVERY = 1_000 |
| +EVAL_MAX_BATCHES = 64 # passed to evaluate() as max_batches |
| @@ |
| -USE_COMPILE = True |
| +USE_COMPILE = False # for a short run, avoids the compile latency |
| @@ |
| -TOKENIZER_SAMPLE_DOCS_PER_SOURCE = 15_000 |
| -TOKENIZER_CHAR_LIMIT = 2_000 |
| -TEXT_CHAR_LIMIT = 4_000 |
| +# Lowered from 15_000 / 2_000 / 4_000 (docs per source / tokenizer chars / text chars). |
| +TOKENIZER_SAMPLE_DOCS_PER_SOURCE = 4_000 |
| +TOKENIZER_CHAR_LIMIT = 1_500 |
| +TEXT_CHAR_LIMIT = 3_000 |
| @@ |
| -DATA_SOURCES = [ |
| - # 1. FineWeb (anglais – très haute qualité) |
| - { |
| - "name": "HuggingFaceFW/fineweb", |
| - "config": None, |
| - "split": "train", |
| - "text_column": "text", |
| - "dev_docs": 10_000, |
| - "train_docs_per_epoch": 1_200_000, # ~48 GB sur 10 epochs |
| - "language_filter": None, |
| - }, |
| - # 2. C4 multilingual → français |
| - { |
| - "name": "allenai/c4", |
| - "config": "multilingual", |
| - "split": "train", |
| - "text_column": "text", |
| - "dev_docs": 5_000, |
| - "train_docs_per_epoch": 400_000, # ~16 GB sur 10 epochs |
| - "language_filter": "fr", |
| - }, |
| - # 3. C4 multilingual → arabe |
| - { |
| - "name": "allenai/c4", |
| - "config": "multilingual", |
| - "split": "train", |
| - "text_column": "text", |
| - "dev_docs": 5_000, |
| - "train_docs_per_epoch": 300_000, # ~12 GB sur 10 epochs |
| - "language_filter": "ar", |
| - }, |
| -] |
| +# Data sources for the run. Every entry carries the same keys: name, config, split, |
| +# text_column, dev_docs, train_docs_per_epoch, language_filter. |
| +# NOTE(review): how language_filter is applied is not visible here — confirm each |
| +# non-None value matches what the loader expects for that dataset. |
| +DATA_SOURCES = [ |
| + # Good-quality web text; public sample that is quick to stream |
| + { |
| + "name": "HuggingFaceFW/fineweb-edu", |
| + "config": "sample-10BT", |
| + "split": "train", |
| + "text_column": "text", |
| + "dev_docs": 1_000, |
| + "train_docs_per_epoch": 60_000, |
| + "language_filter": "en", |
| + }, |
| + # Wikipedia EN / FR / AR |
| + { |
| + "name": "wikimedia/wikipedia", |
| + "config": "20231101.en", |
| + "split": "train", |
| + "text_column": "text", |
| + "dev_docs": 500, |
| + "train_docs_per_epoch": 12_000, |
| + "language_filter": None, |
| + }, |
| + { |
| + "name": "wikimedia/wikipedia", |
| + "config": "20231101.fr", |
| + "split": "train", |
| + "text_column": "text", |
| + "dev_docs": 500, |
| + "train_docs_per_epoch": 8_000, |
| + "language_filter": None, |
| + }, |
| + { |
| + "name": "wikimedia/wikipedia", |
| + "config": "20231101.ar", |
| + "split": "train", |
| + "text_column": "text", |
| + "dev_docs": 500, |
| + "train_docs_per_epoch": 8_000, |
| + "language_filter": None, |
| + }, |
| + # Multilingual C4 to diversify the FR / AR web text |
| + { |
| + "name": "allenai/c4", |
| + "config": "multilingual", |
| + "split": "train", |
| + "text_column": "text", |
| + "dev_docs": 1_000, |
| + "train_docs_per_epoch": 18_000, |
| + "language_filter": "fr", |
| + }, |
| + { |
| + "name": "allenai/c4", |
| + "config": "multilingual", |
| + "split": "train", |
| + "text_column": "text", |
| + "dev_docs": 1_000, |
| + "train_docs_per_epoch": 14_000, |
| + "language_filter": "ar", |
| + }, |
| + # Optional, if you want a text+code mix: |
| + # { |
| + # "name": "codeparrot/github-code", |
| + # "config": None, |
| + # "split": "train", |
| + # "text_column": "code", |
| + # "dev_docs": 300, |
| + # "train_docs_per_epoch": 5_000, |
| + # "language_filter": None, |
| + # }, |
| +] |
| @@ |
| +# Coerce x to str: strings pass through, None becomes "", anything else goes through str(). |
| def safe_str(x) -> str: |
| return x if isinstance(x, str) else ("" if x is None else str(x)) |
| + |
| +def seconds_remaining(start_time: float) -> float: |
| + """Return the seconds left in the MAX_TRAIN_HOURS budget, clamped at 0.0. |
| + |
| + start_time is a time.monotonic() reading; elapsed time is measured against it. |
| + """ |
| + return max(0.0, MAX_TRAIN_HOURS * 3600 - (time.monotonic() - start_time)) |
| + |
| +def time_budget_exceeded(start_time: float) -> bool: |
| + """Return True once the remaining budget is at or below TIME_SAFETY_MARGIN_S. |
| + |
| + NOTE(review): the result depends on the calling process's own clock and |
| + start_time — confirm the caller keeps all ranks in step when acting on it. |
| + """ |
| + return seconds_remaining(start_time) <= TIME_SAFETY_MARGIN_S |
| @@ |
| def main() -> None: |
| + job_start = time.monotonic() |
| ddp_device = init_distributed() |
| set_seed(SEED + get_rank()) |
| @@ |
| if is_main(): |
| print("=" * 72) |
| print(" GPT ~1B | H100 80 Go | QLoRA + BF16 + TF32 | MAX 100 GB (public)") |
| print("=" * 72) |
| + print(f"Budget : {MAX_TRAIN_HOURS:.2f} h max | marge save: {TIME_SAFETY_MARGIN_S//60} min") |
| print(f"Device : {device} | World: {get_world_size()} GPU(s)") |
| @@ |
| - val_loss = evaluate(model, eval_loader, device) |
| + if seconds_remaining(job_start) > (TIME_SAFETY_MARGIN_S + 60): |
| + val_loss = evaluate(model, eval_loader, device, max_batches=EVAL_MAX_BATCHES) |
| + else: |
| + val_loss = None |
| + print(f"[eval] step {global_step:5d} | skip (budget temps trop court)") |
| - print(f"[eval] step {global_step:5d} | val_loss={val_loss:.4f}") |
| - if val_loss < best_eval: |
| + if val_loss is not None: |
| + print(f"[eval] step {global_step:5d} | val_loss={val_loss:.4f}") |
| + if val_loss is not None and val_loss < best_eval: |
| best_eval = val_loss |
| save_checkpoint(model, optimizer, epoch, global_step, best_eval, BEST_MODEL_FILE) |
| print(f"✓ Meilleur modèle → {BEST_MODEL_FILE}") |
| @@ |
| for micro_step, batch in enumerate(train_loader): |
| + if time_budget_exceeded(job_start): |
| + if is_main(): |
| + print(f"✓ Budget temps atteint (~{MAX_TRAIN_HOURS:.2f} h) — arrêt propre") |
| + stop_training = True |
| + break |
| + |
| inp = batch["input_ids"].to(device, non_blocking=True) |
| lbl = batch["labels"].to(device, non_blocking=True) |
| |
|
|