FirstChat / patch_train_nlp_h100_max2h_v8.diff

Add files using upload-large-folder tool

59dc998 verified about 1 month ago

6.4 kB

	*** Begin Patch
	*** Update hyperparameters and dataset sources for a 2h wall-clock budget
	@@
	-NUM_EPOCHS = 3
	+# Wall-clock budget for the job, in hours (converted to seconds by seconds_remaining).
	+MAX_TRAIN_HOURS = 2.0
	+TIME_SAFETY_MARGIN_S = 180 # leaves 3 min to save cleanly
	+
	+NUM_EPOCHS = 50 # wall-clock time becomes the real safeguard
	LEARNING_RATE = 3e-4
	MIN_LR = 3e-5
	WEIGHT_DECAY = 0.1
	WARMUP_STEPS = 500
	@@
	-# Objectif temps :
	-# - depuis zéro : ~70_000 steps ≈ ~10–12 h selon le débit réel
	-# - depuis un checkpoint déjà vers ~12k steps : ~85_000 steps ≈ ~10–12 h restantes
	-MAX_STEPS = 85_000
	-EVAL_EVERY = 1_000
	-SAVE_EVERY = 2_000
	+# Time target: short run, at most 2 wall-clock hours.
	+# MAX_STEPS stays a secondary safety net; the main stop is driven by the clock.
	+MAX_STEPS = 20_000
	+EVAL_EVERY = 750
	+SAVE_EVERY = 1_000
	+EVAL_MAX_BATCHES = 64 # passed to evaluate() as max_batches
	@@
	-USE_COMPILE = True
	+USE_COMPILE = False # for a short run, avoids the compile latency
	@@
	-TOKENIZER_SAMPLE_DOCS_PER_SOURCE = 15_000
	-TOKENIZER_CHAR_LIMIT = 2_000
	-TEXT_CHAR_LIMIT = 4_000
	+# Lower sampling and character limits than the previous values, for the short run.
	+TOKENIZER_SAMPLE_DOCS_PER_SOURCE = 4_000
	+TOKENIZER_CHAR_LIMIT = 1_500
	+TEXT_CHAR_LIMIT = 3_000
	@@
	-DATA_SOURCES = [
	- # 1. FineWeb (anglais – très haute qualité)
	- {
	- "name": "HuggingFaceFW/fineweb",
	- "config": None,
	- "split": "train",
	- "text_column": "text",
	- "dev_docs": 10_000,
	- "train_docs_per_epoch": 1_200_000, # ~48 GB sur 10 epochs
	- "language_filter": None,
	- },
	- # 2. C4 multilingual → français
	- {
	- "name": "allenai/c4",
	- "config": "multilingual",
	- "split": "train",
	- "text_column": "text",
	- "dev_docs": 5_000,
	- "train_docs_per_epoch": 400_000, # ~16 GB sur 10 epochs
	- "language_filter": "fr",
	- },
	- # 3. C4 multilingual → arabe
	- {
	- "name": "allenai/c4",
	- "config": "multilingual",
	- "split": "train",
	- "text_column": "text",
	- "dev_docs": 5_000,
	- "train_docs_per_epoch": 300_000, # ~12 GB sur 10 epochs
	- "language_filter": "ar",
	- },
	-]
	+# Dataset mix for the short run. Each entry gives the dataset name, config, split,
	+# text column, dev / per-epoch train document counts and an optional language filter.
	+DATA_SOURCES = [
	+ # Good-quality web text; public sample that is quick to stream
	+ {
	+ "name": "HuggingFaceFW/fineweb-edu",
	+ "config": "sample-10BT",
	+ "split": "train",
	+ "text_column": "text",
	+ "dev_docs": 1_000,
	+ "train_docs_per_epoch": 60_000,
	+ # NOTE(review): the FineWeb entry this replaces used language_filter None —
	+ # confirm the loader's filter handles "en" for this dataset.
	+ "language_filter": "en",
	+ },
	+ # Wikipedia EN / FR / AR
	+ {
	+ "name": "wikimedia/wikipedia",
	+ "config": "20231101.en",
	+ "split": "train",
	+ "text_column": "text",
	+ "dev_docs": 500,
	+ "train_docs_per_epoch": 12_000,
	+ "language_filter": None,
	+ },
	+ {
	+ "name": "wikimedia/wikipedia",
	+ "config": "20231101.fr",
	+ "split": "train",
	+ "text_column": "text",
	+ "dev_docs": 500,
	+ "train_docs_per_epoch": 8_000,
	+ "language_filter": None,
	+ },
	+ {
	+ "name": "wikimedia/wikipedia",
	+ "config": "20231101.ar",
	+ "split": "train",
	+ "text_column": "text",
	+ "dev_docs": 500,
	+ "train_docs_per_epoch": 8_000,
	+ "language_filter": None,
	+ },
	+ # Multilingual C4 to diversify the FR / AR web text
	+ {
	+ "name": "allenai/c4",
	+ "config": "multilingual",
	+ "split": "train",
	+ "text_column": "text",
	+ "dev_docs": 1_000,
	+ "train_docs_per_epoch": 18_000,
	+ "language_filter": "fr",
	+ },
	+ {
	+ "name": "allenai/c4",
	+ "config": "multilingual",
	+ "split": "train",
	+ "text_column": "text",
	+ "dev_docs": 1_000,
	+ "train_docs_per_epoch": 14_000,
	+ "language_filter": "ar",
	+ },
	+ # Optional, if you want a text+code mix:
	+ # {
	+ # "name": "codeparrot/github-code",
	+ # "config": None,
	+ # "split": "train",
	+ # "text_column": "code",
	+ # "dev_docs": 300,
	+ # "train_docs_per_epoch": 5_000,
	+ # "language_filter": None,
	+ # },
	+]
	@@
	def safe_str(x) -> str:
	return x if isinstance(x, str) else ("" if x is None else str(x))
	+
	+# Seconds left in the MAX_TRAIN_HOURS budget, measured on the monotonic clock from
	+# start_time (itself a time.monotonic() reading); never returns less than 0.0.
	+# NOTE(review): relies on the `time` module being imported by the target file; this
	+# patch adds no import — confirm.
	+def seconds_remaining(start_time: float) -> float:
	+ return max(0.0, MAX_TRAIN_HOURS * 3600 - (time.monotonic() - start_time))
	+
	+# True once the remaining budget is at or below TIME_SAFETY_MARGIN_S, i.e. only the
	+# time reserved for saving is left.
	+def time_budget_exceeded(start_time: float) -> bool:
	+ return seconds_remaining(start_time) <= TIME_SAFETY_MARGIN_S
	@@
	def main() -> None:
	+ job_start = time.monotonic() # budget clock starts before distributed init
	ddp_device = init_distributed()
	set_seed(SEED + get_rank())
	@@
	if is_main():
	print("=" * 72)
	print(" GPT ~1B \| H100 80 Go \| QLoRA + BF16 + TF32 \| MAX 100 GB (public)")
	print("=" * 72)
	+ # Report the time budget and the save margin (in whole minutes) in the startup banner.
	+ print(f"Budget : {MAX_TRAIN_HOURS:.2f} h max \| marge save: {TIME_SAFETY_MARGIN_S//60} min")
	print(f"Device : {device} \| World: {get_world_size()} GPU(s)")
	@@
	- val_loss = evaluate(model, eval_loader, device)
	+ # Evaluate only when more than the save margin plus 60 s remain in the budget.
	+ # NOTE(review): this patch does not modify evaluate(); confirm it already accepts
	+ # a max_batches keyword.
	+ # NOTE(review): the decision uses this process's own clock — with several GPUs,
	+ # confirm every rank takes the same branch.
	+ if seconds_remaining(job_start) > (TIME_SAFETY_MARGIN_S + 60):
	+ val_loss = evaluate(model, eval_loader, device, max_batches=EVAL_MAX_BATCHES)
	+ else:
	+ val_loss = None # None marks a skipped evaluation
	+ print(f"[eval] step {global_step:5d} \| skip (budget temps trop court)")
	- print(f"[eval] step {global_step:5d} \| val_loss={val_loss:.4f}")
	- if val_loss < best_eval:
	+ if val_loss is not None:
	+ print(f"[eval] step {global_step:5d} \| val_loss={val_loss:.4f}")
	+ # A skipped evaluation never replaces the best checkpoint.
	+ if val_loss is not None and val_loss < best_eval:
	best_eval = val_loss
	save_checkpoint(model, optimizer, epoch, global_step, best_eval, BEST_MODEL_FILE)
	print(f"✓ Meilleur modèle → {BEST_MODEL_FILE}")
	@@
	for micro_step, batch in enumerate(train_loader):
	+ # Stop before starting another micro-step once only the save margin is left.
	+ # NOTE(review): each process checks its own clock; with several GPUs, confirm all
	+ # ranks take this break on the same micro-step.
	+ # NOTE(review): stop_training is only set here; the code that reacts to it is
	+ # outside this patch — confirm the enclosing loop checks it.
	+ if time_budget_exceeded(job_start):
	+ if is_main():
	+ print(f"✓ Budget temps atteint (~{MAX_TRAIN_HOURS:.2f} h) — arrêt propre")
	+ stop_training = True
	+ break
	+
	inp = batch["input_ids"].to(device, non_blocking=True)
	lbl = batch["labels"].to(device, non_blocking=True)
	*** End Patch