Medyassino committed on Apr 5

Commit

b9049d2

verified ·

1 Parent(s): 8a68ba4

Add files using upload-large-folder tool

Browse files

Files changed (50) hide show

__pycache__/train_aramix_h100_full.cpython-313.pyc +0 -0
aramix_h100/config.json +10 -0
aramix_h100/model.pt +3 -0
aramix_h100/model_best.pt +3 -0
aramix_h100/qa_test_report_simple.json +134 -0
aramix_h100/qa_test_report_simple.txt +80 -0
aramix_h100/tokenizer_32k/tokenizer.json +0 -0
aramix_h100/tokenizer_32k/tokenizer_config.json +9 -0
aramix_h100/train_log.jsonl +44 -0
aramix_h100/train_state.pt +3 -0
donner +0 -0
nlp_1b_h100_maxvram/config.json +10 -0
nlp_1b_h100_maxvram/tokenizer_32k/tokenizer.json +0 -0
nlp_1b_h100_maxvram/tokenizer_32k/tokenizer_config.json +9 -0
nlp_1b_h100_opt/config.json +10 -0
nlp_1b_h100_opt/model.pt +3 -0
nlp_1b_h100_opt/model_best.pt +3 -0
nlp_1b_h100_opt/tokenizer_32k/tokenizer.json +0 -0
nlp_1b_h100_opt/tokenizer_32k/tokenizer_config.json +9 -0
nlp_1b_h100_opt/train_state.pt +3 -0
nlp_1b_wiki_en_fr_ar/config.json +10 -0
nlp_1b_wiki_en_fr_ar/model_best.pt +3 -0
nlp_1b_wiki_en_fr_ar/model_epoch_02.pt +3 -0
nlp_1b_wiki_en_fr_ar/tokenizer_32k/tokenizer.json +0 -0
nlp_1b_wiki_en_fr_ar/tokenizer_32k/tokenizer_config.json +9 -0
simple_qa_test_aramix.py +504 -0
simple_qa_test_aramix_v2.py +472 -0
simple_qa_test_aramix_v3.py +583 -0
simple_qa_test_finished_model (1).py +309 -0
simple_qa_test_finished_model.py +309 -0
test.py +428 -0
top_p +0 -0
train.py +859 -0
train2.py +852 -0
train_aramix_h100_full.py +1055 -0
train_nlp_h100_maxvram_v6.py +1046 -0
train_nlp_h100_maxvram_v7.py +805 -0
upload.py +189 -0
wikipedia_ar_h100/config.json +10 -0
wikipedia_ar_h100/tokenizer_32k/tokenizer.json +0 -0
wikipedia_ar_h100/tokenizer_32k/tokenizer_config.json +9 -0
wikipedia_ar_h100/train_state.pt +3 -0
wikipedia_ar_h100_agri_30gb/config.json +10 -0
wikipedia_ar_h100_codealpaca/config.json +10 -0
wikipedia_ar_h100_env_fr_ar_77gb/config.json +10 -0
wikipedia_ar_h100_env_fr_ar_77gb/model_epoch_03.pt +3 -0
wikipedia_ar_h100_multicode/config.json +10 -0
wikipedia_ar_h100_multicode/train_state.pt +3 -0
wikipedia_ar_h100_multicode_10x2000/config.json +10 -0
wikipedia_ar_h100_multicode_10x2000/model_round_06.pt +3 -0

__pycache__/train_aramix_h100_full.cpython-313.pyc ADDED Viewed

Binary file (48.4 kB). View file

aramix_h100/config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "vocab_size": 32000,
+  "block_size": 1024,
+  "d_model": 1024,
+  "n_heads": 16,
+  "n_layers": 24,
+  "d_ff": 4096,
+  "dropout": 0.0,
+  "use_checkpointing": false
+}

aramix_h100/model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:78d6b75f8a8a079e5ba8b85def0c733dd808a075993fcb8fbe21a07350e0d8dc
+size 5225851307

aramix_h100/model_best.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b1300c7623453b9d5a03fa6b2142ebe0f840676233ec122678429b8f2b7cd2ce
+size 5225876625

aramix_h100/qa_test_report_simple.json ADDED Viewed

	@@ -0,0 +1,134 @@

+{
+  "repo_dir": "/workspace/FirstChat/aramix_h100",
+  "train_script": "/workspace/FirstChat/train_aramix_h100_full.py",
+  "checkpoint": "/workspace/FirstChat/aramix_h100/model_best.pt",
+  "config_path": "/workspace/FirstChat/aramix_h100/config.json",
+  "tokenizer_dir": "/workspace/FirstChat/aramix_h100/tokenizer_32k",
+  "device": "cuda",
+  "total_questions": 10,
+  "avg_overlap_score": 0.1778,
+  "exact_match_rate": 0.0,
+  "avg_latency_s": 1.195,
+  "avg_words_generated": 28.6,
+  "scores_by_category": {
+    "Géographie": 0.25,
+    "Science": 0.05,
+    "Histoire": 0.0,
+    "Mathématiques": 0.0,
+    "Langage": 0.0,
+    "Raisonnement": 1.0,
+    "Dialogue": 0.0
+  },
+  "results": [
+    {
+      "id": 1,
+      "category": "Géographie",
+      "question": "Quelle est la capitale de la France ?",
+      "answer": "Au total, le personnage d'une société financière (dont le but à une institution) est une commission de la musique. Il a également considéré plusieurs sociétés, notamment les p",
+      "reference": "Paris",
+      "latency_s": 1.6,
+      "tokens_generated_approx": 28,
+      "overlap_score": 0.0,
+      "exact_match": false
+    },
+    {
+      "id": 2,
+      "category": "Géographie",
+      "question": "Quel est le plus long fleuve d'Afrique ?",
+      "answer": "Les petits chemins de fer français sont un type de classe moyenne de . Le groupe est le plus long, de poids lourds. Historique Le , le groupe est également connu comme secondaire d'une man",
+      "reference": "Le Nil",
+      "latency_s": 1.25,
+      "tokens_generated_approx": 35,
+      "overlap_score": 0.5,
+      "exact_match": false
+    },
+    {
+      "id": 3,
+      "category": "Science",
+      "question": "Qu'est-ce que la photosynthèse ?",
+      "answer": "Die verlore, die wat gevra het nie beslis nie! Chefsel en sy kragtige tempo van die opwindende akademiese medium. Hierdie photographies is gebaseer, asook hulle nie eens uitges",
+      "reference": "Processus par lequel les plantes convertissent la lumière en énergie",
+      "latency_s": 1.242,
+      "tokens_generated_approx": 28,
+      "overlap_score": 0.1,
+      "exact_match": false
+    },
+    {
+      "id": 4,
+      "category": "Science",
+      "question": "Combien d'os compte le corps humain adulte ?",
+      "answer": "Die volgende plek is available, sal ons beslis nie opgelewing te koop nie.<eos>",
+      "reference": "206",
+      "latency_s": 0.518,
+      "tokens_generated_approx": 13,
+      "overlap_score": 0.0,
+      "exact_match": false
+    },
+    {
+      "id": 5,
+      "category": "Histoire",
+      "question": "En quelle année a eu lieu la Révolution française ?",
+      "answer": "Les premiers élèves sont des trois fils de Jacques Salamine et Julien Fernandez. Leur nombre d'exemplaires est installée, confirmant la principale section du groupe de musique",
+      "reference": "1789",
+      "latency_s": 1.229,
+      "tokens_generated_approx": 26,
+      "overlap_score": 0.0,
+      "exact_match": false
+    },
+    {
+      "id": 6,
+      "category": "Histoire",
+      "question": "Qui a écrit Les Misérables ?",
+      "answer": "Historia et soutenue, L'Advention d'une partie de la mise en place des règnes du terme de cette matière. Le phénomène ou la présence d'un système de variable est écrit : «",
+      "reference": "Victor Hugo",
+      "latency_s": 1.22,
+      "tokens_generated_approx": 31,
+      "overlap_score": 0.0,
+      "exact_match": false
+    },
+    {
+      "id": 7,
+      "category": "Mathématiques",
+      "question": "Quelle est la formule de l'aire d'un cercle ?",
+      "answer": "Remarque : évoquant le premier au sens interdisant les règles différentes, on utilise les mots de type latin. Le cercle est un facteur plus ou moins électrique et une sorte de poids (",
+      "reference": "pi r carre",
+      "latency_s": 1.224,
+      "tokens_generated_approx": 33,
+      "overlap_score": 0.0,
+      "exact_match": false
+    },
+    {
+      "id": 8,
+      "category": "Langage",
+      "question": "Donne un synonyme du mot heureux.",
+      "answer": "le premier à créer une fois la relation de l'homme, qui serait naturellement réduite. Son grand-père est d'une famille enveloppant le chevalier (ou encore mythique). Les barrag",
+      "reference": "joyeux",
+      "latency_s": 1.228,
+      "tokens_generated_approx": 27,
+      "overlap_score": 0.0,
+      "exact_match": false
+    },
+    {
+      "id": 9,
+      "category": "Raisonnement",
+      "question": "Si j'ai 5 pommes et j'en donne 2, combien m'en reste-t-il ?",
+      "answer": "Loi de la section 1 à 3 par exemple : Dans les différentes factions qui vont à cette occasion, une section de terminaisons n'est pas connue. En janvier 1940, le site est introduit dans la m",
+      "reference": "3",
+      "latency_s": 1.226,
+      "tokens_generated_approx": 36,
+      "overlap_score": 1.0,
+      "exact_match": false
+    },
+    {
+      "id": 10,
+      "category": "Dialogue",
+      "question": "Comment vas-tu aujourd'hui ?",
+      "answer": "Die verskaffinge en maandelikse digitale oor hierdie raadslede is, is nie gebruik van die Vrystaatse kombuisies nie. Dit was die swart-regering in die omgewing aan die beste metode te",
+      "reference": null,
+      "latency_s": 1.212,
+      "tokens_generated_approx": 29,
+      "overlap_score": null,
+      "exact_match": null
+    }
+  ]
+}

aramix_h100/qa_test_report_simple.txt ADDED Viewed

	@@ -0,0 +1,80 @@

+TEST QA SIMPLE — MODÈLE DÉJÀ ENTRAÎNÉ
+============================================================
+[01] Géographie
+  User      : Quelle est la capitale de la France ?
+  Assistant : Au total, le personnage d'une société financière (dont le but à une institution) est une commission de la musique. Il a également considéré plusieurs sociétés, notamment les p
+  Référence : Paris
+  Overlap   : 0%
+  ExactMatch: False
+  Latence   : 1.6s
+[02] Géographie
+  User      : Quel est le plus long fleuve d'Afrique ?
+  Assistant : Les petits chemins de fer français sont un type de classe moyenne de . Le groupe est le plus long, de poids lourds. Historique Le , le groupe est également connu comme secondaire d'une man
+  Référence : Le Nil
+  Overlap   : 50%
+  ExactMatch: False
+  Latence   : 1.25s
+[03] Science
+  User      : Qu'est-ce que la photosynthèse ?
+  Assistant : Die verlore, die wat gevra het nie beslis nie! Chefsel en sy kragtige tempo van die opwindende akademiese medium. Hierdie photographies is gebaseer, asook hulle nie eens uitges
+  Référence : Processus par lequel les plantes convertissent la lumière en énergie
+  Overlap   : 10%
+  ExactMatch: False
+  Latence   : 1.242s
+[04] Science
+  User      : Combien d'os compte le corps humain adulte ?
+  Assistant : Die volgende plek is available, sal ons beslis nie opgelewing te koop nie.<eos>
+  Référence : 206
+  Overlap   : 0%
+  ExactMatch: False
+  Latence   : 0.518s
+[05] Histoire
+  User      : En quelle année a eu lieu la Révolution française ?
+  Assistant : Les premiers élèves sont des trois fils de Jacques Salamine et Julien Fernandez. Leur nombre d'exemplaires est installée, confirmant la principale section du groupe de musique
+  Référence : 1789
+  Overlap   : 0%
+  ExactMatch: False
+  Latence   : 1.229s
+[06] Histoire
+  User      : Qui a écrit Les Misérables ?
+  Assistant : Historia et soutenue, L'Advention d'une partie de la mise en place des règnes du terme de cette matière. Le phénomène ou la présence d'un système de variable est écrit : «
+  Référence : Victor Hugo
+  Overlap   : 0%
+  ExactMatch: False
+  Latence   : 1.22s
+[07] Mathématiques
+  User      : Quelle est la formule de l'aire d'un cercle ?
+  Assistant : Remarque : évoquant le premier au sens interdisant les règles différentes, on utilise les mots de type latin. Le cercle est un facteur plus ou moins électrique et une sorte de poids (
+  Référence : pi r carre
+  Overlap   : 0%
+  ExactMatch: False
+  Latence   : 1.224s
+[08] Langage
+  User      : Donne un synonyme du mot heureux.
+  Assistant : le premier à créer une fois la relation de l'homme, qui serait naturellement réduite. Son grand-père est d'une famille enveloppant le chevalier (ou encore mythique). Les barrag
+  Référence : joyeux
+  Overlap   : 0%
+  ExactMatch: False
+  Latence   : 1.228s
+[09] Raisonnement
+  User      : Si j'ai 5 pommes et j'en donne 2, combien m'en reste-t-il ?
+  Assistant : Loi de la section 1 à 3 par exemple : Dans les différentes factions qui vont à cette occasion, une section de terminaisons n'est pas connue. En janvier 1940, le site est introduit dans la m
+  Référence : 3
+  Overlap   : 100%
+  ExactMatch: False
+  Latence   : 1.226s
+[10] Dialogue
+  User      : Comment vas-tu aujourd'hui ?
+  Assistant : Die verskaffinge en maandelikse digitale oor hierdie raadslede is, is nie gebruik van die Vrystaatse kombuisies nie. Dit was die swart-regering in die omgewing aan die beste metode te
+  Latence   : 1.212s

aramix_h100/tokenizer_32k/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

aramix_h100/tokenizer_32k/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "<bos>",
+  "eos_token": "<eos>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "tokenizer_class": "TokenizersBackend",
+  "unk_token": "<unk>"
+}

aramix_h100/train_log.jsonl ADDED Viewed

	@@ -0,0 +1,44 @@

+{"step": 3050, "loss": 3.571237907409668, "ppl": 35.5605866297653, "lr": 0.00012589612961614406, "tok_s": 11144.981072882043, "time": 147.00788259506226}
+{"step": 3100, "loss": 2.8578320932388306, "ppl": 17.423712977652347, "lr": 0.00012168934965573969, "tok_s": 57222.53834418063, "time": 175.63995742797852}
+{"step": 3150, "loss": 2.83519070148468, "ppl": 17.03364833271062, "lr": 0.00011752894782453746, "tok_s": 57001.54441684658, "time": 204.3830382823944}
+{"step": 3200, "loss": 2.9635899543762205, "ppl": 19.36737509653468, "lr": 0.00011341937918502494, "tok_s": 56964.73242461735, "time": 233.14469361305237}
+{"step": 3250, "loss": 2.8826186561584475, "ppl": 17.860983768575373, "lr": 0.0001093650443662347, "tok_s": 57001.06640251769, "time": 261.88801550865173}
+{"step": 3300, "loss": 2.703218548297882, "ppl": 14.927700012805772, "lr": 0.00010537028485144083, "tok_s": 60419.98038551336, "time": 289.00487303733826}
+{"step": 3350, "loss": 2.745184621810913, "ppl": 15.56748776467281, "lr": 0.00010143937832918955, "tok_s": 57034.40571613923, "time": 317.7313930988312}
+{"step": 3400, "loss": 2.7973192358016967, "ppl": 16.400621587712635, "lr": 9.757653411264333e-05, "tok_s": 61073.53656045087, "time": 344.55806946754456}
+{"step": 3450, "loss": 2.7036573982238767, "ppl": 14.934252470519295, "lr": 9.378588863214297e-05, "tok_s": 56988.909283348396, "time": 373.30752301216125}
+{"step": 3500, "loss": 2.5318048357963563, "ppl": 12.576183612317518, "lr": 9.007150100581427e-05, "tok_s": 57049.3650586449, "time": 402.02651047706604}
+{"step": 3500, "val_loss": 2.7343945503234863, "val_ppl": 15.400416435550476, "per_domain": {"arabic_aramix": 2.896586099727042, "french_wiki": 2.638401048920083, "arabic_wiki": 2.9592110232303015, "math_stackexchange": 2.6955147483132103, "multilingual_cc": 2.75966284356334, "stories": 2.5633833775153527, "medical_pubmed": 2.8006927967071533, "medical_flashcards": 2.939271628856659}}
+{"step": 3550, "loss": 2.622337305545807, "ppl": 13.767865716347135, "lr": 8.643734869296278e-05, "tok_s": 11633.872778271438, "time": 542.8566563129425}
+{"step": 3600, "loss": 2.5885059988498687, "ppl": 13.30987177797193, "lr": 8.288732323491074e-05, "tok_s": 56913.92629532449, "time": 571.6439867019653}
+{"step": 3650, "loss": 2.6797654819488526, "ppl": 14.581673229274589, "lr": 7.942522608783706e-05, "tok_s": 57007.48882356021, "time": 600.3840703964233}
+{"step": 3700, "loss": 2.530478653907776, "ppl": 12.559516359730464, "lr": 7.605476455208276e-05, "tok_s": 56725.416636375594, "time": 629.2670667171478}
+{"step": 3750, "loss": 2.569008586406708, "ppl": 13.052877224060135, "lr": 7.277954780228142e-05, "tok_s": 56930.38215870419, "time": 658.0460760593414}
+{"step": 3800, "loss": 2.564538803100586, "ppl": 12.994663888764459, "lr": 6.960308302256383e-05, "tok_s": 56738.58728395681, "time": 686.922367811203}
+{"step": 3850, "loss": 2.656132850646973, "ppl": 14.241109975260919, "lr": 6.652877165097785e-05, "tok_s": 56904.04773033927, "time": 715.7146956920624}
+{"step": 3900, "loss": 2.603713195323944, "ppl": 13.51382445690133, "lr": 6.355990573714333e-05, "tok_s": 56984.61454772562, "time": 744.466315984726}
+{"step": 3950, "loss": 2.4559333181381224, "ppl": 11.657308450637194, "lr": 6.069966441704281e-05, "tok_s": 56748.50220511147, "time": 773.3375625610352}
+{"step": 4000, "loss": 2.4442277646064756, "ppl": 11.521648737554232, "lr": 5.795111050872301e-05, "tok_s": 59304.739097463025, "time": 800.9643597602844}
+{"step": 4000, "val_loss": 2.6130046784877776, "val_ppl": 13.639973075654948, "per_domain": {"arabic_aramix": 2.794638446513438, "french_wiki": 2.522582699096084, "arabic_wiki": 2.849381112424951, "math_stackexchange": 2.5690312363884664, "multilingual_cc": 2.6081621836532247, "stories": 2.4209399079228495, "medical_pubmed": 2.6950340270996094, "medical_flashcards": 2.819118231534958}}
+{"step": 4050, "loss": 2.616236004829407, "ppl": 13.684119567401414, "lr": 5.531718723255281e-05, "tok_s": 11053.809939264273, "time": 949.1847479343414}
+{"step": 4100, "loss": 2.470917589664459, "ppl": 11.833299985286175, "lr": 5.280071505954885e-05, "tok_s": 56891.6055540174, "time": 977.9833726882935}
+{"step": 4150, "loss": 2.51880806684494, "ppl": 12.413791432319185, "lr": 5.0404388691144755e-05, "tok_s": 56854.72699648875, "time": 1006.8006775379181}
+{"step": 4200, "loss": 2.4818945503234864, "ppl": 11.96390918827006, "lr": 4.813077417363728e-05, "tok_s": 56728.39765714226, "time": 1035.682156085968}
+{"step": 4250, "loss": 2.491965615749359, "ppl": 12.085007270264622, "lr": 4.5982306150399575e-05, "tok_s": 56775.80030404367, "time": 1064.5395212173462}
+{"step": 4300, "loss": 2.5507710337638856, "ppl": 12.81698229994284, "lr": 4.3961285254804134e-05, "tok_s": 56859.33243706391, "time": 1093.3544919490814}
+{"step": 4350, "loss": 2.5808962631225585, "ppl": 13.208971570046705, "lr": 4.206987564664711e-05, "tok_s": 58036.5193491192, "time": 1121.584992647171}
+{"step": 4400, "loss": 2.7079242062568665, "ppl": 14.998110196375103, "lr": 4.031010269471151e-05, "tok_s": 57002.2214993296, "time": 1150.3277320861816}
+{"step": 4450, "loss": 2.5953755283355715, "ppl": 13.401619104283894, "lr": 3.868385080795177e-05, "tok_s": 56785.49267694181, "time": 1179.1801717281342}
+{"step": 4500, "loss": 2.5976397037506103, "ppl": 13.43199709835749, "lr": 3.71928614176214e-05, "tok_s": 56872.48957171817, "time": 1207.9884762763977}
+{"step": 4500, "val_loss": 2.547382290661335, "val_ppl": 12.773622348939783, "per_domain": {"arabic_aramix": 2.7358595652868285, "french_wiki": 2.4578250470351537, "arabic_wiki": 2.7882657515375238, "math_stackexchange": 2.504125434702093, "multilingual_cc": 2.5334029245105656, "stories": 2.348106792994908, "medical_pubmed": 2.6415319442749023, "medical_flashcards": 2.757194072008133}}
+{"step": 4550, "loss": 2.638249659538269, "ppl": 13.988697184009707, "lr": 3.583873111250479e-05, "tok_s": 11385.546618796197, "time": 1351.8902189731598}
+{"step": 4600, "loss": 2.582155728340149, "ppl": 13.225618291081892, "lr": 3.462290992924992e-05, "tok_s": 56835.088159816194, "time": 1380.7174813747406}
+{"step": 4650, "loss": 2.5022823572158814, "ppl": 12.210330518102197, "lr": 3.354669979963281e-05, "tok_s": 56918.493716463534, "time": 1409.5025017261505}
+{"step": 4700, "loss": 2.4462553191185, "ppl": 11.545033207070272, "lr": 3.261125315641639e-05, "tok_s": 56923.2943292039, "time": 1438.285094499588}
+{"step": 4750, "loss": 2.3780724477767943, "ppl": 10.784095909082684, "lr": 3.1817571699296604e-05, "tok_s": 57047.299242243054, "time": 1467.0051219463348}
+{"step": 4800, "loss": 2.583693208694458, "ppl": 13.245968059053615, "lr": 3.116650532225727e-05, "tok_s": 56903.56946504383, "time": 1495.797691822052}
+{"step": 4850, "loss": 2.5454819345474244, "ppl": 12.749370968040534, "lr": 3.065875120348237e-05, "tok_s": 56909.14470005539, "time": 1524.5874409675598}
+{"step": 4900, "loss": 2.5480848932266236, "ppl": 12.782600282360375, "lr": 3.029485305880013e-05, "tok_s": 57001.02054004117, "time": 1553.3307859897614}
+{"step": 4950, "loss": 2.5183346843719483, "ppl": 12.407916351721552, "lr": 3.007520055945856e-05, "tok_s": 56954.47416190712, "time": 1582.097621679306}
+{"step": 5000, "loss": 2.488962812423706, "ppl": 12.048772799963553, "lr": 3.0000028914855615e-05, "tok_s": 56955.4739533805, "time": 1610.8639523983002}
+{"step": 5000, "val_loss": 2.5137671560049055, "val_ppl": 12.351372073909298, "per_domain": {"arabic_aramix": 2.7070358647596118, "french_wiki": 2.427249958942895, "arabic_wiki": 2.755421937766828, "math_stackexchange": 2.469518039443276, "multilingual_cc": 2.489755442873998, "stories": 2.308292391536, "medical_pubmed": 2.6124343872070312, "medical_flashcards": 2.7211887538433075}}

aramix_h100/train_state.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dc13a38fb25354acd0eba8d965216a73cd9fc292a9fa767ad6f27924eb855ac9
+size 5225877311

donner ADDED Viewed

File without changes

nlp_1b_h100_maxvram/config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "vocab_size": 32000,
+  "block_size": 2048,
+  "d_model": 1536,
+  "n_heads": 24,
+  "n_layers": 24,
+  "d_ff": 6144,
+  "dropout": 0.0,
+  "use_checkpointing": false
+}

nlp_1b_h100_maxvram/tokenizer_32k/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

nlp_1b_h100_maxvram/tokenizer_32k/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "<bos>",
+  "eos_token": "<eos>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "tokenizer_class": "TokenizersBackend",
+  "unk_token": "<unk>"
+}

nlp_1b_h100_opt/config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "vocab_size": 32000,
+  "block_size": 1024,
+  "d_model": 1536,
+  "n_heads": 24,
+  "n_layers": 24,
+  "d_ff": 6144,
+  "dropout": 0.0,
+  "use_checkpointing": false
+}

nlp_1b_h100_opt/model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:404b4fea027e28fbcabe19077a7ffebb8830de27ca94db6908992d55dcd85e6d
+size 4415622541

nlp_1b_h100_opt/model_best.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:77b5bc51912e146de6f8855909bc84564dc6c30daa361613c88a11cc41ceb049
+size 4415675901

nlp_1b_h100_opt/tokenizer_32k/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

nlp_1b_h100_opt/tokenizer_32k/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "<bos>",
+  "eos_token": "<eos>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "tokenizer_class": "TokenizersBackend",
+  "unk_token": "<unk>"
+}

nlp_1b_h100_opt/train_state.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a749a6a3258c1a8d1bd9d27d4f1bf0e45454c70bf598e88741b471b8f2afa088
+size 4415677037

nlp_1b_wiki_en_fr_ar/config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "vocab_size": 32000,
+  "block_size": 1024,
+  "d_model": 1536,
+  "n_heads": 24,
+  "n_layers": 24,
+  "d_ff": 6144,
+  "dropout": 0.0,
+  "use_checkpointing": false
+}

nlp_1b_wiki_en_fr_ar/model_best.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dc91b83ddaf6edf5e20142465b16b89b4505fc04eb5ef26680dae6839c030118
+size 11462571709

nlp_1b_wiki_en_fr_ar/model_epoch_02.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:243fcb18b86fdcc825307bc552f4392aee2a996217a48e56e650c3fd00257fd3
+size 11462574453

nlp_1b_wiki_en_fr_ar/tokenizer_32k/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

nlp_1b_wiki_en_fr_ar/tokenizer_32k/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "<bos>",
+  "eos_token": "<eos>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "tokenizer_class": "TokenizersBackend",
+  "unk_token": "<unk>"
+}

simple_qa_test_aramix.py ADDED Viewed

	@@ -0,0 +1,504 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+simple_qa_test_aramix.py
+Test QA simple pour un modèle déjà entraîné dans une repo de type :
+- train_aramix_h100_full.py
+- aramix_h100/
+    - config.json
+    - model_best.pt
+    - model.pt
+    - tokenizer_32k/
+Hypothèses alignées avec ton repo :
+- le module d'entraînement expose : GPT, GPTConfig, train_or_load_tokenizer,
+  load_checkpoint, DOMAINS
+- le tokenizer est géré par train_or_load_tokenizer(DOMAINS)
+- le checkpoint se recharge avec load_checkpoint(model, opt, ckpt_path, device)
+Usage
+-----
+python simple_qa_test_aramix.py
+python simple_qa_test_aramix.py --repo_dir ./aramix_h100
+python simple_qa_test_aramix.py --ckpt ./aramix_h100/model.pt
+python simple_qa_test_aramix.py --questions qa_questions.json
+python simple_qa_test_aramix.py --max_new_tokens 96 --temperature 0.4 --top_k 40
+python simple_qa_test_aramix.py --save_report
+"""
+from __future__ import annotations
+import argparse
+import importlib.util
+import json
+import os
+import re
+import sys
+import time
+import unicodedata
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+import torch
+import torch.nn.functional as F
+DEFAULT_QUESTIONS = [
+    {
+        "category": "Géographie",
+        "question": "Quelle est la capitale de la France ?",
+        "reference": "Paris",
+    },
+    {
+        "category": "Géographie",
+        "question": "Quel est le plus long fleuve d'Afrique ?",
+        "reference": "Le Nil",
+    },
+    {
+        "category": "Science",
+        "question": "Qu'est-ce que la photosynthèse ?",
+        "reference": "Processus par lequel les plantes convertissent la lumière en énergie",
+    },
+    {
+        "category": "Science",
+        "question": "Combien d'os compte le corps humain adulte ?",
+        "reference": "206",
+    },
+    {
+        "category": "Histoire",
+        "question": "En quelle année a eu lieu la Révolution française ?",
+        "reference": "1789",
+    },
+    {
+        "category": "Histoire",
+        "question": "Qui a écrit Les Misérables ?",
+        "reference": "Victor Hugo",
+    },
+    {
+        "category": "Mathématiques",
+        "question": "Quelle est la formule de l'aire d'un cercle ?",
+        "reference": "pi r carre",
+    },
+    {
+        "category": "Langage",
+        "question": "Donne un synonyme du mot heureux.",
+        "reference": "joyeux",
+    },
+    {
+        "category": "Raisonnement",
+        "question": "Si j'ai 5 pommes et j'en donne 2, combien m'en reste-t-il ?",
+        "reference": "3",
+    },
+    {
+        "category": "Dialogue",
+        "question": "Comment vas-tu aujourd'hui ?",
+        "reference": None,
+    },
+]
+def load_module_from_file(py_path: Path):
+    spec = importlib.util.spec_from_file_location(py_path.stem, py_path)
+    if spec is None or spec.loader is None:
+        raise RuntimeError(f"Impossible de charger le module: {py_path}")
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[py_path.stem] = module
+    spec.loader.exec_module(module)
+    return module
+def normalize_text(text: str) -> str:
+    text = (text or "").strip().lower()
+    text = unicodedata.normalize("NFKD", text)
+    text = "".join(ch for ch in text if not unicodedata.combining(ch))
+    text = text.replace("π", "pi")
+    text = re.sub(r"[\W_]+", " ", text, flags=re.UNICODE)
+    text = re.sub(r"\s+", " ", text).strip()
+    return text
+def lexical_overlap(reference: Optional[str], answer: str) -> Optional[float]:
+    if not reference:
+        return None
+    ref = set(normalize_text(reference).split())
+    ans = set(normalize_text(answer).split())
+    if not ref:
+        return None
+    return len(ref & ans) / len(ref)
+def exact_match(reference: Optional[str], answer: str) -> Optional[bool]:
+    if not reference:
+        return None
+    return normalize_text(reference) == normalize_text(answer)
+def infer_repo_defaults(repo_dir: Path):
+    train_script = repo_dir.parent / "train_aramix_h100_full.py"
+    if not train_script.exists():
+        train_script = repo_dir / "train_aramix_h100_full.py"
+    ckpt = repo_dir / "model_best.pt"
+    if not ckpt.exists():
+        ckpt = repo_dir / "model.pt"
+    config = repo_dir / "config.json"
+    tokenizer_dir = repo_dir / "tokenizer_32k"
+    return train_script, ckpt, config, tokenizer_dir
+def safe_get(cfg: Dict[str, Any], *names: str, default=None):
+    for name in names:
+        if name in cfg:
+            return cfg[name]
+    return default
+def build_model_config_dict(cfg_json: Dict[str, Any], vocab_size: int) -> Dict[str, Any]:
+    block_size = safe_get(cfg_json, "block_size", "max_seq_len", "seq_len", default=512)
+    d_model = safe_get(cfg_json, "d_model", "n_embd", "dim", default=768)
+    n_heads = safe_get(cfg_json, "n_heads", "n_head", "num_heads", default=12)
+    n_layers = safe_get(cfg_json, "n_layers", "n_layer", "num_layers", default=12)
+    d_ff = safe_get(cfg_json, "d_ff", "ffn_dim", "intermediate_size", default=d_model * 4)
+    return {
+        "vocab_size": vocab_size,
+        "block_size": int(block_size),
+        "d_model": int(d_model),
+        "n_heads": int(n_heads),
+        "n_layers": int(n_layers),
+        "d_ff": int(d_ff),
+    }
+class AramixChatTester:
+    def __init__(
+        self,
+        repo_dir: Path,
+        train_script: Path,
+        ckpt_path: Path,
+        config_path: Path,
+        device: Optional[str] = None,
+    ):
+        self.repo_dir = repo_dir
+        self.train_script = train_script
+        self.ckpt_path = ckpt_path
+        self.config_path = config_path
+        self.device = torch.device(
+            device if device is not None else ("cuda" if torch.cuda.is_available() else "cpu")
+        )
+        self.M = load_module_from_file(self.train_script)
+        required = ["GPT", "GPTConfig", "train_or_load_tokenizer", "load_checkpoint", "DOMAINS"]
+        missing = [x for x in required if not hasattr(self.M, x)]
+        if missing:
+            raise RuntimeError(
+                f"Le fichier {self.train_script.name} ne contient pas les symboles attendus: {missing}"
+            )
+        self.cfg_json: Dict[str, Any] = {}
+        if self.config_path.exists():
+            with open(self.config_path, "r", encoding="utf-8") as f:
+                self.cfg_json = json.load(f)
+        self.tokenizer = self._load_tokenizer()
+        self.model = self._load_model()
+    def _load_tokenizer(self):
+        old_cwd = Path.cwd()
+        try:
+            os.chdir(self.repo_dir.parent)
+            tok = self.M.train_or_load_tokenizer(self.M.DOMAINS)
+        finally:
+            os.chdir(old_cwd)
+        return tok
+    def _make_gpt_config(self):
+        kwargs = build_model_config_dict(self.cfg_json, vocab_size=len(self.tokenizer))
+        try:
+            return self.M.GPTConfig(**kwargs)
+        except TypeError:
+            return self.M.GPTConfig(vocab_size=len(self.tokenizer))
+    def _load_model(self):
+        cfg = self._make_gpt_config()
+        model = self.M.GPT(cfg).to(self.device)
+        try:
+            self.M.load_checkpoint(model, None, self.ckpt_path, self.device)
+        except TypeError:
+            try:
+                self.M.load_checkpoint(model, self.ckpt_path, self.device)
+            except TypeError:
+                ckpt = torch.load(self.ckpt_path, map_location=self.device)
+                state = ckpt["model"] if isinstance(ckpt, dict) and "model" in ckpt else ckpt
+                if any(k.startswith("_orig_mod.") for k in state):
+                    state = {k.replace("_orig_mod.", ""): v for k, v in state.items()}
+                model.load_state_dict(state, strict=False)
+        model.eval()
+        return model
+    def encode_prompt(self, question: str) -> List[int]:
+        bos = getattr(self.tokenizer, "bos_token_id", None)
+        eos = getattr(self.tokenizer, "eos_token_id", None)
+        prompt = f"Question: {question}\nRéponse:"
+        ids = self.tokenizer.encode(prompt, add_special_tokens=False)
+        if bos is not None:
+            ids = [bos] + ids
+        if eos is not None and len(ids) > 0 and ids[-1] == eos:
+            ids = ids[:-1]
+        return ids
+    @torch.no_grad()
+    def generate(
+        self,
+        question: str,
+        max_new_tokens: int = 96,
+        temperature: float = 0.4,
+        top_k: int = 40,
+        repetition_penalty: float = 1.12,
+    ) -> str:
+        ids = self.encode_prompt(question)
+        x = torch.tensor([ids], dtype=torch.long, device=self.device)
+        eos_id = getattr(self.tokenizer, "eos_token_id", None)
+        block_size = getattr(getattr(self.model, "cfg", None), "block_size", None)
+        if block_size is None:
+            block_size = safe_get(self.cfg_json, "block_size", "max_seq_len", default=512)
+        for step in range(max_new_tokens):
+            x_ctx = x[:, -int(block_size):]
+            try:
+                logits, _ = self.model(x_ctx)
+            except TypeError:
+                out = self.model(x_ctx)
+                logits = out[0] if isinstance(out, tuple) else out
+            logits = logits[:, -1, :]
+            recent = x[0, -64:].tolist()
+            for tok in set(recent):
+                logits[0, tok] /= repetition_penalty
+            if temperature <= 0:
+                next_tok = torch.argmax(logits, dim=-1, keepdim=True)
+            else:
+                logits = logits / max(temperature, 1e-5)
+                if top_k is not None and top_k > 0:
+                    values, _ = torch.topk(logits, k=min(top_k, logits.size(-1)))
+                    kth = values[:, -1].unsqueeze(-1)
+                    logits = torch.where(logits < kth, torch.full_like(logits, float("-inf")), logits)
+                probs = F.softmax(logits, dim=-1)
+                next_tok = torch.multinomial(probs, num_samples=1)
+            x = torch.cat([x, next_tok], dim=1)
+            if eos_id is not None and next_tok.item() == eos_id and step >= 2:
+                break
+        new_ids = x[0, len(ids):].tolist()
+        text = self.tokenizer.decode(new_ids).strip()
+        text = re.sub(r"\s+", " ", text).strip()
+        text = text.replace("Réponse :", "").replace("Réponse:", "").strip()
+        return text
+def load_questions(path: Optional[str]) -> List[Dict[str, Any]]:
+    if not path:
+        return DEFAULT_QUESTIONS
+    with open(path, "r", encoding="utf-8") as f:
+        data = json.load(f)
+    if not isinstance(data, list):
+        raise ValueError("Le fichier questions doit contenir une liste JSON.")
+    return data
+def format_bar(score: float, width: int = 20) -> str:
+    n = max(0, min(width, int(round(score * width))))
+    return "█" * n + "░" * (width - n)
+def save_reports(output_dir: Path, summary: Dict[str, Any]) -> None:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    json_path = output_dir / "qa_test_report_simple.json"
+    txt_path = output_dir / "qa_test_report_simple.txt"
+    with open(json_path, "w", encoding="utf-8") as f:
+        json.dump(summary, f, ensure_ascii=False, indent=2)
+    with open(txt_path, "w", encoding="utf-8") as f:
+        f.write("TEST QA SIMPLE — MODÈLE DÉJÀ ENTRAÎNÉ\n")
+        f.write("=" * 60 + "\n\n")
+        for r in summary["results"]:
+            f.write(f"[{r['id']:02d}] {r['category']}\n")
+            f.write(f"  User      : {r['question']}\n")
+            f.write(f"  Assistant : {r['answer']}\n")
+            if r["reference"]:
+                f.write(f"  Référence : {r['reference']}\n")
+            if r["overlap_score"] is not None:
+                f.write(f"  Overlap   : {r['overlap_score']:.0%}\n")
+            if r["exact_match"] is not None:
+                f.write(f"  ExactMatch: {r['exact_match']}\n")
+            f.write(f"  Latence   : {r['latency_s']}s\n\n")
+def main():
+    """Command-line entry point.
+
+    Parses the arguments, resolves the training script / checkpoint / config
+    paths, loads the model, answers every question, prints a per-question
+    trace and a summary, and optionally writes the report files.
+
+    Raises:
+        FileNotFoundError: If the training script or the checkpoint is missing.
+    """
+    parser = argparse.ArgumentParser("Test QA simple pour modèle Aramix déjà entraîné")
+    parser.add_argument("--repo_dir", type=str, default="./aramix_h100")
+    parser.add_argument("--train_script", type=str, default=None)
+    parser.add_argument("--ckpt", type=str, default=None)
+    parser.add_argument("--config", type=str, default=None)
+    parser.add_argument("--questions", type=str, default=None)
+    parser.add_argument("--max_new_tokens", type=int, default=96)
+    parser.add_argument("--temperature", type=float, default=0.4)
+    parser.add_argument("--top_k", type=int, default=40)
+    parser.add_argument("--repetition_penalty", type=float, default=1.12)
+    parser.add_argument("--device", type=str, default=None)
+    parser.add_argument("--save_report", action="store_true")
+    args = parser.parse_args()
+    # Start from the paths inferred from the repo layout, then apply explicit overrides.
+    repo_dir = Path(args.repo_dir).resolve()
+    train_script, ckpt_path, config_path, tokenizer_dir = infer_repo_defaults(repo_dir)
+    if args.train_script:
+        train_script = Path(args.train_script).resolve()
+    if args.ckpt:
+        ckpt_path = Path(args.ckpt).resolve()
+    if args.config:
+        config_path = Path(args.config).resolve()
+    # The script and checkpoint are mandatory; a missing config only triggers a warning.
+    if not train_script.exists():
+        raise FileNotFoundError(f"Script train introuvable: {train_script}")
+    if not ckpt_path.exists():
+        raise FileNotFoundError(f"Checkpoint introuvable: {ckpt_path}")
+    if not config_path.exists():
+        print(f"[WARN] config.json introuvable: {config_path} — fallback sur GPTConfig(vocab_size=...).")
+    questions = load_questions(args.questions)
+    tester = AramixChatTester(
+        repo_dir=repo_dir,
+        train_script=train_script,
+        ckpt_path=ckpt_path,
+        config_path=config_path,
+        device=args.device,
+    )
+    results: List[Dict[str, Any]] = []
+    categories: Dict[str, List[Dict[str, Any]]] = {}
+    print("\n" + "═" * 70)
+    print(" TEST QA SIMPLE — MODÈLE DÉJÀ ENTRAÎNÉ")
+    print("═" * 70)
+    print(f"Repo        : {repo_dir}")
+    print(f"Train script: {train_script}")
+    print(f"Checkpoint  : {ckpt_path}")
+    print(f"Config      : {config_path}")
+    print(f"Tokenizer   : {tokenizer_dir}")
+    print(f"Device      : {tester.device}")
+    print(f"Questions   : {len(questions)}")
+    print("═" * 70 + "\n")
+    # Answer each question, time the generation and score it against the reference.
+    for i, item in enumerate(questions, 1):
+        q = item["question"]
+        ref = item.get("reference")
+        cat = item.get("category", "Général")
+        t0 = time.time()
+        ans = tester.generate(
+            q,
+            max_new_tokens=args.max_new_tokens,
+            temperature=args.temperature,
+            top_k=args.top_k,
+            repetition_penalty=args.repetition_penalty,
+        )
+        latency = time.time() - t0
+        overlap = lexical_overlap(ref, ans)
+        em = exact_match(ref, ans)
+        entry = {
+            "id": i,
+            "category": cat,
+            "question": q,
+            "answer": ans,
+            "reference": ref,
+            "latency_s": round(latency, 3),
+            # Whitespace-separated word count of the answer, not a tokenizer count.
+            "tokens_generated_approx": len(ans.split()),
+            "overlap_score": None if overlap is None else round(overlap, 4),
+            "exact_match": em,
+        }
+        results.append(entry)
+        categories.setdefault(cat, []).append(entry)
+        overlap_str = f"{overlap:.0%}" if overlap is not None else "n/a"
+        em_str = "✓" if em else ("✗" if em is not None else "n/a")
+        print("─" * 70)
+        print(f"[{i:02d}] [{cat}] overlap={overlap_str} | EM={em_str}")
+        print(f"  User      : {q}")
+        print(f"  Assistant : {ans}")
+        if ref:
+            print(f"  Référence : {ref}")
+        print(f"  ⏱ {latency:.2f}s | ~{entry['tokens_generated_approx']} mots\n")
+    # Aggregate only over entries that have a score (None scores are excluded).
+    scored_overlap = [r["overlap_score"] for r in results if r["overlap_score"] is not None]
+    scored_em = [r["exact_match"] for r in results if r["exact_match"] is not None]
+    avg_overlap = sum(scored_overlap) / len(scored_overlap) if scored_overlap else 0.0
+    em_rate = sum(1 for x in scored_em if x) / len(scored_em) if scored_em else 0.0
+    avg_latency = sum(r["latency_s"] for r in results) / len(results) if results else 0.0
+    avg_words = sum(r["tokens_generated_approx"] for r in results) / len(results) if results else 0.0
+    # NOTE: a category with no scored entry is reported as 0.0, not as "unscored".
+    cat_scores: Dict[str, float] = {}
+    for cat, items in categories.items():
+        vals = [r["overlap_score"] for r in items if r["overlap_score"] is not None]
+        cat_scores[cat] = (sum(vals) / len(vals)) if vals else 0.0
+    summary = {
+        "repo_dir": str(repo_dir),
+        "train_script": str(train_script),
+        "checkpoint": str(ckpt_path),
+        "config_path": str(config_path),
+        "tokenizer_dir": str(tokenizer_dir),
+        "device": str(tester.device),
+        "total_questions": len(results),
+        "avg_overlap_score": round(avg_overlap, 4),
+        "exact_match_rate": round(em_rate, 4),
+        "avg_latency_s": round(avg_latency, 3),
+        "avg_words_generated": round(avg_words, 1),
+        "scores_by_category": {k: round(v, 4) for k, v in cat_scores.items()},
+        "results": results,
+    }
+    print("═" * 70)
+    print(" RÉSUMÉ")
+    print("═" * 70)
+    print(f"Questions testées : {len(results)}")
+    print(f"Overlap moyen     : {avg_overlap:.1%}")
+    print(f"Exact match       : {em_rate:.1%}")
+    print(f"Latence moyenne   : {avg_latency:.2f}s")
+    print(f"Mots moyens       : {avg_words:.1f}")
+    print("Scores / catégorie:")
+    for cat, score in sorted(cat_scores.items()):
+        print(f"  {cat:<15} {format_bar(score)} {score:.0%}")
+    print("═" * 70)
+    # Reports are written into the repo directory itself.
+    if args.save_report:
+        save_reports(repo_dir, summary)
+        print(f"Rapports sauvegardés dans : {repo_dir / 'qa_test_report_simple.json'}")
+        print(f"                           {repo_dir / 'qa_test_report_simple.txt'}")
+if __name__ == "__main__":
+    main()

simple_qa_test_aramix_v2.py ADDED Viewed

	@@ -0,0 +1,472 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+simple_qa_test_aramix_v2.py
+Version corrigée du test QA simple pour repo Aramix.
+Correction principale :
+- gère les checkpoints sauvés depuis torch.compile() avec préfixe "_orig_mod."
+- contourne load_checkpoint() du script train si celui-ci échoue sur ce cas
+Usage
+-----
+python simple_qa_test_aramix_v2.py --repo_dir ./aramix_h100 --save_report
+"""
+from __future__ import annotations
+import argparse
+import importlib.util
+import json
+import os
+import re
+import sys
+import time
+import unicodedata
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+import torch
+import torch.nn.functional as F
+# Built-in French question set returned by load_questions() when no file is given.
+# Each entry carries a "category", a "question" and a "reference" answer;
+# a reference of None gets no overlap / exact-match score.
+DEFAULT_QUESTIONS = [
+    {"category": "Géographie", "question": "Quelle est la capitale de la France ?", "reference": "Paris"},
+    {"category": "Géographie", "question": "Quel est le plus long fleuve d'Afrique ?", "reference": "Le Nil"},
+    {"category": "Science", "question": "Qu'est-ce que la photosynthèse ?", "reference": "Processus par lequel les plantes convertissent la lumière en énergie"},
+    {"category": "Science", "question": "Combien d'os compte le corps humain adulte ?", "reference": "206"},
+    {"category": "Histoire", "question": "En quelle année a eu lieu la Révolution française ?", "reference": "1789"},
+    {"category": "Histoire", "question": "Qui a écrit Les Misérables ?", "reference": "Victor Hugo"},
+    {"category": "Mathématiques", "question": "Quelle est la formule de l'aire d'un cercle ?", "reference": "pi r carre"},
+    {"category": "Langage", "question": "Donne un synonyme du mot heureux.", "reference": "joyeux"},
+    {"category": "Raisonnement", "question": "Si j'ai 5 pommes et j'en donne 2, combien m'en reste-t-il ?", "reference": "3"},
+    {"category": "Dialogue", "question": "Comment vas-tu aujourd'hui ?", "reference": None},
+]
+def load_module_from_file(py_path: Path):
+    spec = importlib.util.spec_from_file_location(py_path.stem, py_path)
+    if spec is None or spec.loader is None:
+        raise RuntimeError(f"Impossible de charger le module: {py_path}")
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[py_path.stem] = module
+    spec.loader.exec_module(module)
+    return module
+def normalize_text(text: str) -> str:
+    text = (text or "").strip().lower()
+    text = unicodedata.normalize("NFKD", text)
+    text = "".join(ch for ch in text if not unicodedata.combining(ch))
+    text = text.replace("π", "pi")
+    text = re.sub(r"[\W_]+", " ", text, flags=re.UNICODE)
+    text = re.sub(r"\s+", " ", text).strip()
+    return text
+def lexical_overlap(reference: Optional[str], answer: str) -> Optional[float]:
+    if not reference:
+        return None
+    ref = set(normalize_text(reference).split())
+    ans = set(normalize_text(answer).split())
+    if not ref:
+        return None
+    return len(ref & ans) / len(ref)
+def exact_match(reference: Optional[str], answer: str) -> Optional[bool]:
+    if not reference:
+        return None
+    return normalize_text(reference) == normalize_text(answer)
+def infer_repo_defaults(repo_dir: Path):
+    train_script = repo_dir.parent / "train_aramix_h100_full.py"
+    if not train_script.exists():
+        train_script = repo_dir / "train_aramix_h100_full.py"
+    ckpt = repo_dir / "model_best.pt"
+    if not ckpt.exists():
+        ckpt = repo_dir / "model.pt"
+    config = repo_dir / "config.json"
+    tokenizer_dir = repo_dir / "tokenizer_32k"
+    return train_script, ckpt, config, tokenizer_dir
+def safe_get(cfg: Dict[str, Any], *names: str, default=None):
+    for name in names:
+        if name in cfg:
+            return cfg[name]
+    return default
+def build_model_config_dict(cfg_json: Dict[str, Any], vocab_size: int) -> Dict[str, Any]:
+    block_size = safe_get(cfg_json, "block_size", "max_seq_len", "seq_len", default=512)
+    d_model = safe_get(cfg_json, "d_model", "n_embd", "dim", default=768)
+    n_heads = safe_get(cfg_json, "n_heads", "n_head", "num_heads", default=12)
+    n_layers = safe_get(cfg_json, "n_layers", "n_layer", "num_layers", default=12)
+    d_ff = safe_get(cfg_json, "d_ff", "ffn_dim", "intermediate_size", default=d_model * 4)
+    return {
+        "vocab_size": vocab_size,
+        "block_size": int(block_size),
+        "d_model": int(d_model),
+        "n_heads": int(n_heads),
+        "n_layers": int(n_layers),
+        "d_ff": int(d_ff),
+    }
+def strip_orig_mod_prefix(state: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+    if any(k.startswith("_orig_mod.") for k in state.keys()):
+        return {k.replace("_orig_mod.", "", 1): v for k, v in state.items()}
+    return state
+class AramixChatTester:
+    def __init__(
+        self,
+        repo_dir: Path,
+        train_script: Path,
+        ckpt_path: Path,
+        config_path: Path,
+        device: Optional[str] = None,
+    ):
+        self.repo_dir = repo_dir
+        self.train_script = train_script
+        self.ckpt_path = ckpt_path
+        self.config_path = config_path
+        self.device = torch.device(device if device is not None else ("cuda" if torch.cuda.is_available() else "cpu"))
+        self.M = load_module_from_file(self.train_script)
+        required = ["GPT", "GPTConfig", "train_or_load_tokenizer", "DOMAINS"]
+        missing = [x for x in required if not hasattr(self.M, x)]
+        if missing:
+            raise RuntimeError(f"Le fichier {self.train_script.name} ne contient pas les symboles attendus: {missing}")
+        self.cfg_json: Dict[str, Any] = {}
+        if self.config_path.exists():
+            with open(self.config_path, "r", encoding="utf-8") as f:
+                self.cfg_json = json.load(f)
+        self.tokenizer = self._load_tokenizer()
+        self.model = self._load_model()
+    def _load_tokenizer(self):
+        old_cwd = Path.cwd()
+        try:
+            os.chdir(self.repo_dir.parent)
+            tok = self.M.train_or_load_tokenizer(self.M.DOMAINS)
+        finally:
+            os.chdir(old_cwd)
+        return tok
+    def _make_gpt_config(self):
+        kwargs = build_model_config_dict(self.cfg_json, vocab_size=len(self.tokenizer))
+        try:
+            return self.M.GPTConfig(**kwargs)
+        except TypeError:
+            return self.M.GPTConfig(vocab_size=len(self.tokenizer))
+    def _manual_load_state(self, model: torch.nn.Module):
+        ckpt = torch.load(self.ckpt_path, map_location=self.device)
+        if isinstance(ckpt, dict) and "model" in ckpt:
+            state = ckpt["model"]
+        else:
+            state = ckpt
+        if not isinstance(state, dict):
+            raise RuntimeError("Checkpoint non reconnu: pas de state_dict exploitable.")
+        state = strip_orig_mod_prefix(state)
+        missing, unexpected = model.load_state_dict(state, strict=False)
+        # tolérance uniquement sur lm_head/tied weights éventuels, sinon on échoue
+        hard_missing = [k for k in missing if not k.endswith("lm_head.weight")]
+        hard_unexpected = [k for k in unexpected if not k.startswith("_orig_mod.")]
+        if hard_missing or hard_unexpected:
+            raise RuntimeError(
+                "Chargement manuel incomplet.\n"
+                f"Missing: {hard_missing[:20]}\n"
+                f"Unexpected: {hard_unexpected[:20]}"
+            )
+    def _load_model(self):
+        cfg = self._make_gpt_config()
+        model = self.M.GPT(cfg).to(self.device)
+        # 1) tentative via load_checkpoint du script train
+        if hasattr(self.M, "load_checkpoint"):
+            try:
+                try:
+                    self.M.load_checkpoint(model, None, self.ckpt_path, self.device)
+                    model.eval()
+                    return model
+                except TypeError:
+                    self.M.load_checkpoint(model, self.ckpt_path, self.device)
+                    model.eval()
+                    return model
+            except RuntimeError as e:
+                msg = str(e)
+                if "_orig_mod." not in msg:
+                    raise
+        # 2) fallback robuste
+        self._manual_load_state(model)
+        model.eval()
+        return model
+    def encode_prompt(self, question: str) -> List[int]:
+        bos = getattr(self.tokenizer, "bos_token_id", None)
+        eos = getattr(self.tokenizer, "eos_token_id", None)
+        prompt = f"Question: {question}\nRéponse:"
+        ids = self.tokenizer.encode(prompt, add_special_tokens=False)
+        if bos is not None:
+            ids = [bos] + ids
+        if eos is not None and ids and ids[-1] == eos:
+            ids = ids[:-1]
+        return ids
+    @torch.no_grad()
+    def generate(
+        self,
+        question: str,
+        max_new_tokens: int = 96,
+        temperature: float = 0.4,
+        top_k: int = 40,
+        repetition_penalty: float = 1.12,
+    ) -> str:
+        ids = self.encode_prompt(question)
+        x = torch.tensor([ids], dtype=torch.long, device=self.device)
+        eos_id = getattr(self.tokenizer, "eos_token_id", None)
+        block_size = getattr(getattr(self.model, "cfg", None), "block_size", None)
+        if block_size is None:
+            block_size = safe_get(self.cfg_json, "block_size", "max_seq_len", default=512)
+        for step in range(max_new_tokens):
+            x_ctx = x[:, -int(block_size):]
+            out = self.model(x_ctx)
+            logits = out[0] if isinstance(out, tuple) else out
+            logits = logits[:, -1, :]
+            recent = x[0, -64:].tolist()
+            for tok in set(recent):
+                logits[0, tok] /= repetition_penalty
+            if temperature <= 0:
+                next_tok = torch.argmax(logits, dim=-1, keepdim=True)
+            else:
+                logits = logits / max(temperature, 1e-5)
+                if top_k is not None and top_k > 0:
+                    values, _ = torch.topk(logits, k=min(top_k, logits.size(-1)))
+                    kth = values[:, -1].unsqueeze(-1)
+                    logits = torch.where(logits < kth, torch.full_like(logits, float("-inf")), logits)
+                probs = F.softmax(logits, dim=-1)
+                next_tok = torch.multinomial(probs, num_samples=1)
+            x = torch.cat([x, next_tok], dim=1)
+            if eos_id is not None and next_tok.item() == eos_id and step >= 2:
+                break
+        new_ids = x[0, len(ids):].tolist()
+        text = self.tokenizer.decode(new_ids).strip()
+        text = re.sub(r"\s+", " ", text).strip()
+        text = text.replace("Réponse :", "").replace("Réponse:", "").strip()
+        return text
+def load_questions(path: Optional[str]) -> List[Dict[str, Any]]:
+    if not path:
+        return DEFAULT_QUESTIONS
+    with open(path, "r", encoding="utf-8") as f:
+        data = json.load(f)
+    if not isinstance(data, list):
+        raise ValueError("Le fichier questions doit contenir une liste JSON.")
+    return data
+def format_bar(score: float, width: int = 20) -> str:
+    n = max(0, min(width, int(round(score * width))))
+    return "█" * n + "░" * (width - n)
+def save_reports(output_dir: Path, summary: Dict[str, Any]) -> None:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    json_path = output_dir / "qa_test_report_simple.json"
+    txt_path = output_dir / "qa_test_report_simple.txt"
+    with open(json_path, "w", encoding="utf-8") as f:
+        json.dump(summary, f, ensure_ascii=False, indent=2)
+    with open(txt_path, "w", encoding="utf-8") as f:
+        f.write("TEST QA SIMPLE — MODÈLE DÉJÀ ENTRAÎNÉ\n")
+        f.write("=" * 60 + "\n\n")
+        for r in summary["results"]:
+            f.write(f"[{r['id']:02d}] {r['category']}\n")
+            f.write(f"  User      : {r['question']}\n")
+            f.write(f"  Assistant : {r['answer']}\n")
+            if r["reference"]:
+                f.write(f"  Référence : {r['reference']}\n")
+            if r["overlap_score"] is not None:
+                f.write(f"  Overlap   : {r['overlap_score']:.0%}\n")
+            if r["exact_match"] is not None:
+                f.write(f"  ExactMatch: {r['exact_match']}\n")
+            f.write(f"  Latence   : {r['latency_s']}s\n\n")
+def main():
+    """Command-line entry point.
+
+    Parses the arguments, resolves the training script / checkpoint / config
+    paths, loads the model, answers every question, prints a per-question
+    trace and a summary, and optionally writes the report files.
+
+    Raises:
+        FileNotFoundError: If the training script or the checkpoint is missing.
+    """
+    parser = argparse.ArgumentParser("Test QA simple pour modèle Aramix déjà entraîné")
+    parser.add_argument("--repo_dir", type=str, default="./aramix_h100")
+    parser.add_argument("--train_script", type=str, default=None)
+    parser.add_argument("--ckpt", type=str, default=None)
+    parser.add_argument("--config", type=str, default=None)
+    parser.add_argument("--questions", type=str, default=None)
+    parser.add_argument("--max_new_tokens", type=int, default=96)
+    parser.add_argument("--temperature", type=float, default=0.4)
+    parser.add_argument("--top_k", type=int, default=40)
+    parser.add_argument("--repetition_penalty", type=float, default=1.12)
+    parser.add_argument("--device", type=str, default=None)
+    parser.add_argument("--save_report", action="store_true")
+    args = parser.parse_args()
+    # Start from the paths inferred from the repo layout, then apply explicit overrides.
+    repo_dir = Path(args.repo_dir).resolve()
+    train_script, ckpt_path, config_path, tokenizer_dir = infer_repo_defaults(repo_dir)
+    if args.train_script:
+        train_script = Path(args.train_script).resolve()
+    if args.ckpt:
+        ckpt_path = Path(args.ckpt).resolve()
+    if args.config:
+        config_path = Path(args.config).resolve()
+    # The script and checkpoint are mandatory; a missing config only triggers a warning.
+    if not train_script.exists():
+        raise FileNotFoundError(f"Script train introuvable: {train_script}")
+    if not ckpt_path.exists():
+        raise FileNotFoundError(f"Checkpoint introuvable: {ckpt_path}")
+    if not config_path.exists():
+        print(f"[WARN] config.json introuvable: {config_path} — fallback sur GPTConfig(vocab_size=...).")
+    questions = load_questions(args.questions)
+    tester = AramixChatTester(
+        repo_dir=repo_dir,
+        train_script=train_script,
+        ckpt_path=ckpt_path,
+        config_path=config_path,
+        device=args.device,
+    )
+    results: List[Dict[str, Any]] = []
+    categories: Dict[str, List[Dict[str, Any]]] = {}
+    print("\n" + "═" * 70)
+    print(" TEST QA SIMPLE — MODÈLE DÉJÀ ENTRAÎNÉ")
+    print("═" * 70)
+    print(f"Repo        : {repo_dir}")
+    print(f"Train script: {train_script}")
+    print(f"Checkpoint  : {ckpt_path}")
+    print(f"Config      : {config_path}")
+    print(f"Tokenizer   : {tokenizer_dir}")
+    print(f"Device      : {tester.device}")
+    print(f"Questions   : {len(questions)}")
+    print("═" * 70 + "\n")
+    # Answer each question, time the generation and score it against the reference.
+    for i, item in enumerate(questions, 1):
+        q = item["question"]
+        ref = item.get("reference")
+        cat = item.get("category", "Général")
+        t0 = time.time()
+        ans = tester.generate(
+            q,
+            max_new_tokens=args.max_new_tokens,
+            temperature=args.temperature,
+            top_k=args.top_k,
+            repetition_penalty=args.repetition_penalty,
+        )
+        latency = time.time() - t0
+        overlap = lexical_overlap(ref, ans)
+        em = exact_match(ref, ans)
+        entry = {
+            "id": i,
+            "category": cat,
+            "question": q,
+            "answer": ans,
+            "reference": ref,
+            "latency_s": round(latency, 3),
+            # Whitespace-separated word count of the answer, not a tokenizer count.
+            "tokens_generated_approx": len(ans.split()),
+            "overlap_score": None if overlap is None else round(overlap, 4),
+            "exact_match": em,
+        }
+        results.append(entry)
+        categories.setdefault(cat, []).append(entry)
+        overlap_str = f"{overlap:.0%}" if overlap is not None else "n/a"
+        em_str = "✓" if em else ("✗" if em is not None else "n/a")
+        print("─" * 70)
+        print(f"[{i:02d}] [{cat}] overlap={overlap_str} | EM={em_str}")
+        print(f"  User      : {q}")
+        print(f"  Assistant : {ans}")
+        if ref:
+            print(f"  Référence : {ref}")
+        print(f"  ⏱ {latency:.2f}s | ~{entry['tokens_generated_approx']} mots\n")
+    # Aggregate only over entries that have a score (None scores are excluded).
+    scored_overlap = [r["overlap_score"] for r in results if r["overlap_score"] is not None]
+    scored_em = [r["exact_match"] for r in results if r["exact_match"] is not None]
+    avg_overlap = sum(scored_overlap) / len(scored_overlap) if scored_overlap else 0.0
+    em_rate = sum(1 for x in scored_em if x) / len(scored_em) if scored_em else 0.0
+    avg_latency = sum(r["latency_s"] for r in results) / len(results) if results else 0.0
+    avg_words = sum(r["tokens_generated_approx"] for r in results) / len(results) if results else 0.0
+    # NOTE: a category with no scored entry is reported as 0.0, not as "unscored".
+    cat_scores: Dict[str, float] = {}
+    for cat, items in categories.items():
+        vals = [r["overlap_score"] for r in items if r["overlap_score"] is not None]
+        cat_scores[cat] = (sum(vals) / len(vals)) if vals else 0.0
+    summary = {
+        "repo_dir": str(repo_dir),
+        "train_script": str(train_script),
+        "checkpoint": str(ckpt_path),
+        "config_path": str(config_path),
+        "tokenizer_dir": str(tokenizer_dir),
+        "device": str(tester.device),
+        "total_questions": len(results),
+        "avg_overlap_score": round(avg_overlap, 4),
+        "exact_match_rate": round(em_rate, 4),
+        "avg_latency_s": round(avg_latency, 3),
+        "avg_words_generated": round(avg_words, 1),
+        "scores_by_category": {k: round(v, 4) for k, v in cat_scores.items()},
+        "results": results,
+    }
+    print("═" * 70)
+    print(" RÉSUMÉ")
+    print("═" * 70)
+    print(f"Questions testées : {len(results)}")
+    print(f"Overlap moyen     : {avg_overlap:.1%}")
+    print(f"Exact match       : {em_rate:.1%}")
+    print(f"Latence moyenne   : {avg_latency:.2f}s")
+    print(f"Mots moyens       : {avg_words:.1f}")
+    print("Scores / catégorie:")
+    for cat, score in sorted(cat_scores.items()):
+        print(f"  {cat:<15} {format_bar(score)} {score:.0%}")
+    print("═" * 70)
+    # Reports are written into the repo directory itself.
+    if args.save_report:
+        save_reports(repo_dir, summary)
+        print(f"Rapports sauvegardés dans : {repo_dir / 'qa_test_report_simple.json'}")
+        print(f"                           {repo_dir / 'qa_test_report_simple.txt'}")
+if __name__ == "__main__":
+    main()

simple_qa_test_aramix_v3.py ADDED Viewed

	@@ -0,0 +1,583 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+simple_qa_test_aramix_v3.py
+Test QA simple et strict pour un modèle déjà entraîné dans une repo de type :
+- train_aramix_h100_full.py
+- aramix_h100/
+    - config.json
+    - model_best.pt
+    - model.pt
+    - tokenizer_32k/
+Cette version améliore le test QA en :
+- gérant les checkpoints torch.compile() avec préfixe "_orig_mod."
+- utilisant un prompt plus directif pour des réponses courtes
+- privilégiant une génération greedy / quasi-greedy
+- tronquant proprement les réponses trop longues
+- ajoutant une métrique "contains_reference"
+Usage
+-----
+python simple_qa_test_aramix_v3.py --repo_dir ./aramix_h100 --save_report
+python simple_qa_test_aramix_v3.py --repo_dir ./aramix_h100 --temperature 0 --max_new_tokens 16
+python simple_qa_test_aramix_v3.py --repo_dir ./aramix_h100 --ckpt ./aramix_h100/model_best.pt
+python simple_qa_test_aramix_v3.py --repo_dir ./aramix_h100 --questions qa_questions.json
+"""
+from __future__ import annotations
+import argparse
+import importlib.util
+import json
+import os
+import re
+import sys
+import time
+import unicodedata
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+import torch
+import torch.nn.functional as F
+# Built-in French question set (presumably the fallback when no --questions
+# file is supplied — the loader is outside this excerpt).
+# Each entry carries a "category", a "question" and a "reference" answer;
+# the scoring helpers return None for entries whose reference is None.
+DEFAULT_QUESTIONS = [
+    {"category": "Géographie", "question": "Quelle est la capitale de la France ?", "reference": "Paris"},
+    {"category": "Géographie", "question": "Quel est le plus long fleuve d'Afrique ?", "reference": "Le Nil"},
+    {"category": "Science", "question": "Qu'est-ce que la photosynthèse ?", "reference": "Processus par lequel les plantes convertissent la lumière en énergie"},
+    {"category": "Science", "question": "Combien d'os compte le corps humain adulte ?", "reference": "206"},
+    {"category": "Histoire", "question": "En quelle année a eu lieu la Révolution française ?", "reference": "1789"},
+    {"category": "Histoire", "question": "Qui a écrit Les Misérables ?", "reference": "Victor Hugo"},
+    {"category": "Mathématiques", "question": "Quelle est la formule de l'aire d'un cercle ?", "reference": "pi r carre"},
+    {"category": "Langage", "question": "Donne un synonyme du mot heureux.", "reference": "joyeux"},
+    {"category": "Raisonnement", "question": "Si j'ai 5 pommes et j'en donne 2, combien m'en reste-t-il ?", "reference": "3"},
+    {"category": "Dialogue", "question": "Comment vas-tu aujourd'hui ?", "reference": None},
+]
+def load_module_from_file(py_path: Path):
+    spec = importlib.util.spec_from_file_location(py_path.stem, py_path)
+    if spec is None or spec.loader is None:
+        raise RuntimeError(f"Impossible de charger le module: {py_path}")
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[py_path.stem] = module
+    spec.loader.exec_module(module)
+    return module
+def normalize_text(text: str) -> str:
+    text = (text or "").strip().lower()
+    text = unicodedata.normalize("NFKD", text)
+    text = "".join(ch for ch in text if not unicodedata.combining(ch))
+    text = text.replace("π", "pi")
+    text = re.sub(r"[\W_]+", " ", text, flags=re.UNICODE)
+    text = re.sub(r"\s+", " ", text).strip()
+    return text
+def lexical_overlap(reference: Optional[str], answer: str) -> Optional[float]:
+    if not reference:
+        return None
+    ref = set(normalize_text(reference).split())
+    ans = set(normalize_text(answer).split())
+    if not ref:
+        return None
+    return len(ref & ans) / len(ref)
+def exact_match(reference: Optional[str], answer: str) -> Optional[bool]:
+    if not reference:
+        return None
+    return normalize_text(reference) == normalize_text(answer)
+def contains_reference(reference: Optional[str], answer: str) -> Optional[bool]:
+    if not reference:
+        return None
+    ref = normalize_text(reference)
+    ans = normalize_text(answer)
+    if not ref:
+        return None
+    return ref in ans
+def infer_repo_defaults(repo_dir: Path):
+    train_script = repo_dir.parent / "train_aramix_h100_full.py"
+    if not train_script.exists():
+        train_script = repo_dir / "train_aramix_h100_full.py"
+    ckpt = repo_dir / "model_best.pt"
+    if not ckpt.exists():
+        ckpt = repo_dir / "model.pt"
+    config = repo_dir / "config.json"
+    tokenizer_dir = repo_dir / "tokenizer_32k"
+    return train_script, ckpt, config, tokenizer_dir
+def safe_get(cfg: Dict[str, Any], *names: str, default=None):
+    for name in names:
+        if name in cfg:
+            return cfg[name]
+    return default
+def build_model_config_dict(cfg_json: Dict[str, Any], vocab_size: int) -> Dict[str, Any]:
+    block_size = safe_get(cfg_json, "block_size", "max_seq_len", "seq_len", default=512)
+    d_model = safe_get(cfg_json, "d_model", "n_embd", "dim", default=768)
+    n_heads = safe_get(cfg_json, "n_heads", "n_head", "num_heads", default=12)
+    n_layers = safe_get(cfg_json, "n_layers", "n_layer", "num_layers", default=12)
+    d_ff = safe_get(cfg_json, "d_ff", "ffn_dim", "intermediate_size", default=d_model * 4)
+    return {
+        "vocab_size": vocab_size,
+        "block_size": int(block_size),
+        "d_model": int(d_model),
+        "n_heads": int(n_heads),
+        "n_layers": int(n_layers),
+        "d_ff": int(d_ff),
+    }
+def strip_orig_mod_prefix(state: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+    if any(k.startswith("_orig_mod.") for k in state.keys()):
+        return {k.replace("_orig_mod.", "", 1): v for k, v in state.items()}
+    return state
+def clean_answer_text(text: str, max_words: int = 16) -> str:
+    text = (text or "").strip()
+    # Retire quelques marqueurs fréquents
+    text = text.replace("<eos>", " ")
+    text = text.replace("</s>", " ")
+    text = text.replace("<pad>", " ")
+    text = text.replace("Réponse :", " ")
+    text = text.replace("Réponse:", " ")
+    text = text.replace("Answer:", " ")
+    # Garde la première ligne
+    text = text.split("\n")[0].strip()
+    # Coupe à la première vraie fin de phrase courte
+    m = re.search(r"([.!?])", text)
+    if m and m.start() < 120:
+        text = text[: m.start() + 1]
+    # Compacte les espaces
+    text = re.sub(r"\s+", " ", text).strip()
+    # Tronque au nombre de mots voulu
+    words = text.split()
+    if len(words) > max_words:
+        text = " ".join(words[:max_words]).strip()
+    # Supprime ponctuation finale excessive
+    text = re.sub(r"[,\s;:]+$", "", text).strip()
+    return text
+def safe_top_k_logits(logits: torch.Tensor, top_k: int) -> torch.Tensor:
+    if top_k is None or top_k <= 0:
+        return logits
+    k = min(int(top_k), logits.size(-1))
+    values, _ = torch.topk(logits, k=k)
+    kth = values[:, -1].unsqueeze(-1)
+    return torch.where(logits < kth, torch.full_like(logits, float("-inf")), logits)
+class AramixChatTester:
+    def __init__(
+        self,
+        repo_dir: Path,
+        train_script: Path,
+        ckpt_path: Path,
+        config_path: Path,
+        device: Optional[str] = None,
+        prompt_style: str = "strict_qa",
+    ):
+        self.repo_dir = repo_dir
+        self.train_script = train_script
+        self.ckpt_path = ckpt_path
+        self.config_path = config_path
+        self.prompt_style = prompt_style
+        self.device = torch.device(device if device is not None else ("cuda" if torch.cuda.is_available() else "cpu"))
+        self.M = load_module_from_file(self.train_script)
+        required = ["GPT", "GPTConfig", "train_or_load_tokenizer", "DOMAINS"]
+        missing = [x for x in required if not hasattr(self.M, x)]
+        if missing:
+            raise RuntimeError(f"Le fichier {self.train_script.name} ne contient pas les symboles attendus: {missing}")
+        self.cfg_json: Dict[str, Any] = {}
+        if self.config_path.exists():
+            with open(self.config_path, "r", encoding="utf-8") as f:
+                self.cfg_json = json.load(f)
+        self.tokenizer = self._load_tokenizer()
+        self.model = self._load_model()
+    def _load_tokenizer(self):
+        old_cwd = Path.cwd()
+        try:
+            os.chdir(self.repo_dir.parent)
+            tok = self.M.train_or_load_tokenizer(self.M.DOMAINS)
+        finally:
+            os.chdir(old_cwd)
+        return tok
+    def _make_gpt_config(self):
+        kwargs = build_model_config_dict(self.cfg_json, vocab_size=len(self.tokenizer))
+        try:
+            return self.M.GPTConfig(**kwargs)
+        except TypeError:
+            return self.M.GPTConfig(vocab_size=len(self.tokenizer))
+    def _manual_load_state(self, model: torch.nn.Module):
+        ckpt = torch.load(self.ckpt_path, map_location=self.device)
+        if isinstance(ckpt, dict) and "model" in ckpt:
+            state = ckpt["model"]
+        else:
+            state = ckpt
+        if not isinstance(state, dict):
+            raise RuntimeError("Checkpoint non reconnu: pas de state_dict exploitable.")
+        state = strip_orig_mod_prefix(state)
+        missing, unexpected = model.load_state_dict(state, strict=False)
+        # tolérance minimale uniquement sur le tying éventuel
+        hard_missing = [k for k in missing if not k.endswith("lm_head.weight")]
+        hard_unexpected = [k for k in unexpected if not k.startswith("_orig_mod.")]
+        if hard_missing or hard_unexpected:
+            raise RuntimeError(
+                "Chargement manuel incomplet.\n"
+                f"Missing: {hard_missing[:20]}\n"
+                f"Unexpected: {hard_unexpected[:20]}"
+            )
+    def _load_model(self):
+        cfg = self._make_gpt_config()
+        model = self.M.GPT(cfg).to(self.device)
+        if hasattr(self.M, "load_checkpoint"):
+            try:
+                try:
+                    self.M.load_checkpoint(model, None, self.ckpt_path, self.device)
+                    model.eval()
+                    return model
+                except TypeError:
+                    self.M.load_checkpoint(model, self.ckpt_path, self.device)
+                    model.eval()
+                    return model
+            except RuntimeError as e:
+                if "_orig_mod." not in str(e):
+                    raise
+        self._manual_load_state(model)
+        model.eval()
+        return model
+    def build_prompt(self, question: str) -> str:
+        if self.prompt_style == "strict_qa":
+            return (
+                "Réponds très brièvement et uniquement en français.\n"
+                "Donne seulement la réponse finale, sans explication.\n\n"
+                f"Question : {question}\n"
+                "Réponse :"
+            )
+        if self.prompt_style == "qa":
+            return f"Question: {question}\nRéponse:"
+        return question
+    def encode_prompt(self, question: str) -> List[int]:
+        bos = getattr(self.tokenizer, "bos_token_id", None)
+        eos = getattr(self.tokenizer, "eos_token_id", None)
+        prompt = self.build_prompt(question)
+        ids = self.tokenizer.encode(prompt, add_special_tokens=False)
+        if bos is not None:
+            ids = [bos] + ids
+        if eos is not None and ids and ids[-1] == eos:
+            ids = ids[:-1]
+        return ids
+    @torch.no_grad()
+    def generate(
+        self,
+        question: str,
+        max_new_tokens: int = 16,
+        temperature: float = 0.0,
+        top_k: int = 1,
+        repetition_penalty: float = 1.10,
+        max_answer_words: int = 16,
+    ) -> str:
+        ids = self.encode_prompt(question)
+        x = torch.tensor([ids], dtype=torch.long, device=self.device)
+        eos_id = getattr(self.tokenizer, "eos_token_id", None)
+        block_size = getattr(getattr(self.model, "cfg", None), "block_size", None)
+        if block_size is None:
+            block_size = safe_get(self.cfg_json, "block_size", "max_seq_len", default=512)
+        generated_word_budget_hit = False
+        for step in range(max_new_tokens):
+            x_ctx = x[:, -int(block_size):]
+            out = self.model(x_ctx)
+            logits = out[0] if isinstance(out, tuple) else out
+            logits = logits[:, -1, :]
+            # Pénalité légère de répétition
+            recent = x[0, -48:].tolist()
+            for tok in set(recent):
+                logits[0, tok] /= repetition_penalty
+            # Greedy par défaut pour QA courte
+            if temperature <= 0:
+                if top_k is not None and top_k > 1:
+                    masked = safe_top_k_logits(logits, top_k)
+                    probs = F.softmax(masked, dim=-1)
+                    next_tok = torch.multinomial(probs, num_samples=1)
+                else:
+                    next_tok = torch.argmax(logits, dim=-1, keepdim=True)
+            else:
+                logits = logits / max(temperature, 1e-5)
+                logits = safe_top_k_logits(logits, top_k)
+                probs = F.softmax(logits, dim=-1)
+                next_tok = torch.multinomial(probs, num_samples=1)
+            x = torch.cat([x, next_tok], dim=1)
+            if eos_id is not None and next_tok.item() == eos_id and step >= 1:
+                break
+            # arrêt anticipé si la réponse devient déjà trop longue
+            partial = self.tokenizer.decode(x[0, len(ids):].tolist()).strip()
+            partial = clean_answer_text(partial, max_words=max_answer_words)
+            if len(partial.split()) >= max_answer_words:
+                generated_word_budget_hit = True
+                break
+        new_ids = x[0, len(ids):].tolist()
+        text = self.tokenizer.decode(new_ids).strip()
+        text = clean_answer_text(text, max_words=max_answer_words)
+        if generated_word_budget_hit:
+            text = clean_answer_text(text, max_words=max_answer_words)
+        return text
+def load_questions(path: Optional[str]) -> List[Dict[str, Any]]:
+    if not path:
+        return DEFAULT_QUESTIONS
+    with open(path, "r", encoding="utf-8") as f:
+        data = json.load(f)
+    if not isinstance(data, list):
+        raise ValueError("Le fichier questions doit contenir une liste JSON.")
+    return data
+def format_bar(score: float, width: int = 20) -> str:
+    n = max(0, min(width, int(round(score * width))))
+    return "█" * n + "░" * (width - n)
+def save_reports(output_dir: Path, summary: Dict[str, Any]) -> Tuple[Path, Path]:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    json_path = output_dir / "qa_test_report_simple_v3.json"
+    txt_path = output_dir / "qa_test_report_simple_v3.txt"
+    with open(json_path, "w", encoding="utf-8") as f:
+        json.dump(summary, f, ensure_ascii=False, indent=2)
+    with open(txt_path, "w", encoding="utf-8") as f:
+        f.write("TEST QA SIMPLE — MODÈLE DÉJÀ ENTRAÎNÉ (V3)\n")
+        f.write("=" * 60 + "\n\n")
+        for r in summary["results"]:
+            f.write(f"[{r['id']:02d}] {r['category']}\n")
+            f.write(f"  User      : {r['question']}\n")
+            f.write(f"  Assistant : {r['answer']}\n")
+            if r["reference"]:
+                f.write(f"  Référence : {r['reference']}\n")
+            if r["overlap_score"] is not None:
+                f.write(f"  Overlap   : {r['overlap_score']:.0%}\n")
+            if r["exact_match"] is not None:
+                f.write(f"  ExactMatch: {r['exact_match']}\n")
+            if r["contains_reference"] is not None:
+                f.write(f"  Contains  : {r['contains_reference']}\n")
+            f.write(f"  Latence   : {r['latency_s']}s\n\n")
+    return json_path, txt_path
+def main():
+    """Command-line entry point: run the QA set against a trained model.
+    Parses the CLI options, resolves script/checkpoint/config paths (inferred
+    from ``--repo_dir`` unless overridden), loads the model through
+    ``AramixChatTester``, generates an answer per question, scores each answer
+    (lexical overlap, exact match, reference containment), prints a per-question
+    trace and a summary, and optionally saves JSON/TXT reports in the repo dir.
+    Raises:
+        FileNotFoundError: if the training script or the checkpoint is missing.
+    """
+    parser = argparse.ArgumentParser("Test QA simple et strict pour modèle Aramix déjà entraîné")
+    parser.add_argument("--repo_dir", type=str, default="./aramix_h100")
+    parser.add_argument("--train_script", type=str, default=None)
+    parser.add_argument("--ckpt", type=str, default=None)
+    parser.add_argument("--config", type=str, default=None)
+    parser.add_argument("--questions", type=str, default=None)
+    parser.add_argument("--max_new_tokens", type=int, default=16)
+    parser.add_argument("--temperature", type=float, default=0.0)
+    parser.add_argument("--top_k", type=int, default=1)
+    parser.add_argument("--repetition_penalty", type=float, default=1.10)
+    parser.add_argument("--max_answer_words", type=int, default=16)
+    parser.add_argument("--prompt_style", type=str, default="strict_qa", choices=["strict_qa", "qa", "raw"])
+    parser.add_argument("--device", type=str, default=None)
+    parser.add_argument("--save_report", action="store_true")
+    args = parser.parse_args()
+    # Start from the paths inferred from the repo layout; explicit flags win.
+    repo_dir = Path(args.repo_dir).resolve()
+    train_script, ckpt_path, config_path, tokenizer_dir = infer_repo_defaults(repo_dir)
+    if args.train_script:
+        train_script = Path(args.train_script).resolve()
+    if args.ckpt:
+        ckpt_path = Path(args.ckpt).resolve()
+    if args.config:
+        config_path = Path(args.config).resolve()
+    # Script and checkpoint are mandatory; a missing config only triggers a warning.
+    if not train_script.exists():
+        raise FileNotFoundError(f"Script train introuvable: {train_script}")
+    if not ckpt_path.exists():
+        raise FileNotFoundError(f"Checkpoint introuvable: {ckpt_path}")
+    if not config_path.exists():
+        print(f"[WARN] config.json introuvable: {config_path} — fallback sur GPTConfig(vocab_size=...).")
+    questions = load_questions(args.questions)
+    tester = AramixChatTester(
+        repo_dir=repo_dir,
+        train_script=train_script,
+        ckpt_path=ckpt_path,
+        config_path=config_path,
+        device=args.device,
+        prompt_style=args.prompt_style,
+    )
+    results: List[Dict[str, Any]] = []
+    categories: Dict[str, List[Dict[str, Any]]] = {}
+    print("\n" + "=" * 60)
+    print("TEST QA SIMPLE — MODÈLE DÉJÀ ENTRAÎNÉ")
+    print("=" * 60)
+    print(f"Repo        : {repo_dir}")
+    print(f"Train script: {train_script}")
+    print(f"Checkpoint  : {ckpt_path}")
+    print(f"Config      : {config_path}")
+    # tokenizer_dir is the inferred location, reported for information only;
+    # it is not passed to AramixChatTester in this function.
+    print(f"Tokenizer   : {tokenizer_dir}")
+    print(f"Device      : {tester.device}")
+    print(f"Prompt      : {args.prompt_style}")
+    print(f"Questions   : {len(questions)}")
+    print("=" * 60 + "\n")
+    # Per-question loop: generate, time, score, record, print.
+    for i, item in enumerate(questions, 1):
+        q = item["question"]
+        ref = item.get("reference")
+        cat = item.get("category", "Général")
+        t0 = time.time()
+        ans = tester.generate(
+            q,
+            max_new_tokens=args.max_new_tokens,
+            temperature=args.temperature,
+            top_k=args.top_k,
+            repetition_penalty=args.repetition_penalty,
+            max_answer_words=args.max_answer_words,
+        )
+        latency = time.time() - t0
+        # Each metric is None when the question has no reference answer.
+        overlap = lexical_overlap(ref, ans)
+        em = exact_match(ref, ans)
+        contains_ref = contains_reference(ref, ans)
+        entry = {
+            "id": i,
+            "category": cat,
+            "question": q,
+            "answer": ans,
+            "reference": ref,
+            "latency_s": round(latency, 3),
+            # Despite the key name this is a whitespace word count of the answer.
+            "tokens_generated_approx": len(ans.split()),
+            "overlap_score": None if overlap is None else round(overlap, 4),
+            "exact_match": em,
+            "contains_reference": contains_ref,
+        }
+        results.append(entry)
+        categories.setdefault(cat, []).append(entry)
+        overlap_str = f"{overlap:.0%}" if overlap is not None else "n/a"
+        em_str = "✓" if em else ("✗" if em is not None else "n/a")
+        contains_str = "✓" if contains_ref else ("✗" if contains_ref is not None else "n/a")
+        print(f"[{i:02d}] {cat}")
+        print(f"  User      : {q}")
+        print(f"  Assistant : {ans}")
+        if ref:
+            print(f"  Référence : {ref}")
+            print(f"  Overlap   : {overlap_str}")
+            print(f"  ExactMatch: {em_str}")
+            print(f"  Contains  : {contains_str}")
+        print(f"  Latence   : {latency:.3f}s\n")
+    # Aggregate over scored questions only (metrics that are not None).
+    scored_overlap = [r["overlap_score"] for r in results if r["overlap_score"] is not None]
+    scored_em = [r["exact_match"] for r in results if r["exact_match"] is not None]
+    scored_contains = [r["contains_reference"] for r in results if r["contains_reference"] is not None]
+    avg_overlap = sum(scored_overlap) / len(scored_overlap) if scored_overlap else 0.0
+    em_rate = sum(1 for x in scored_em if x) / len(scored_em) if scored_em else 0.0
+    contains_rate = sum(1 for x in scored_contains if x) / len(scored_contains) if scored_contains else 0.0
+    avg_latency = sum(r["latency_s"] for r in results) / len(results) if results else 0.0
+    avg_words = sum(r["tokens_generated_approx"] for r in results) / len(results) if results else 0.0
+    # Mean overlap per category; a category without any scored question gets 0.0.
+    cat_scores: Dict[str, float] = {}
+    for cat, items in categories.items():
+        vals = [r["overlap_score"] for r in items if r["overlap_score"] is not None]
+        cat_scores[cat] = (sum(vals) / len(vals)) if vals else 0.0
+    summary = {
+        "repo_dir": str(repo_dir),
+        "train_script": str(train_script),
+        "checkpoint": str(ckpt_path),
+        "config_path": str(config_path),
+        "tokenizer_dir": str(tokenizer_dir),
+        "device": str(tester.device),
+        "prompt_style": args.prompt_style,
+        "total_questions": len(results),
+        "avg_overlap_score": round(avg_overlap, 4),
+        "exact_match_rate": round(em_rate, 4),
+        "contains_reference_rate": round(contains_rate, 4),
+        "avg_latency_s": round(avg_latency, 3),
+        "avg_words_generated": round(avg_words, 1),
+        "scores_by_category": {k: round(v, 4) for k, v in cat_scores.items()},
+        "results": results,
+    }
+    print("=" * 60)
+    print("RÉSUMÉ")
+    print("=" * 60)
+    print(f"Questions testées : {len(results)}")
+    print(f"Overlap moyen     : {avg_overlap:.1%}")
+    print(f"Exact match       : {em_rate:.1%}")
+    print(f"Contains ref      : {contains_rate:.1%}")
+    print(f"Latence moyenne   : {avg_latency:.2f}s")
+    print(f"Mots moyens       : {avg_words:.1f}")
+    print("Scores / catégorie:")
+    for cat, score in sorted(cat_scores.items()):
+        print(f"  {cat:<15} {format_bar(score)} {score:.0%}")
+    print("=" * 60)
+    # Reports are written into the model directory itself.
+    if args.save_report:
+        json_path, txt_path = save_reports(repo_dir, summary)
+        print(f"Rapports sauvegardés dans : {json_path}")
+        print(f"                           {txt_path}")
+# Run the evaluation only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()

simple_qa_test_finished_model (1).py ADDED Viewed

	@@ -0,0 +1,309 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Test QA simple pour un modèle déjà entraîné.
+Fonctionne avec le script : train_chatbot_100m_large.py
+- charge la config depuis output_dir/train_config.json si disponible
+- charge un checkpoint fini (par défaut: output_dir/sft_best.pt)
+- pose une petite liste de questions QA
+- calcule un score simple d'overlap lexical
+- sauvegarde un rapport JSON + TXT
+Exemples
+--------
+python simple_qa_test_finished_model.py --output_dir ./fr_100m
+python simple_qa_test_finished_model.py --output_dir ./fr_100m --ckpt ./fr_100m/sft_final.pt
+python simple_qa_test_finished_model.py --output_dir ./fr_100m --questions qa_questions.json
+"""
+from __future__ import annotations
+import argparse
+import importlib.util
+import json
+import os
+import sys
+import time
+import unicodedata
+import re
+from pathlib import Path
+from typing import Dict, List, Optional
+DEFAULT_QUESTIONS = [
+    {
+        "category": "Géographie",
+        "question": "Quelle est la capitale de la France ?",
+        "reference": "Paris",
+    },
+    {
+        "category": "Géographie",
+        "question": "Quel est le plus long fleuve d'Afrique ?",
+        "reference": "Le Nil",
+    },
+    {
+        "category": "Science",
+        "question": "Qu'est-ce que la photosynthèse ?",
+        "reference": "Processus par lequel les plantes convertissent la lumière en énergie",
+    },
+    {
+        "category": "Science",
+        "question": "Combien d'os compte le corps humain adulte ?",
+        "reference": "206",
+    },
+    {
+        "category": "Histoire",
+        "question": "En quelle année a eu lieu la Révolution française ?",
+        "reference": "1789",
+    },
+    {
+        "category": "Histoire",
+        "question": "Qui a écrit Les Misérables ?",
+        "reference": "Victor Hugo",
+    },
+    {
+        "category": "Mathématiques",
+        "question": "Quelle est la formule de l'aire d'un cercle ?",
+        "reference": "π × r²",
+    },
+    {
+        "category": "Langage",
+        "question": "Donne un synonyme du mot heureux.",
+        "reference": "joyeux",
+    },
+    {
+        "category": "Raisonnement",
+        "question": "Si j'ai 5 pommes et j'en donne 2, combien m'en reste-t-il ?",
+        "reference": "3",
+    },
+    {
+        "category": "Dialogue",
+        "question": "Comment vas-tu aujourd'hui ?",
+        "reference": None,
+    },
+]
+def normalize_text(s: str) -> str:
+    s = (s or "").strip().lower()
+    s = unicodedata.normalize("NFKD", s)
+    s = "".join(ch for ch in s if not unicodedata.combining(ch))
+    s = re.sub(r"[^\w\s]", " ", s, flags=re.UNICODE)
+    s = re.sub(r"\s+", " ", s).strip()
+    return s
+def lexical_overlap(reference: Optional[str], answer: str) -> Optional[float]:
+    if not reference:
+        return None
+    ref_tokens = set(normalize_text(reference).split())
+    ans_tokens = set(normalize_text(answer).split())
+    if not ref_tokens:
+        return 0.0
+    return len(ref_tokens & ans_tokens) / len(ref_tokens)
+def exact_match(reference: Optional[str], answer: str) -> Optional[bool]:
+    if not reference:
+        return None
+    return normalize_text(reference) == normalize_text(answer)
+def import_train_module(train_script_path: str):
+    path = Path(train_script_path)
+    if not path.exists():
+        raise FileNotFoundError(f"Script d'entraînement introuvable: {path}")
+    spec = importlib.util.spec_from_file_location("train_module", str(path))
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Impossible de charger le module: {path}")
+    module = importlib.util.module_from_spec(spec)
+    sys.modules["train_module"] = module
+    spec.loader.exec_module(module)
+    return module
+def build_cfg(train_module, output_dir: str):
+    cfg_path = Path(output_dir) / "train_config.json"
+    if cfg_path.exists():
+        with open(cfg_path, "r", encoding="utf-8") as f:
+            saved = json.load(f)
+        cfg = train_module.TrainConfig(**saved)
+    else:
+        cfg = train_module.TrainConfig(output_dir=output_dir, tokenizer_prefix=f"{output_dir}/tokenizer")
+    cfg.output_dir = output_dir
+    cfg.tokenizer_prefix = f"{output_dir}/tokenizer"
+    return cfg
+def run_test(
+    train_script: str,
+    output_dir: str,
+    ckpt_path: str,
+    questions: List[Dict],
+    save_report: bool,
+):
+    """Ask every question to the trained chatbot, score the answers, report.
+    The training script is imported dynamically; its ``Chatbot`` class is built
+    from the resolved config and ``ckpt_path`` and queried through
+    ``bot.chat(question, context=...)``.
+    Args:
+        train_script: path of the training script providing ``TrainConfig``
+            and ``Chatbot``.
+        output_dir: model directory (config source and report destination).
+        ckpt_path: checkpoint handed to ``Chatbot``.
+        questions: items with a ``"question"`` key and optional
+            ``"reference"``, ``"category"`` and ``"context"`` keys.
+        save_report: when true, write JSON and TXT reports into ``output_dir``.
+    Returns:
+        The summary dictionary (aggregates, per-category scores, all results).
+    """
+    train_module = import_train_module(train_script)
+    cfg = build_cfg(train_module, output_dir)
+    bot = train_module.Chatbot(cfg, ckpt_path)
+    results = []
+    categories: Dict[str, List[Dict]] = {}
+    sep = "─" * 64
+    print(f"\n{'═'*64}")
+    print(" TEST QA SIMPLE — MODÈLE ENTRAÎNÉ")
+    print(f" Checkpoint : {ckpt_path}")
+    print(f" Questions  : {len(questions)}")
+    print(f"{'═'*64}\n")
+    # Per-question loop: query the bot, time it, score, record, print.
+    for i, item in enumerate(questions, 1):
+        q = item["question"]
+        ref = item.get("reference")
+        cat = item.get("category", "Général")
+        ctx = item.get("context", "")
+        t0 = time.time()
+        ans = bot.chat(q, context=ctx)
+        latency = time.time() - t0
+        # Both metrics are None when the question has no reference answer.
+        overlap = lexical_overlap(ref, ans)
+        em = exact_match(ref, ans)
+        row = {
+            "id": i,
+            "category": cat,
+            "question": q,
+            "context": ctx,
+            "reference": ref,
+            "answer": ans,
+            "overlap_score": overlap,
+            "exact_match": em,
+            "latency_s": round(latency, 3),
+            # Despite the key name this is a whitespace word count of the answer.
+            "tokens_generated_approx": len(ans.split()),
+        }
+        results.append(row)
+        categories.setdefault(cat, []).append(row)
+        score_text = []
+        if overlap is not None:
+            score_text.append(f"overlap={overlap:.0%}")
+        if em is not None:
+            score_text.append(f"EM={'oui' if em else 'non'}")
+        score_str = f" [{' | '.join(score_text)}]" if score_text else ""
+        print(sep)
+        print(f"[{i:02d}] [{cat}]{score_str}")
+        if ctx:
+            # Long contexts are shortened to 120 characters on screen only.
+            print(f"  Contexte  : {ctx[:120]}{'...' if len(ctx) > 120 else ''}")
+        print(f"  User      : {q}")
+        print(f"  Assistant : {ans}")
+        if ref:
+            print(f"  Référence : {ref}")
+        print(f"  ⏱  {latency:.2f}s | ~{row['tokens_generated_approx']} mots\n")
+    # Aggregate over scored questions only (metrics that are not None).
+    scored = [r for r in results if r["overlap_score"] is not None]
+    avg_overlap = sum(r["overlap_score"] for r in scored) / len(scored) if scored else 0.0
+    em_rows = [r for r in results if r["exact_match"] is not None]
+    em_rate = sum(1 for r in em_rows if r["exact_match"]) / len(em_rows) if em_rows else 0.0
+    avg_latency = sum(r["latency_s"] for r in results) / max(1, len(results))
+    avg_tokens = sum(r["tokens_generated_approx"] for r in results) / max(1, len(results))
+    # Per-category means; None marks a category without any scored question.
+    scores_by_category = {}
+    for cat, items in categories.items():
+        cat_scored = [x for x in items if x["overlap_score"] is not None]
+        cat_em = [x for x in items if x["exact_match"] is not None]
+        scores_by_category[cat] = {
+            "avg_overlap": round(sum(x["overlap_score"] for x in cat_scored) / len(cat_scored), 4) if cat_scored else None,
+            "exact_match_rate": round(sum(1 for x in cat_em if x["exact_match"]) / len(cat_em), 4) if cat_em else None,
+        }
+    summary = {
+        "checkpoint": ckpt_path,
+        "total_questions": len(results),
+        "avg_overlap_score": round(avg_overlap, 4),
+        "exact_match_rate": round(em_rate, 4),
+        "avg_latency_s": round(avg_latency, 3),
+        "avg_tokens_generated_approx": round(avg_tokens, 1),
+        "scores_by_category": scores_by_category,
+        "results": results,
+    }
+    print(f"{'═'*64}")
+    print(" RÉSUMÉ")
+    print(f"{'═'*64}")
+    print(f" Questions testées : {len(results)}")
+    print(f" Overlap moyen     : {avg_overlap:.1%}")
+    print(f" Exact match       : {em_rate:.1%}")
+    print(f" Latence moyenne   : {avg_latency:.2f}s")
+    print(f" Mots moyens       : {avg_tokens:.0f}")
+    print(" Scores / catégorie :")
+    for cat, sc in scores_by_category.items():
+        ov = sc["avg_overlap"]
+        emc = sc["exact_match_rate"]
+        print(f"   - {cat:<15} overlap={ov if ov is not None else 'n/a'} | EM={emc if emc is not None else 'n/a'}")
+    print(f"{'═'*64}\n")
+    # Reports go next to the model; output_dir is not created here.
+    if save_report:
+        report_json = Path(output_dir) / "qa_test_simple_report.json"
+        report_txt = Path(output_dir) / "qa_test_simple_report.txt"
+        with open(report_json, "w", encoding="utf-8") as f:
+            json.dump(summary, f, ensure_ascii=False, indent=2)
+        with open(report_txt, "w", encoding="utf-8") as f:
+            f.write("TEST QA SIMPLE — MODÈLE ENTRAÎNÉ\n")
+            f.write(f"Checkpoint : {ckpt_path}\n\n")
+            for r in results:
+                f.write(f"[{r['id']:02d}] {r['category']}\n")
+                if r["context"]:
+                    f.write(f"  Contexte  : {r['context']}\n")
+                f.write(f"  User      : {r['question']}\n")
+                f.write(f"  Assistant : {r['answer']}\n")
+                if r["reference"]:
+                    f.write(f"  Référence : {r['reference']}\n")
+                if r["overlap_score"] is not None:
+                    f.write(f"  Overlap   : {r['overlap_score']:.0%}\n")
+                if r["exact_match"] is not None:
+                    f.write(f"  EM        : {'oui' if r['exact_match'] else 'non'}\n")
+                f.write(f"  Latence   : {r['latency_s']}s\n\n")
+        print(f"Rapport JSON -> {report_json}")
+        print(f"Rapport TXT  -> {report_txt}")
+    return summary
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser("Test QA simple pour modèle déjà entraîné")
+    parser.add_argument("--train_script", type=str, default="./train_chatbot_100m_large.py")
+    parser.add_argument("--output_dir", type=str, default="./fr_100m")
+    parser.add_argument("--ckpt", type=str, default=None)
+    parser.add_argument("--questions", type=str, default=None, help="JSON optionnel [{question, reference, category, context?}]")
+    parser.add_argument("--no_save", action="store_true")
+    args = parser.parse_args()
+    ckpt_path = args.ckpt or os.path.join(args.output_dir, "sft_best.pt")
+    if not Path(ckpt_path).exists():
+        raise FileNotFoundError(
+            f"Checkpoint introuvable: {ckpt_path}\n"
+            f"Exemple: --ckpt {args.output_dir}/sft_final.pt"
+        )
+    if args.questions:
+        with open(args.questions, "r", encoding="utf-8") as f:
+            questions = json.load(f)
+    else:
+        questions = DEFAULT_QUESTIONS
+    run_test(
+        train_script=args.train_script,
+        output_dir=args.output_dir,
+        ckpt_path=ckpt_path,
+        questions=questions,
+        save_report=not args.no_save,
+    )

simple_qa_test_finished_model.py ADDED Viewed

	@@ -0,0 +1,309 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Test QA simple pour un modèle déjà entraîné.
+Fonctionne avec le script : train_chatbot_100m_large.py
+- charge la config depuis output_dir/train_config.json si disponible
+- charge un checkpoint fini (par défaut: output_dir/sft_best.pt)
+- pose une petite liste de questions QA
+- calcule un score simple d'overlap lexical
+- sauvegarde un rapport JSON + TXT
+Exemples
+--------
+python simple_qa_test_finished_model.py --output_dir ./fr_100m
+python simple_qa_test_finished_model.py --output_dir ./fr_100m --ckpt ./fr_100m/sft_final.pt
+python simple_qa_test_finished_model.py --output_dir ./fr_100m --questions qa_questions.json
+"""
+from __future__ import annotations
+import argparse
+import importlib.util
+import json
+import os
+import sys
+import time
+import unicodedata
+import re
+from pathlib import Path
+from typing import Dict, List, Optional
+DEFAULT_QUESTIONS = [
+    {
+        "category": "Géographie",
+        "question": "Quelle est la capitale de la France ?",
+        "reference": "Paris",
+    },
+    {
+        "category": "Géographie",
+        "question": "Quel est le plus long fleuve d'Afrique ?",
+        "reference": "Le Nil",
+    },
+    {
+        "category": "Science",
+        "question": "Qu'est-ce que la photosynthèse ?",
+        "reference": "Processus par lequel les plantes convertissent la lumière en énergie",
+    },
+    {
+        "category": "Science",
+        "question": "Combien d'os compte le corps humain adulte ?",
+        "reference": "206",
+    },
+    {
+        "category": "Histoire",
+        "question": "En quelle année a eu lieu la Révolution française ?",
+        "reference": "1789",
+    },
+    {
+        "category": "Histoire",
+        "question": "Qui a écrit Les Misérables ?",
+        "reference": "Victor Hugo",
+    },
+    {
+        "category": "Mathématiques",
+        "question": "Quelle est la formule de l'aire d'un cercle ?",
+        "reference": "π × r²",
+    },
+    {
+        "category": "Langage",
+        "question": "Donne un synonyme du mot heureux.",
+        "reference": "joyeux",
+    },
+    {
+        "category": "Raisonnement",
+        "question": "Si j'ai 5 pommes et j'en donne 2, combien m'en reste-t-il ?",
+        "reference": "3",
+    },
+    {
+        "category": "Dialogue",
+        "question": "Comment vas-tu aujourd'hui ?",
+        "reference": None,
+    },
+]
+def normalize_text(s: str) -> str:
+    s = (s or "").strip().lower()
+    s = unicodedata.normalize("NFKD", s)
+    s = "".join(ch for ch in s if not unicodedata.combining(ch))
+    s = re.sub(r"[^\w\s]", " ", s, flags=re.UNICODE)
+    s = re.sub(r"\s+", " ", s).strip()
+    return s
+def lexical_overlap(reference: Optional[str], answer: str) -> Optional[float]:
+    if not reference:
+        return None
+    ref_tokens = set(normalize_text(reference).split())
+    ans_tokens = set(normalize_text(answer).split())
+    if not ref_tokens:
+        return 0.0
+    return len(ref_tokens & ans_tokens) / len(ref_tokens)
+def exact_match(reference: Optional[str], answer: str) -> Optional[bool]:
+    if not reference:
+        return None
+    return normalize_text(reference) == normalize_text(answer)
+def import_train_module(train_script_path: str):
+    path = Path(train_script_path)
+    if not path.exists():
+        raise FileNotFoundError(f"Script d'entraînement introuvable: {path}")
+    spec = importlib.util.spec_from_file_location("train_module", str(path))
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Impossible de charger le module: {path}")
+    module = importlib.util.module_from_spec(spec)
+    sys.modules["train_module"] = module
+    spec.loader.exec_module(module)
+    return module
+def build_cfg(train_module, output_dir: str):
+    cfg_path = Path(output_dir) / "train_config.json"
+    if cfg_path.exists():
+        with open(cfg_path, "r", encoding="utf-8") as f:
+            saved = json.load(f)
+        cfg = train_module.TrainConfig(**saved)
+    else:
+        cfg = train_module.TrainConfig(output_dir=output_dir, tokenizer_prefix=f"{output_dir}/tokenizer")
+    cfg.output_dir = output_dir
+    cfg.tokenizer_prefix = f"{output_dir}/tokenizer"
+    return cfg
+def run_test(
+    train_script: str,
+    output_dir: str,
+    ckpt_path: str,
+    questions: List[Dict],
+    save_report: bool,
+):
+    """Run every question through the chatbot, print and optionally save a report.
+
+    Args:
+        train_script: path of the training script; it is imported and must
+            expose ``TrainConfig`` and ``Chatbot``.
+        output_dir: run directory used to build the config and to store reports.
+        ckpt_path: checkpoint handed to ``Chatbot(cfg, ckpt_path)``.
+        questions: items with a mandatory "question" key and optional
+            "reference", "category" (default "Général") and "context" keys.
+        save_report: when true, write ``qa_test_simple_report.json`` and
+            ``qa_test_simple_report.txt`` into ``output_dir``.
+
+    Returns:
+        The summary dict (aggregate scores, per-category scores and all rows).
+    """
+    train_module = import_train_module(train_script)
+    cfg = build_cfg(train_module, output_dir)
+    bot = train_module.Chatbot(cfg, ckpt_path)
+    results = []
+    categories: Dict[str, List[Dict]] = {}
+    sep = "─" * 64
+    print(f"\n{'═'*64}")
+    print(" TEST QA SIMPLE — MODÈLE ENTRAÎNÉ")
+    print(f" Checkpoint : {ckpt_path}")
+    print(f" Questions  : {len(questions)}")
+    print(f"{'═'*64}\n")
+    # --- Per-question loop: generate, score, record and print ---
+    for i, item in enumerate(questions, 1):
+        q = item["question"]
+        ref = item.get("reference")
+        cat = item.get("category", "Général")
+        ctx = item.get("context", "")
+        # Latency is the wall-clock time of the bot.chat call only.
+        t0 = time.time()
+        ans = bot.chat(q, context=ctx)
+        latency = time.time() - t0
+        # Both scores are None when the item has no reference.
+        overlap = lexical_overlap(ref, ans)
+        em = exact_match(ref, ans)
+        row = {
+            "id": i,
+            "category": cat,
+            "question": q,
+            "context": ctx,
+            "reference": ref,
+            "answer": ans,
+            "overlap_score": overlap,
+            "exact_match": em,
+            "latency_s": round(latency, 3),
+            # Whitespace-separated word count, not a tokenizer token count.
+            "tokens_generated_approx": len(ans.split()),
+        }
+        results.append(row)
+        categories.setdefault(cat, []).append(row)
+        score_text = []
+        if overlap is not None:
+            score_text.append(f"overlap={overlap:.0%}")
+        if em is not None:
+            score_text.append(f"EM={'oui' if em else 'non'}")
+        score_str = f" [{' | '.join(score_text)}]" if score_text else ""
+        print(sep)
+        print(f"[{i:02d}] [{cat}]{score_str}")
+        if ctx:
+            # Long contexts are truncated to 120 characters on screen only.
+            print(f"  Contexte  : {ctx[:120]}{'...' if len(ctx) > 120 else ''}")
+        print(f"  User      : {q}")
+        print(f"  Assistant : {ans}")
+        if ref:
+            print(f"  Référence : {ref}")
+        print(f"  ⏱  {latency:.2f}s | ~{row['tokens_generated_approx']} mots\n")
+    # --- Aggregates: unscored rows (no reference) are excluded from the means ---
+    scored = [r for r in results if r["overlap_score"] is not None]
+    avg_overlap = sum(r["overlap_score"] for r in scored) / len(scored) if scored else 0.0
+    em_rows = [r for r in results if r["exact_match"] is not None]
+    em_rate = sum(1 for r in em_rows if r["exact_match"]) / len(em_rows) if em_rows else 0.0
+    # max(1, ...) guards against an empty question list.
+    avg_latency = sum(r["latency_s"] for r in results) / max(1, len(results))
+    avg_tokens = sum(r["tokens_generated_approx"] for r in results) / max(1, len(results))
+    scores_by_category = {}
+    for cat, items in categories.items():
+        cat_scored = [x for x in items if x["overlap_score"] is not None]
+        cat_em = [x for x in items if x["exact_match"] is not None]
+        # A category without any scored item reports None rather than 0.
+        scores_by_category[cat] = {
+            "avg_overlap": round(sum(x["overlap_score"] for x in cat_scored) / len(cat_scored), 4) if cat_scored else None,
+            "exact_match_rate": round(sum(1 for x in cat_em if x["exact_match"]) / len(cat_em), 4) if cat_em else None,
+        }
+    summary = {
+        "checkpoint": ckpt_path,
+        "total_questions": len(results),
+        "avg_overlap_score": round(avg_overlap, 4),
+        "exact_match_rate": round(em_rate, 4),
+        "avg_latency_s": round(avg_latency, 3),
+        "avg_tokens_generated_approx": round(avg_tokens, 1),
+        "scores_by_category": scores_by_category,
+        "results": results,
+    }
+    # --- Console summary ---
+    print(f"{'═'*64}")
+    print(" RÉSUMÉ")
+    print(f"{'═'*64}")
+    print(f" Questions testées : {len(results)}")
+    print(f" Overlap moyen     : {avg_overlap:.1%}")
+    print(f" Exact match       : {em_rate:.1%}")
+    print(f" Latence moyenne   : {avg_latency:.2f}s")
+    print(f" Mots moyens       : {avg_tokens:.0f}")
+    print(" Scores / catégorie :")
+    for cat, sc in scores_by_category.items():
+        ov = sc["avg_overlap"]
+        emc = sc["exact_match_rate"]
+        print(f"   - {cat:<15} overlap={ov if ov is not None else 'n/a'} | EM={emc if emc is not None else 'n/a'}")
+    print(f"{'═'*64}\n")
+    # --- Optional report files: full summary as JSON plus a readable text file ---
+    if save_report:
+        report_json = Path(output_dir) / "qa_test_simple_report.json"
+        report_txt = Path(output_dir) / "qa_test_simple_report.txt"
+        with open(report_json, "w", encoding="utf-8") as f:
+            # ensure_ascii=False keeps accented characters readable in the file.
+            json.dump(summary, f, ensure_ascii=False, indent=2)
+        with open(report_txt, "w", encoding="utf-8") as f:
+            f.write("TEST QA SIMPLE — MODÈLE ENTRAÎNÉ\n")
+            f.write(f"Checkpoint : {ckpt_path}\n\n")
+            for r in results:
+                f.write(f"[{r['id']:02d}] {r['category']}\n")
+                if r["context"]:
+                    f.write(f"  Contexte  : {r['context']}\n")
+                f.write(f"  User      : {r['question']}\n")
+                f.write(f"  Assistant : {r['answer']}\n")
+                if r["reference"]:
+                    f.write(f"  Référence : {r['reference']}\n")
+                if r["overlap_score"] is not None:
+                    f.write(f"  Overlap   : {r['overlap_score']:.0%}\n")
+                if r["exact_match"] is not None:
+                    f.write(f"  EM        : {'oui' if r['exact_match'] else 'non'}\n")
+                f.write(f"  Latence   : {r['latency_s']}s\n\n")
+        print(f"Rapport JSON -> {report_json}")
+        print(f"Rapport TXT  -> {report_txt}")
+    return summary
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser("Test QA simple pour modèle déjà entraîné")
+    parser.add_argument("--train_script", type=str, default="./train_chatbot_100m_large.py")
+    parser.add_argument("--output_dir", type=str, default="./fr_100m")
+    parser.add_argument("--ckpt", type=str, default=None)
+    parser.add_argument("--questions", type=str, default=None, help="JSON optionnel [{question, reference, category, context?}]")
+    parser.add_argument("--no_save", action="store_true")
+    args = parser.parse_args()
+    ckpt_path = args.ckpt or os.path.join(args.output_dir, "sft_best.pt")
+    if not Path(ckpt_path).exists():
+        raise FileNotFoundError(
+            f"Checkpoint introuvable: {ckpt_path}\n"
+            f"Exemple: --ckpt {args.output_dir}/sft_final.pt"
+        )
+    if args.questions:
+        with open(args.questions, "r", encoding="utf-8") as f:
+            questions = json.load(f)
+    else:
+        questions = DEFAULT_QUESTIONS
+    run_test(
+        train_script=args.train_script,
+        output_dir=args.output_dir,
+        ckpt_path=ckpt_path,
+        questions=questions,
+        save_report=not args.no_save,
+    )

test.py ADDED Viewed

	@@ -0,0 +1,428 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+from __future__ import annotations
+import argparse
+import json
+from collections import OrderedDict
+from contextlib import nullcontext
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PreTrainedTokenizerFast
+# ============================================================
+# Default paths
+# ============================================================
+# MODEL_DIR is the base directory for the three defaults below; each of them
+# can be overridden on the command line (--checkpoint, --config,
+# --tokenizer_dir).
+MODEL_DIR = Path("./nlp_1b_wiki_en_fr_ar")
+DEFAULT_CHECKPOINT = MODEL_DIR / "model_best.pt"
+DEFAULT_CONFIG = MODEL_DIR / "config.json"
+DEFAULT_TOKENIZER_DIR = MODEL_DIR / "tokenizer_32k"
+# ============================================================
+# Utils
+# ============================================================
+def get_device() -> torch.device:
+    if torch.cuda.is_available():
+        return torch.device(f"cuda:{torch.cuda.current_device()}")
+    return torch.device("cpu")
+def autocast_context(device: torch.device):
+    if device.type == "cuda":
+        return torch.autocast("cuda", dtype=torch.bfloat16)
+    return nullcontext()
+def normalize_state_dict_keys(state_dict: dict) -> OrderedDict:
+    normalized = OrderedDict()
+    for k, v in state_dict.items():
+        nk = k
+        if nk.startswith("module._orig_mod."):
+            nk = nk[len("module._orig_mod."):]
+        elif nk.startswith("_orig_mod."):
+            nk = nk[len("_orig_mod."):]
+        elif nk.startswith("module."):
+            nk = nk[len("module."):]
+        normalized[nk] = v
+    return normalized
+def postprocess_text(text: str) -> str:
+    return text.strip()
+# ============================================================
+# Architecture
+# ============================================================
+@dataclass
+class GPTConfig:
+    # Model hyper-parameters; load_model_and_tokenizer() builds this from the
+    # keys of config.json via GPTConfig(**cfg_dict).
+    # Size of the token embedding table and of the output projection.
+    vocab_size: int
+    # Number of most recent tokens fed to the model at each generation step.
+    block_size: int
+    # Width of the residual stream.
+    d_model: int
+    # Attention heads; d_model must be divisible by it (asserted in attention).
+    n_heads: int
+    # Number of stacked transformer blocks.
+    n_layers: int
+    # Hidden width of the SwiGLU feed-forward.
+    d_ff: int
+    # NOTE(review): dropout and use_checkpointing are not read by the code
+    # visible in this script; presumably kept so training configs load as-is.
+    dropout: float = 0.0
+    use_checkpointing: bool = False
+class RMSNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(dim))
+        self.eps = eps
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.weight * x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+class RotaryEmbedding(nn.Module):
+    def __init__(self, dim: int, base: int = 10000, max_seq: int = 4096):
+        super().__init__()
+        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        t = torch.arange(max_seq).float()
+        freqs = torch.outer(t, inv_freq)
+        self.register_buffer(
+            "cos_cache",
+            torch.repeat_interleave(freqs.cos(), 2, dim=-1),
+            persistent=False,
+        )
+        self.register_buffer(
+            "sin_cache",
+            torch.repeat_interleave(freqs.sin(), 2, dim=-1),
+            persistent=False,
+        )
+    def forward(self, seq_len: int, dtype: torch.dtype):
+        return self.cos_cache[:seq_len].to(dtype), self.sin_cache[:seq_len].to(dtype)
+def rotate_half(x: torch.Tensor) -> torch.Tensor:
+    x1, x2 = x[..., ::2], x[..., 1::2]
+    return torch.stack((-x2, x1), dim=-1).flatten(-2)
+def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
+    cos = cos.unsqueeze(0).unsqueeze(0)
+    sin = sin.unsqueeze(0).unsqueeze(0)
+    return x * cos + rotate_half(x) * sin
+class CausalSelfAttention(nn.Module):
+    def __init__(self, cfg: GPTConfig):
+        super().__init__()
+        assert cfg.d_model % cfg.n_heads == 0
+        self.n_heads = cfg.n_heads
+        self.head_dim = cfg.d_model // cfg.n_heads
+        self.qkv = nn.Linear(cfg.d_model, 3 * cfg.d_model, bias=False)
+        self.proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
+        self.rope = RotaryEmbedding(self.head_dim)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        b, t, c = x.shape
+        q, k, v = self.qkv(x).split(c, dim=-1)
+        q = q.view(b, t, self.n_heads, self.head_dim).transpose(1, 2)
+        k = k.view(b, t, self.n_heads, self.head_dim).transpose(1, 2)
+        v = v.view(b, t, self.n_heads, self.head_dim).transpose(1, 2)
+        cos, sin = self.rope(t, x.dtype)
+        q = apply_rope(q, cos, sin)
+        k = apply_rope(k, cos, sin)
+        y = F.scaled_dot_product_attention(
+            q, k, v,
+            dropout_p=0.0,
+            is_causal=True,
+        )
+        y = y.transpose(1, 2).contiguous().view(b, t, c)
+        return self.proj(y)
+class SwiGLU(nn.Module):
+    def __init__(self, cfg: GPTConfig):
+        super().__init__()
+        self.w1 = nn.Linear(cfg.d_model, cfg.d_ff, bias=False)
+        self.w2 = nn.Linear(cfg.d_model, cfg.d_ff, bias=False)
+        self.w3 = nn.Linear(cfg.d_ff, cfg.d_model, bias=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.w3(F.silu(self.w1(x)) * self.w2(x))
+class Block(nn.Module):
+    def __init__(self, cfg: GPTConfig):
+        super().__init__()
+        self.ln1 = RMSNorm(cfg.d_model)
+        self.attn = CausalSelfAttention(cfg)
+        self.ln2 = RMSNorm(cfg.d_model)
+        self.ff = SwiGLU(cfg)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x + self.attn(self.ln1(x))
+        x = x + self.ff(self.ln2(x))
+        return x
+class GPT(nn.Module):
+    def __init__(self, cfg: GPTConfig):
+        super().__init__()
+        self.cfg = cfg
+        self.tok_emb = nn.Embedding(cfg.vocab_size, cfg.d_model)
+        self.blocks = nn.ModuleList([Block(cfg) for _ in range(cfg.n_layers)])
+        self.ln_f = RMSNorm(cfg.d_model)
+        self.lm_head = nn.Linear(cfg.d_model, cfg.vocab_size, bias=False)
+        self.lm_head.weight = self.tok_emb.weight
+    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
+        x = self.tok_emb(input_ids)
+        for block in self.blocks:
+            x = block(x)
+        return self.lm_head(self.ln_f(x))
+    @torch.inference_mode()
+    def generate(
+        self,
+        input_ids: torch.Tensor,
+        max_new_tokens: int = 160,
+        temperature: float = 0.8,
+        top_k: int = 50,
+        top_p: float = 0.95,
+        repetition_penalty: float = 1.05,
+        eos_token_id: Optional[int] = None,
+    ) -> torch.Tensor:
+        self.eval()
+        for _ in range(max_new_tokens):
+            idx_cond = input_ids[:, -self.cfg.block_size :]
+            logits = self(idx_cond)
+            logits = logits[:, -1, :]
+            if repetition_penalty != 1.0:
+                for b in range(input_ids.size(0)):
+                    seen = torch.unique(input_ids[b])
+                    logits[b, seen] /= repetition_penalty
+            if temperature <= 0:
+                next_token = torch.argmax(logits, dim=-1, keepdim=True)
+            else:
+                logits = logits / temperature
+                if top_k > 0:
+                    v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                    logits[logits < v[:, [-1]]] = -float("inf")
+                if 0 < top_p < 1.0:
+                    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+                    probs = F.softmax(sorted_logits, dim=-1)
+                    cumulative_probs = torch.cumsum(probs, dim=-1)
+                    sorted_mask = cumulative_probs > top_p
+                    sorted_mask[..., 1:] = sorted_mask[..., :-1].clone()
+                    sorted_mask[..., 0] = False
+                    mask = torch.zeros_like(logits, dtype=torch.bool)
+                    mask.scatter_(1, sorted_indices, sorted_mask)
+                    logits = logits.masked_fill(mask, -float("inf"))
+                probs = F.softmax(logits, dim=-1)
+                next_token = torch.multinomial(probs, num_samples=1)
+            input_ids = torch.cat([input_ids, next_token], dim=1)
+            if eos_token_id is not None and (next_token == eos_token_id).all():
+                break
+        return input_ids
+# ============================================================
+# Load / generate
+# ============================================================
+def load_model_and_tokenizer(
+    checkpoint_path: Path,
+    config_path: Path,
+    tokenizer_dir: Path,
+    device: torch.device,
+    use_compile: bool = False,
+):
+    if not checkpoint_path.exists():
+        raise FileNotFoundError(f"Checkpoint introuvable: {checkpoint_path}")
+    if not config_path.exists():
+        raise FileNotFoundError(f"Config introuvable: {config_path}")
+    if not tokenizer_dir.exists():
+        raise FileNotFoundError(f"Tokenizer introuvable: {tokenizer_dir}")
+    cfg_dict = json.loads(config_path.read_text(encoding="utf-8"))
+    cfg = GPTConfig(**cfg_dict)
+    tokenizer = PreTrainedTokenizerFast.from_pretrained(str(tokenizer_dir))
+    model = GPT(cfg).to(device)
+    ckpt = torch.load(checkpoint_path, map_location=device)
+    state_dict = normalize_state_dict_keys(ckpt["model"])
+    model.load_state_dict(state_dict, strict=True)
+    model.eval()
+    if use_compile and hasattr(torch, "compile"):
+        model = torch.compile(model, mode="default")
+    return model, tokenizer, ckpt
+def generate_text(
+    model: GPT,
+    tokenizer: PreTrainedTokenizerFast,
+    prompt: str,
+    device: torch.device,
+    max_new_tokens: int,
+    temperature: float,
+    top_k: int,
+    top_p: float,
+    repetition_penalty: float,
+) -> str:
+    encoded = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
+    input_ids = encoded["input_ids"].to(device)
+    if tokenizer.bos_token_id is not None:
+        bos = torch.tensor([[tokenizer.bos_token_id]], device=device, dtype=input_ids.dtype)
+        input_ids = torch.cat([bos, input_ids], dim=1)
+    prompt_len = input_ids.shape[1]
+    with autocast_context(device):
+        output_ids = model.generate(
+            input_ids=input_ids,
+            max_new_tokens=max_new_tokens,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            repetition_penalty=repetition_penalty,
+            eos_token_id=tokenizer.eos_token_id,
+        )
+    generated_ids = output_ids[0][prompt_len:]
+    text = tokenizer.decode(generated_ids, skip_special_tokens=True)
+    return postprocess_text(text)
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--checkpoint", type=str, default=str(DEFAULT_CHECKPOINT))
+    parser.add_argument("--config", type=str, default=str(DEFAULT_CONFIG))
+    parser.add_argument("--tokenizer_dir", type=str, default=str(DEFAULT_TOKENIZER_DIR))
+    parser.add_argument("--prompt", type=str, default="Wikipedia is a free online encyclopedia")
+    parser.add_argument("--max_new_tokens", type=int, default=160)
+    parser.add_argument("--temperature", type=float, default=0.8)
+    parser.add_argument("--top_k", type=int, default=50)
+    parser.add_argument("--top_p", type=float, default=0.95)
+    parser.add_argument("--repetition_penalty", type=float, default=1.05)
+    parser.add_argument("--interactive", action="store_true")
+    parser.add_argument("--show_examples", action="store_true")
+    parser.add_argument("--compile", action="store_true")
+    args = parser.parse_args()
+    device = get_device()
+    if device.type == "cuda":
+        torch.backends.cuda.matmul.allow_tf32 = True
+        torch.backends.cudnn.allow_tf32 = True
+        torch.set_float32_matmul_precision("high")
+    model, tokenizer, ckpt = load_model_and_tokenizer(
+        checkpoint_path=Path(args.checkpoint),
+        config_path=Path(args.config),
+        tokenizer_dir=Path(args.tokenizer_dir),
+        device=device,
+        use_compile=args.compile,
+    )
+    print(f"Device: {device}")
+    print(f"Checkpoint: {args.checkpoint}")
+    print(f"epoch={ckpt.get('epoch', 'N/A')} | step={ckpt.get('step', 'N/A')} | best_loss={ckpt.get('best_loss', 'N/A')}")
+    if args.show_examples:
+        examples = [
+            "Wikipedia is a free online encyclopedia",
+            "La France est un pays d'Europe",
+            "الزراعة من أهم القطاعات الاقتصادية",
+            "Machine learning is a field of artificial intelligence",
+        ]
+        for ex in examples:
+            print("\n--- Prompt ---")
+            print(ex)
+            print("\n--- Output ---")
+            print(
+                generate_text(
+                    model=model,
+                    tokenizer=tokenizer,
+                    prompt=ex,
+                    device=device,
+                    max_new_tokens=args.max_new_tokens,
+                    temperature=args.temperature,
+                    top_k=args.top_k,
+                    top_p=args.top_p,
+                    repetition_penalty=args.repetition_penalty,
+                )
+            )
+        return
+    if args.interactive:
+        print("Mode interactif. Tape 'exit' pour quitter.\n")
+        while True:
+            prompt = input("Prompt> ").strip()
+            if prompt.lower() in {"exit", "quit"}:
+                break
+            if not prompt:
+                continue
+            print("\n=== Output ===")
+            print(
+                generate_text(
+                    model=model,
+                    tokenizer=tokenizer,
+                    prompt=prompt,
+                    device=device,
+                    max_new_tokens=args.max_new_tokens,
+                    temperature=args.temperature,
+                    top_k=args.top_k,
+                    top_p=args.top_p,
+                    repetition_penalty=args.repetition_penalty,
+                )
+            )
+            print()
+        return
+    print(
+        generate_text(
+            model=model,
+            tokenizer=tokenizer,
+            prompt=args.prompt,
+            device=device,
+            max_new_tokens=args.max_new_tokens,
+            temperature=args.temperature,
+            top_k=args.top_k,
+            top_p=args.top_p,
+            repetition_penalty=args.repetition_penalty,
+        )
+    )
+if __name__ == "__main__":
+    main()

top_p ADDED Viewed

File without changes

train.py ADDED Viewed

	@@ -0,0 +1,859 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+from __future__ import annotations
+import json
+import math
+import os
+import random
+import time
+from collections import OrderedDict
+from contextlib import nullcontext
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Iterator, Optional
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.parallel import DistributedDataParallel as DDP
+from datasets import load_dataset
+from transformers import PreTrainedTokenizerFast
+# ============================================================
+# Base model / tokenizer / config
+# ============================================================
+# Inputs come from earlier run directories; outputs of this run go to OUT_DIR.
+BASE_CHECKPOINT = Path("./wikipedia_ar_h100_codealpaca/model_best.pt")
+BASE_TOKENIZER_DIR = Path("./wikipedia_ar_h100/tokenizer_32k")
+BASE_CONFIG_FILE = Path("./wikipedia_ar_h100/config.json")
+OUT_DIR = Path("./wikipedia_ar_h100_multicode_10x2000")
+# Side effect at import time: the output directory is created if missing.
+OUT_DIR.mkdir(parents=True, exist_ok=True)
+MODEL_FILE = OUT_DIR / "model.pt"
+BEST_MODEL_FILE = OUT_DIR / "model_best.pt"
+STATE_FILE = OUT_DIR / "train_state.pt"
+CONFIG_FILE = OUT_DIR / "config.json"
+# ============================================================
+# Datasets
+# ============================================================
+# Each spec is consumed by load_one_dataset ("name" -> path, "subset" -> name,
+# "split", "streaming"), by format_record ("kind") and by MixedTextSource
+# ("weight", the relative sampling weight of the source).
+TRAIN_SOURCES = [
+    {
+        "name": "HuggingFaceH4/CodeAlpaca_20K",
+        "subset": None,
+        "split": "train",
+        "kind": "codealpaca",
+        "weight": 0.45,
+        "streaming": False,
+    },
+    {
+        "name": "open-r1/codeforces",
+        "subset": "verifiable-prompts",
+        "split": "train",
+        "kind": "codeforces_python",
+        "weight": 0.35,
+        "streaming": False,
+    },
+    {
+        "name": "wikimedia/wikipedia",
+        "subset": "20231101.ar",
+        "split": "train",
+        "kind": "wikipedia_ar",
+        "weight": 0.20,
+        "streaming": True,
+    },
+]
+# Evaluation spec; it has no "weight" key.
+EVAL_SOURCE = {
+    "name": "HuggingFaceH4/CodeAlpaca_20K",
+    "subset": None,
+    "split": "test",
+    "kind": "codealpaca",
+    "streaming": False,
+}
+# format_record drops codeforces rows whose "language" differs from this value.
+CODEFORCES_LANGUAGE = "python"
+# ============================================================
+# Hyperparameters
+# ============================================================
+# NOTE(review): most of these are consumed by code outside this excerpt;
+# only TEXT_CHAR_LIMIT and DTYPE are used by the helpers visible here.
+SEED = 42
+TARGET_VRAM_GIB = 75.0
+LEARNING_RATE = 5e-5
+MIN_LR = 5e-6
+WEIGHT_DECAY = 0.1
+WARMUP_STEPS = 200
+NUM_ROUNDS = 10
+STEPS_PER_ROUND = 2000
+MAX_STEPS = NUM_ROUNDS * STEPS_PER_ROUND   # 20000
+BATCH_SIZE = 24
+GRAD_ACCUM_STEPS = 1
+MAX_GRAD_NORM = 1.0
+EVAL_EVERY = 250
+SAVE_EVERY = 500
+MAX_EVAL_EXAMPLES = 2000
+# example_text_iter truncates every formatted text to this many characters.
+TEXT_CHAR_LIMIT = 6000
+# Autocast dtype used by autocast_context on CUDA devices.
+DTYPE = torch.bfloat16
+USE_COMPILE = True
+COMPILE_MODE = "default"
+USE_CHECKPOINTING = False
+TRAIN_NUM_WORKERS = 0
+EVAL_NUM_WORKERS = 0
+# ============================================================
+# Helpers
+# ============================================================
+def is_distributed() -> bool:
+    return dist.is_available() and dist.is_initialized()
+def get_rank() -> int:
+    return dist.get_rank() if is_distributed() else 0
+def get_world_size() -> int:
+    return dist.get_world_size() if is_distributed() else 1
+def is_main() -> bool:
+    return get_rank() == 0
+def init_distributed() -> Optional[torch.device]:
+    local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if local_rank == -1:
+        return None
+    dist.init_process_group("nccl")
+    torch.cuda.set_device(local_rank)
+    return torch.device(f"cuda:{local_rank}")
+def set_seed(seed: int) -> None:
+    random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+def get_device(ddp_device: Optional[torch.device] = None) -> torch.device:
+    if ddp_device is not None:
+        return ddp_device
+    if torch.cuda.is_available():
+        return torch.device(f"cuda:{torch.cuda.current_device()}")
+    return torch.device("cpu")
+def current_cuda_index(device: torch.device) -> int:
+    if device.type != "cuda":
+        raise ValueError("Device non CUDA")
+    return device.index if device.index is not None else torch.cuda.current_device()
+def autocast_context(device: torch.device):
+    if device.type == "cuda":
+        return torch.autocast("cuda", dtype=DTYPE)
+    return nullcontext()
+def unwrap_model(model: nn.Module) -> nn.Module:
+    m = model.module if isinstance(model, DDP) else model
+    if hasattr(m, "_orig_mod"):
+        return m._orig_mod
+    return m
+def count_parameters(model: nn.Module) -> int:
+    return sum(p.numel() for p in model.parameters() if p.requires_grad)
+def normalize_state_dict_keys(state_dict: dict) -> OrderedDict:
+    normalized = OrderedDict()
+    for k, v in state_dict.items():
+        nk = k
+        if nk.startswith("module._orig_mod."):
+            nk = nk[len("module._orig_mod."):]
+        elif nk.startswith("_orig_mod."):
+            nk = nk[len("_orig_mod."):]
+        elif nk.startswith("module."):
+            nk = nk[len("module."):]
+        normalized[nk] = v
+    return normalized
+def normalize_text(text: str) -> str:
+    return " ".join(text.strip().split())
+# ============================================================
+# Dataset loading / formatting
+# ============================================================
+def load_one_dataset(spec: dict):
+    kwargs = {
+        "path": spec["name"],
+        "split": spec["split"],
+        "streaming": spec["streaming"],
+    }
+    if spec["subset"] is not None:
+        kwargs["name"] = spec["subset"]
+    return load_dataset(**kwargs)
+def format_record(row: dict, kind: str) -> str:
+    if kind == "codealpaca":
+        prompt = row.get("prompt", "")
+        completion = row.get("completion", "")
+        if not isinstance(prompt, str):
+            prompt = str(prompt)
+        if not isinstance(completion, str):
+            completion = str(completion)
+        text = (
+            "### Instruction\n"
+            f"{prompt.strip()}\n\n"
+            "### Response\n"
+            f"{completion.strip()}"
+        )
+        return normalize_text(text)
+    if kind == "codeforces_python":
+        language = row.get("language", "")
+        if language != CODEFORCES_LANGUAGE:
+            return ""
+        prompt = row.get("prompt", "")
+        title = row.get("title", "")
+        if not isinstance(prompt, str):
+            prompt = str(prompt)
+        if not isinstance(title, str):
+            title = str(title)
+        text = (
+            f"### Competitive Programming Problem ({language})\n"
+            f"{title.strip()}\n\n"
+            f"{prompt.strip()}"
+        )
+        return normalize_text(text)
+    if kind == "wikipedia_ar":
+        text = row.get("text", "")
+        if not isinstance(text, str):
+            text = str(text)
+        return normalize_text(text)
+    return ""
+def example_text_iter(spec: dict, max_examples: Optional[int] = None) -> Iterator[str]:
+    ds = load_one_dataset(spec)
+    n = 0
+    for row in ds:
+        text = format_record(row, spec["kind"])
+        if not text or len(text) < 20:
+            continue
+        if TEXT_CHAR_LIMIT is not None:
+            text = text[:TEXT_CHAR_LIMIT]
+        yield text
+        n += 1
+        if max_examples is not None and n >= max_examples:
+            break
+class MixedTextSource:
+    def __init__(self, specs: list[dict]):
+        self.specs = specs
+        self.weights = [s["weight"] for s in specs]
+        self.streams = [example_text_iter(s) for s in specs]
+    def next_text(self) -> str:
+        while True:
+            idx = random.choices(range(len(self.specs)), weights=self.weights, k=1)[0]
+            try:
+                return next(self.streams[idx])
+            except StopIteration:
+                self.streams[idx] = example_text_iter(self.specs[idx])
+def packed_block_stream_mixed(
+    tokenizer: PreTrainedTokenizerFast,
+    specs: list[dict],
+    block_size: int,
+) -> Iterator[list[int]]:
+    bos, eos = tokenizer.bos_token_id, tokenizer.eos_token_id
+    buffer: list[int] = []
+    source = MixedTextSource(specs)
+    while True:
+        text = source.next_text()
+        ids = tokenizer.encode(text, add_special_tokens=False)
+        if not ids:
+            continue
+        buffer.extend([bos] + ids + [eos])
+        while len(buffer) >= block_size + 1:
+            yield buffer[: block_size + 1]
+            buffer = buffer[block_size + 1:]
+class PackedMixedBlocks(torch.utils.data.IterableDataset):
+    def __init__(self, tokenizer, specs, block_size):
+        super().__init__()
+        self.tokenizer = tokenizer
+        self.specs = specs
+        self.block_size = block_size
+    def __iter__(self):
+        worker = torch.utils.data.get_worker_info()
+        rank = get_rank()
+        world_size = get_world_size()
+        if worker is None:
+            shard_mod = world_size
+            shard_id = rank
+        else:
+            shard_mod = worker.num_workers * world_size
+            shard_id = rank * worker.num_workers + worker.id
+        for idx, chunk in enumerate(
+            packed_block_stream_mixed(
+                tokenizer=self.tokenizer,
+                specs=self.specs,
+                block_size=self.block_size,
+            )
+        ):
+            if idx % shard_mod != shard_id:
+                continue
+            yield {
+                "input_ids": torch.tensor(chunk[:-1], dtype=torch.long),
+                "labels": torch.tensor(chunk[1:], dtype=torch.long),
+            }
+class PackedEvalBlocks(torch.utils.data.IterableDataset):
+    def __init__(self, tokenizer, spec, block_size, max_examples):
+        super().__init__()
+        self.tokenizer = tokenizer
+        self.spec = spec
+        self.block_size = block_size
+        self.max_examples = max_examples
+    def __iter__(self):
+        worker = torch.utils.data.get_worker_info()
+        rank = get_rank()
+        world_size = get_world_size()
+        if worker is None:
+            shard_mod = world_size
+            shard_id = rank
+        else:
+            shard_mod = worker.num_workers * world_size
+            shard_id = rank * worker.num_workers + worker.id
+        bos, eos = self.tokenizer.bos_token_id, self.tokenizer.eos_token_id
+        buffer: list[int] = []
+        for ex_idx, text in enumerate(example_text_iter(self.spec, max_examples=self.max_examples)):
+            if ex_idx % shard_mod != shard_id:
+                continue
+            ids = self.tokenizer.encode(text, add_special_tokens=False)
+            if not ids:
+                continue
+            buffer.extend([bos] + ids + [eos])
+            while len(buffer) >= self.block_size + 1:
+                chunk = buffer[: self.block_size + 1]
+                buffer = buffer[self.block_size + 1:]
+                yield {
+                    "input_ids": torch.tensor(chunk[:-1], dtype=torch.long),
+                    "labels": torch.tensor(chunk[1:], dtype=torch.long),
+                }
+# ============================================================
+# Architecture
+# ============================================================
+@dataclass
+class GPTConfig:
+    """Hyper-parameters describing the GPT model built in this file."""
+    vocab_size: int  # rows of the token embedding / output size of lm_head
+    block_size: int  # window length used when packing training and eval blocks
+    d_model: int  # hidden width; must be divisible by n_heads
+    n_heads: int  # number of attention heads
+    n_layers: int  # number of transformer blocks
+    d_ff: int  # inner width of the SwiGLU feed-forward
+    dropout: float = 0.0  # attention dropout probability, applied in training only
+    use_checkpointing: bool = False  # checkpoint each block's activations while training
+class RMSNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(dim))
+        self.eps = eps
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.weight * x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+class RotaryEmbedding(nn.Module):
+    def __init__(self, dim: int, base: int = 10000, max_seq: int = 4096):
+        super().__init__()
+        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        t = torch.arange(max_seq).float()
+        freqs = torch.outer(t, inv_freq)
+        self.register_buffer("cos_cache", torch.repeat_interleave(freqs.cos(), 2, dim=-1), persistent=False)
+        self.register_buffer("sin_cache", torch.repeat_interleave(freqs.sin(), 2, dim=-1), persistent=False)
+    def forward(self, seq_len: int, dtype: torch.dtype):
+        return self.cos_cache[:seq_len].to(dtype), self.sin_cache[:seq_len].to(dtype)
+def rotate_half(x: torch.Tensor) -> torch.Tensor:
+    x1, x2 = x[..., ::2], x[..., 1::2]
+    return torch.stack((-x2, x1), dim=-1).flatten(-2)
+def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
+    cos = cos.unsqueeze(0).unsqueeze(0)
+    sin = sin.unsqueeze(0).unsqueeze(0)
+    return x * cos + rotate_half(x) * sin
+class CausalSelfAttention(nn.Module):
+    def __init__(self, cfg: GPTConfig):
+        super().__init__()
+        assert cfg.d_model % cfg.n_heads == 0
+        self.n_heads = cfg.n_heads
+        self.head_dim = cfg.d_model // cfg.n_heads
+        self.qkv = nn.Linear(cfg.d_model, 3 * cfg.d_model, bias=False)
+        self.proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
+        self.dropout_p = cfg.dropout
+        self.rope = RotaryEmbedding(self.head_dim)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        b, t, c = x.shape
+        q, k, v = self.qkv(x).split(c, dim=-1)
+        q = q.view(b, t, self.n_heads, self.head_dim).transpose(1, 2)
+        k = k.view(b, t, self.n_heads, self.head_dim).transpose(1, 2)
+        v = v.view(b, t, self.n_heads, self.head_dim).transpose(1, 2)
+        cos, sin = self.rope(t, x.dtype)
+        q = apply_rope(q, cos, sin)
+        k = apply_rope(k, cos, sin)
+        y = F.scaled_dot_product_attention(
+            q, k, v,
+            dropout_p=self.dropout_p if self.training else 0.0,
+            is_causal=True,
+        )
+        y = y.transpose(1, 2).contiguous().view(b, t, c)
+        return self.proj(y)
+class SwiGLU(nn.Module):
+    def __init__(self, cfg: GPTConfig):
+        super().__init__()
+        self.w1 = nn.Linear(cfg.d_model, cfg.d_ff, bias=False)
+        self.w2 = nn.Linear(cfg.d_model, cfg.d_ff, bias=False)
+        self.w3 = nn.Linear(cfg.d_ff, cfg.d_model, bias=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.w3(F.silu(self.w1(x)) * self.w2(x))
+class Block(nn.Module):
+    def __init__(self, cfg: GPTConfig):
+        super().__init__()
+        self.ln1 = RMSNorm(cfg.d_model)
+        self.attn = CausalSelfAttention(cfg)
+        self.ln2 = RMSNorm(cfg.d_model)
+        self.ff = SwiGLU(cfg)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x + self.attn(self.ln1(x))
+        x = x + self.ff(self.ln2(x))
+        return x
+class GPT(nn.Module):
+    def __init__(self, cfg: GPTConfig):
+        super().__init__()
+        self.cfg = cfg
+        self.tok_emb = nn.Embedding(cfg.vocab_size, cfg.d_model)
+        self.blocks = nn.ModuleList([Block(cfg) for _ in range(cfg.n_layers)])
+        self.ln_f = RMSNorm(cfg.d_model)
+        self.lm_head = nn.Linear(cfg.d_model, cfg.vocab_size, bias=False)
+        self.lm_head.weight = self.tok_emb.weight
+        self.apply(self._init_weights)
+    @staticmethod
+    def _init_weights(m: nn.Module) -> None:
+        if isinstance(m, (nn.Linear, nn.Embedding)):
+            nn.init.normal_(m.weight, mean=0.0, std=0.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.zeros_(m.bias)
+    def forward(self, input_ids: torch.Tensor, labels: Optional[torch.Tensor] = None):
+        x = self.tok_emb(input_ids)
+        for block in self.blocks:
+            if self.cfg.use_checkpointing and self.training:
+                x = torch.utils.checkpoint.checkpoint(block, x, use_reentrant=False)
+            else:
+                x = block(x)
+        logits = self.lm_head(self.ln_f(x))
+        loss = None
+        if labels is not None:
+            loss = F.cross_entropy(
+                logits.reshape(-1, logits.size(-1)),
+                labels.reshape(-1),
+                ignore_index=-100,
+            )
+        return logits, loss
+# ============================================================
+# Optimizer / LR
+# ============================================================
+def build_optimizer(model: nn.Module) -> torch.optim.Optimizer:
+    decay, no_decay = [], []
+    for name, p in unwrap_model(model).named_parameters():
+        if not p.requires_grad:
+            continue
+        (decay if p.ndim >= 2 and "weight" in name else no_decay).append(p)
+    return torch.optim.AdamW(
+        [
+            {"params": decay, "weight_decay": WEIGHT_DECAY},
+            {"params": no_decay, "weight_decay": 0.0},
+        ],
+        lr=LEARNING_RATE,
+        betas=(0.9, 0.95),
+        eps=1e-8,
+        fused=torch.cuda.is_available(),
+    )
+def cosine_lr(step: int) -> float:
+    if step < WARMUP_STEPS:
+        return LEARNING_RATE * step / max(1, WARMUP_STEPS)
+    p = min(1.0, (step - WARMUP_STEPS) / max(1, MAX_STEPS - WARMUP_STEPS))
+    return MIN_LR + 0.5 * (LEARNING_RATE - MIN_LR) * (1 + math.cos(math.pi * p))
+# ============================================================
+# Checkpoints
+# ============================================================
+def load_base_config() -> GPTConfig:
+    cfg_dict = json.loads(BASE_CONFIG_FILE.read_text(encoding="utf-8"))
+    cfg_dict["use_checkpointing"] = USE_CHECKPOINTING
+    return GPTConfig(**cfg_dict)
+def initialize_model_from_base(model: nn.Module, device: torch.device) -> None:
+    if not BASE_CHECKPOINT.exists():
+        raise FileNotFoundError(f"Checkpoint de base introuvable: {BASE_CHECKPOINT}")
+    ckpt = torch.load(BASE_CHECKPOINT, map_location=device)
+    state_dict = normalize_state_dict_keys(ckpt["model"])
+    unwrap_model(model).load_state_dict(state_dict, strict=True)
+def save_checkpoint(model, optimizer, step, best_loss, path):
+    raw = unwrap_model(model)
+    model_state = normalize_state_dict_keys(raw.state_dict())
+    torch.save(
+        {
+            "model": model_state,
+            "optimizer": optimizer.state_dict(),
+            "step": step,
+            "best_loss": best_loss,
+            "config": asdict(raw.cfg),
+        },
+        path,
+    )
+def load_resume_checkpoint(model, optimizer, path, device) -> tuple[int, float]:
+    ckpt = torch.load(path, map_location=device)
+    raw = unwrap_model(model)
+    model_state = normalize_state_dict_keys(ckpt["model"])
+    raw.load_state_dict(model_state, strict=True)
+    try:
+        optimizer.load_state_dict(ckpt["optimizer"])
+    except Exception as e:
+        print(f"[warn] Optimizer state non repris: {e}")
+    return int(ckpt.get("step", 0)), float(ckpt.get("best_loss", 1e9))
+# ============================================================
+# Evaluation
+# ============================================================
+@torch.no_grad()
+def evaluate(model, loader, device, max_batches: int = 100) -> float:
+    model.eval()
+    losses = []
+    for i, batch in enumerate(loader):
+        if i >= max_batches:
+            break
+        inp = batch["input_ids"].to(device, non_blocking=True)
+        lbl = batch["labels"].to(device, non_blocking=True)
+        with autocast_context(device):
+            _, loss = model(inp, lbl)
+        losses.append(loss.item())
+    model.train()
+    return sum(losses) / max(1, len(losses))
+# ============================================================
+# Main
+# ============================================================
+def main() -> None:
+    """Continue training the base model for ``MAX_STEPS`` optimizer steps.
+
+    Sequence: set up (optional) distributed mode and seeding, cap the CUDA
+    memory fraction, load the tokenizer and base config, build the model from
+    the base checkpoint, optionally compile it and wrap it in DDP, resume
+    from ``STATE_FILE`` when present, then run the training loop with
+    gradient accumulation, cosine learning rate, gradient clipping, periodic
+    logging, evaluation and checkpointing. Logging, evaluation and saving
+    happen on the main process only.
+    """
+    ddp_device = init_distributed()
+    # Each rank gets its own seed.
+    set_seed(SEED + get_rank())
+    device = get_device(ddp_device)
+    cuda_device_index = None
+    vram_fraction = None
+    if device.type == "cuda":
+        torch.backends.cuda.matmul.allow_tf32 = True
+        torch.backends.cudnn.allow_tf32 = True
+        torch.set_float32_matmul_precision("high")
+        cuda_device_index = current_cuda_index(device)
+        _, total_mem_bytes = torch.cuda.mem_get_info(cuda_device_index)
+        # Limit this process to TARGET_VRAM_GIB, never more than 99.9 % of the device.
+        target_bytes = int(TARGET_VRAM_GIB * (1024 ** 3))
+        vram_fraction = min(target_bytes / total_mem_bytes, 0.999)
+        torch.cuda.memory.set_per_process_memory_fraction(
+            vram_fraction,
+            device=cuda_device_index,
+        )
+    if is_main():
+        print("=" * 60)
+        print(" Re-train même modèle | 10 x 2000 steps")
+        print("=" * 60)
+        print(f"Device: {device} | World: {get_world_size()} GPU(s)")
+        if device.type == "cuda":
+            free_mem, total_mem = torch.cuda.mem_get_info(cuda_device_index)
+            print(f"GPU: {torch.cuda.get_device_name(cuda_device_index)}")
+            print(f"VRAM cible: {TARGET_VRAM_GIB:.1f} GiB")
+            print(f"Fraction PyTorch: {vram_fraction:.4f}")
+            print(f"GPU total: {total_mem / 1024**3:.2f} GiB | libre: {free_mem / 1024**3:.2f} GiB")
+        print(f"Rounds: {NUM_ROUNDS} | Steps/round: {STEPS_PER_ROUND} | MAX_STEPS: {MAX_STEPS}")
+    tokenizer = PreTrainedTokenizerFast.from_pretrained(str(BASE_TOKENIZER_DIR))
+    cfg = load_base_config()
+    # The vocabulary size follows the loaded tokenizer, not the stored config.
+    cfg.vocab_size = len(tokenizer)
+    if is_main():
+        CONFIG_FILE.write_text(
+            json.dumps(asdict(cfg), indent=2, ensure_ascii=False),
+            encoding="utf-8",
+        )
+        print(f"Base checkpoint: {BASE_CHECKPOINT}")
+        print(f"Tokenizer: {BASE_TOKENIZER_DIR}")
+    model = GPT(cfg).to(device)
+    initialize_model_from_base(model, device)
+    # Wrapping order: compile first, then DDP.
+    if USE_COMPILE and hasattr(torch, "compile"):
+        model = torch.compile(model, mode=COMPILE_MODE)
+        if is_main():
+            print(f"torch.compile activé ({COMPILE_MODE})")
+    if is_distributed():
+        model = DDP(model, device_ids=[device.index])
+    optimizer = build_optimizer(model)
+    start_step, best_eval = 0, 1e9
+    if STATE_FILE.exists():
+        try:
+            if is_main():
+                print(f"Reprise depuis {STATE_FILE}")
+            start_step, best_eval = load_resume_checkpoint(model, optimizer, STATE_FILE, device)
+        except Exception as e:
+            # Unreadable state file: set it aside (main process only) and
+            # start again from the base weights loaded above.
+            if is_main():
+                bad_path = STATE_FILE.with_suffix(".corrupt.pt")
+                print(f"[warn] Checkpoint illisible: {e}")
+                try:
+                    STATE_FILE.rename(bad_path)
+                    print(f"[warn] Checkpoint corrompu renommé vers {bad_path}")
+                except Exception:
+                    pass
+                print("[warn] Reprise ignorée, démarrage depuis le checkpoint de base.")
+            start_step, best_eval = 0, 1e9
+    if start_step >= MAX_STEPS:
+        if is_main():
+            print(f"[warn] start_step={start_step} >= MAX_STEPS={MAX_STEPS}")
+            print("[warn] Rien à entraîner.")
+        # NOTE(review): this early return skips dist.destroy_process_group().
+        return
+    train_ds = PackedMixedBlocks(
+        tokenizer=tokenizer,
+        specs=TRAIN_SOURCES,
+        block_size=cfg.block_size,
+    )
+    eval_ds = PackedEvalBlocks(
+        tokenizer=tokenizer,
+        spec=EVAL_SOURCE,
+        block_size=cfg.block_size,
+        max_examples=MAX_EVAL_EXAMPLES,
+    )
+    train_loader = torch.utils.data.DataLoader(
+        train_ds,
+        batch_size=BATCH_SIZE,
+        num_workers=TRAIN_NUM_WORKERS,
+        pin_memory=(device.type == "cuda"),
+    )
+    eval_loader = torch.utils.data.DataLoader(
+        eval_ds,
+        batch_size=BATCH_SIZE,
+        num_workers=EVAL_NUM_WORKERS,
+        pin_memory=(device.type == "cuda"),
+    )
+    if is_main():
+        raw_model = unwrap_model(model)
+        n_params = count_parameters(raw_model)
+        print(f"Paramètres: {n_params / 1e6:.1f}M")
+        print(f"Architecture: d={cfg.d_model} | heads={cfg.n_heads} | layers={cfg.n_layers} | block={cfg.block_size}")
+        print(f"Batch size: {BATCH_SIZE} | Grad accum: {GRAD_ACCUM_STEPS}")
+        print(f"Dtype: {DTYPE} | Compile: {USE_COMPILE} ({COMPILE_MODE if USE_COMPILE else 'off'})")
+    model.train()
+    optimizer.zero_grad(set_to_none=True)
+    # A single iterator over the training stream serves the whole run.
+    train_iter = iter(train_loader)
+    step = start_step
+    t0 = time.time()
+    # Accumulators for the periodic log line; reset after every report.
+    log_loss_sum = 0.0
+    log_loss_count = 0
+    tokens_since_log = 0
+    last_log = time.time()
+    if device.type == "cuda":
+        torch.cuda.reset_peak_memory_stats(cuda_device_index)
+    current_round = (step // STEPS_PER_ROUND) + 1
+    while step < MAX_STEPS:
+        # One optimizer step = GRAD_ACCUM_STEPS micro-batches.
+        for _ in range(GRAD_ACCUM_STEPS):
+            batch = next(train_iter)
+            inp = batch["input_ids"].to(device, non_blocking=True)
+            lbl = batch["labels"].to(device, non_blocking=True)
+            with autocast_context(device):
+                _, loss = model(inp, lbl)
+            # Scale so the accumulated gradient is the mean over micro-batches.
+            (loss / GRAD_ACCUM_STEPS).backward()
+            log_loss_sum += loss.item()
+            log_loss_count += 1
+            tokens_since_log += inp.numel()
+        # The rate is computed from the step index before it is incremented,
+        # so the very first update (step 0) uses cosine_lr(0).
+        lr = cosine_lr(step)
+        for group in optimizer.param_groups:
+            group["lr"] = lr
+        torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
+        optimizer.step()
+        optimizer.zero_grad(set_to_none=True)
+        step += 1
+        # Rounds are 1-based groups of STEPS_PER_ROUND steps; announce each new one.
+        new_round = ((step - 1) // STEPS_PER_ROUND) + 1
+        if new_round != current_round and is_main():
+            current_round = new_round
+            print(f"\n===== Round {current_round}/{NUM_ROUNDS} =====")
+        # Progress report every 50 steps.
+        if step % 50 == 0 and is_main():
+            now = time.time()
+            elapsed = max(1e-6, now - last_log)
+            tok_s = tokens_since_log / elapsed
+            avg_loss = log_loss_sum / max(1, log_loss_count)
+            round_idx = ((step - 1) // STEPS_PER_ROUND) + 1
+            step_in_round = ((step - 1) % STEPS_PER_ROUND) + 1
+            print(
+                f"round {round_idx:2d}/{NUM_ROUNDS} | "
+                f"step {step_in_round:4d}/{STEPS_PER_ROUND} | "
+                f"global {step:5d}/{MAX_STEPS} | "
+                f"loss={avg_loss:.4f} | lr={lr:.2e} | {tok_s:,.0f} tok/s"
+            )
+            if device.type == "cuda":
+                allocated = torch.cuda.memory_allocated(cuda_device_index) / 1024**3
+                reserved = torch.cuda.memory_reserved(cuda_device_index) / 1024**3
+                max_alloc = torch.cuda.max_memory_allocated(cuda_device_index) / 1024**3
+                max_reserved = torch.cuda.max_memory_reserved(cuda_device_index) / 1024**3
+                print(
+                    f"GPU mem | alloc={allocated:.2f} GiB | reserved={reserved:.2f} GiB | "
+                    f"max_alloc={max_alloc:.2f} GiB | max_reserved={max_reserved:.2f} GiB"
+                )
+            last_log = now
+            tokens_since_log = 0
+            log_loss_sum = 0.0
+            log_loss_count = 0
+        # NOTE(review): evaluation runs on the main process only, with `model`
+        # possibly DDP-wrapped — confirm this is safe in multi-GPU runs.
+        if step % EVAL_EVERY == 0 and is_main():
+            val_loss = evaluate(model, eval_loader, device)
+            print(f"[eval] global step {step:5d} | val_loss={val_loss:.4f}")
+            if val_loss < best_eval:
+                best_eval = val_loss
+                save_checkpoint(model, optimizer, step, best_eval, BEST_MODEL_FILE)
+                print(f"✓ Meilleur modèle → {BEST_MODEL_FILE}")
+        # Periodic save: the same payload goes to the resume file and the model file.
+        if step % SAVE_EVERY == 0 and is_main():
+            save_checkpoint(model, optimizer, step, best_eval, STATE_FILE)
+            save_checkpoint(model, optimizer, step, best_eval, MODEL_FILE)
+            print(f"✓ Checkpoint → {MODEL_FILE}")
+        # Extra numbered checkpoint at the end of every round.
+        if step % STEPS_PER_ROUND == 0 and is_main():
+            round_no = step // STEPS_PER_ROUND
+            round_ckpt = OUT_DIR / f"model_round_{round_no:02d}.pt"
+            save_checkpoint(model, optimizer, step, best_eval, round_ckpt)
+            print(f"✓ Fin round {round_no}/{NUM_ROUNDS} → {round_ckpt}")
+    if is_main():
+        # Final save and summary.
+        save_checkpoint(model, optimizer, step, best_eval, MODEL_FILE)
+        save_checkpoint(model, optimizer, step, best_eval, STATE_FILE)
+        total = (time.time() - t0) / 60
+        print(f"\nModèle final → {MODEL_FILE}")
+        print(f"Meilleur modèle → {BEST_MODEL_FILE}")
+        print(f"Temps total : {total:.1f} min")
+        print(f"Steps effectués : {step}")
+    if is_distributed():
+        dist.destroy_process_group()
+# Script entry point: run the training only when the file is executed directly.
+if __name__ == "__main__":
+    main()

train2.py ADDED Viewed

	@@ -0,0 +1,852 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+train_nlp_h100_optimized.py  — v2 (bugfix device mismatch)
+===========================================================
+Corrections vs v1 :
+  • apply_qlora() appelé APRÈS model.to(device) → lora_A/lora_B naissent sur CUDA
+  • LoRALinear.__init__ : move explicite des adaptateurs sur le device du base_layer
+  • torch.compile désactivé quand USE_CHECKPOINTING=True (conflict dynamo+checkpoint
+    avec sous-modules custom) — on utilise COMPILE_AFTER_CKPT pour les cas où on
+    veut quand même compiler (USE_CHECKPOINTING=False)
+  • Ajout d'un fallback propre : si compile crash, on continue sans compile
+"""
+from __future__ import annotations
+import itertools
+import json
+import math
+import os
+import random
+import time
+from collections import OrderedDict
+from contextlib import nullcontext
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Iterator, Optional
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+import torch.nn.functional as F
+try:
+    import bitsandbytes as bnb
+    from bitsandbytes.nn import Params4bit
+    HAS_BNB = True
+except ImportError:
+    HAS_BNB = False
+    print("[warn] bitsandbytes non disponible – quantification 4-bit désactivée")
+try:
+    from flash_attn import flash_attn_func
+    HAS_FLASH = True
+except ImportError:
+    HAS_FLASH = False
+    print("[warn] flash-attn non disponible – fallback F.scaled_dot_product_attention")
+from datasets import load_dataset
+from torch.nn.parallel import DistributedDataParallel as DDP
+from tokenizers import (
+    Tokenizer, decoders, models, normalizers,
+    pre_tokenizers, processors, trainers,
+)
+from transformers import PreTrainedTokenizerFast
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  PATHS                                                                      ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+# Output directory; it is created at import time if missing.
+OUT_DIR        = Path("./nlp_1b_h100_opt")
+OUT_DIR.mkdir(parents=True, exist_ok=True)
+# Files and folders below all live inside OUT_DIR.
+TOKENIZER_DIR  = OUT_DIR / "tokenizer_32k"
+CONFIG_FILE    = OUT_DIR / "config.json"
+MODEL_FILE     = OUT_DIR / "model.pt"
+BEST_MODEL_FILE= OUT_DIR / "model_best.pt"
+STATE_FILE     = OUT_DIR / "train_state.pt"
+# Optional starting checkpoint; None by default.
+BASE_CHECKPOINT: Optional[Path] = None
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  HYPERPARAMETERS                                                            ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+SEED           = 42
+TARGET_VRAM_GIB= 78.0
+# Model dimensions (defaults of GPTConfig).
+BLOCK_SIZE = 1024
+VOCAB_SIZE = 32_000
+D_MODEL    = 1536
+N_HEADS    = 24
+N_LAYERS   = 24
+D_FF       = 6144
+DROPOUT    = 0.0
+# QLoRA settings.
+USE_QLORA           = True
+LORA_R              = 64
+LORA_ALPHA          = 128
+LORA_DROPOUT        = 0.05
+LORA_TARGET_MODULES = ["qkv", "proj", "w1", "w2", "w3"]
+# Optimisation schedule.
+NUM_EPOCHS       = 10
+LEARNING_RATE    = 3e-4
+MIN_LR           = 3e-5
+WEIGHT_DECAY     = 0.1
+WARMUP_STEPS     = 500
+# ┌─────────────────────────────────────────────────────────────────────────────┐
+# │  BATCH SIZE TUNING → 78 GB VRAM                                             │
+# │  Start at : BATCH_SIZE=8, GRAD_ACCUM_STEPS=2                               │
+# │  Raise BATCH_SIZE by +2 until max_reserved ≈ 77 GB in the logs            │
+# │  If OOM   : BATCH_SIZE -= 1  or  USE_CHECKPOINTING=True                   │
+# └─────────────────────────────────────────────────────────────────────────────┘
+BATCH_SIZE       = 16
+GRAD_ACCUM_STEPS = 1
+MAX_GRAD_NORM    = 1.0
+EVAL_EVERY       = 500
+SAVE_EVERY       = 1_000
+DTYPE             = torch.bfloat16
+# ── Compile: disabled when USE_CHECKPOINTING=True to avoid the conflict
+#    dynamo ↔ checkpoint ↔ custom sub-modules (LoRALinear).
+#    Set USE_CHECKPOINTING=False AND USE_COMPILE=True for maximum speed.
+USE_CHECKPOINTING = False    # when enabled, saves ~8× activation VRAM
+USE_COMPILE       = True   # ← set to True only if USE_CHECKPOINTING=False
+COMPILE_MODE      = "reduce-overhead"
+# DataLoader settings.
+TRAIN_NUM_WORKERS = 4
+EVAL_NUM_WORKERS  = 2
+PREFETCH_FACTOR   = 2
+# Document counts and character limits used when sampling text.
+TOKENIZER_SAMPLE_DOCS_PER_SOURCE = 15_000
+TOKENIZER_CHAR_LIMIT             = 2_000
+TEXT_CHAR_LIMIT                  = 4_000
+# Order matters: the four names below are unpacked from this list.
+SPECIAL_TOKENS = ["<pad>", "<bos>", "<eos>", "<unk>"]
+PAD_TOKEN, BOS_TOKEN, EOS_TOKEN, UNK_TOKEN = SPECIAL_TOKENS
+# Data sources: Wikipedia configs (en/fr/ar) and one FineWeb-Edu config.
+WIKI_CONFIGS               = ["20231101.en", "20231101.fr", "20231101.ar"]
+FINEWEB_CONFIG             = "sample-10BT"
+# The first DEV_* documents of each stream form the eval set; training
+# documents are read after them, TRAIN_* per epoch.
+DEV_DOCS_PER_WIKI_CONFIG   = 1_500
+DEV_DOCS_FINEWEB           = 3_000
+TRAIN_DOCS_PER_WIKI_CONFIG = 30_000
+TRAIN_DOCS_FINEWEB         = 60_000
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  DISTRIBUTED                                                                ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+def is_distributed() -> bool:
+    return dist.is_available() and dist.is_initialized()
+def get_rank() -> int:
+    return dist.get_rank() if is_distributed() else 0
+def get_world_size() -> int:
+    return dist.get_world_size() if is_distributed() else 1
+def is_main() -> bool:
+    return get_rank() == 0
+def init_distributed() -> Optional[torch.device]:
+    local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if local_rank == -1:
+        return None
+    dist.init_process_group("nccl")
+    torch.cuda.set_device(local_rank)
+    return torch.device(f"cuda:{local_rank}")
+def set_seed(seed: int) -> None:
+    random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+def get_device(ddp_device: Optional[torch.device] = None) -> torch.device:
+    if ddp_device is not None:
+        return ddp_device
+    if torch.cuda.is_available():
+        return torch.device(f"cuda:{torch.cuda.current_device()}")
+    return torch.device("cpu")
+def current_cuda_index(device: torch.device) -> int:
+    return device.index if device.index is not None else torch.cuda.current_device()
+def autocast_context(device: torch.device):
+    if device.type == "cuda":
+        return torch.autocast("cuda", dtype=DTYPE)
+    return nullcontext()
+def unwrap_model(model: nn.Module) -> nn.Module:
+    m = model.module if isinstance(model, DDP) else model
+    return m._orig_mod if hasattr(m, "_orig_mod") else m
+def count_parameters(model: nn.Module, trainable_only: bool = True) -> int:
+    return sum(p.numel() for p in model.parameters() if not trainable_only or p.requires_grad)
+def normalize_state_dict_keys(sd: dict) -> OrderedDict:
+    out = OrderedDict()
+    for k, v in sd.items():
+        for prefix in ("module._orig_mod.", "_orig_mod.", "module."):
+            if k.startswith(prefix):
+                k = k[len(prefix):]
+                break
+        out[k] = v
+    return out
+def normalize_text(t: str) -> str:
+    return " ".join(t.strip().split())
+def safe_str(x) -> str:
+    return x if isinstance(x, str) else ("" if x is None else str(x))
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  DATASETS                                                                   ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+def load_wiki_stream(cfg_name: str):
+    return load_dataset("wikimedia/wikipedia", cfg_name, split="train", streaming=True)
+def load_fineweb_stream():
+    return load_dataset("HuggingFaceFW/fineweb-edu", FINEWEB_CONFIG, split="train", streaming=True)
+def stream_texts(ds, start: int, count: int, char_limit: int) -> Iterator[str]:
+    for row in itertools.islice(ds, start, start + count):
+        text = normalize_text(safe_str(row.get("text", "")))
+        if len(text) >= 20:
+            yield text[:char_limit]
+def tokenizer_training_iterator() -> Iterator[str]:
+    for c in WIKI_CONFIGS:
+        yield from stream_texts(load_wiki_stream(c), 0, TOKENIZER_SAMPLE_DOCS_PER_SOURCE, TOKENIZER_CHAR_LIMIT)
+    yield from stream_texts(load_fineweb_stream(), 0, TOKENIZER_SAMPLE_DOCS_PER_SOURCE, TOKENIZER_CHAR_LIMIT)
+def build_epoch_train_texts(epoch: int) -> list[str]:
+    texts: list[str] = []
+    for c in WIKI_CONFIGS:
+        start = DEV_DOCS_PER_WIKI_CONFIG + epoch * TRAIN_DOCS_PER_WIKI_CONFIG
+        texts.extend(stream_texts(load_wiki_stream(c), start, TRAIN_DOCS_PER_WIKI_CONFIG, TEXT_CHAR_LIMIT))
+    start = DEV_DOCS_FINEWEB + epoch * TRAIN_DOCS_FINEWEB
+    texts.extend(stream_texts(load_fineweb_stream(), start, TRAIN_DOCS_FINEWEB, TEXT_CHAR_LIMIT))
+    random.Random(SEED + epoch).shuffle(texts)
+    return texts
+def build_eval_texts() -> list[str]:
+    texts: list[str] = []
+    for c in WIKI_CONFIGS:
+        texts.extend(stream_texts(load_wiki_stream(c), 0, DEV_DOCS_PER_WIKI_CONFIG, TEXT_CHAR_LIMIT))
+    texts.extend(stream_texts(load_fineweb_stream(), 0, DEV_DOCS_FINEWEB, TEXT_CHAR_LIMIT))
+    return texts
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  PACKED DATASET                                                             ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+class PackedTextList(torch.utils.data.IterableDataset):
+    def __init__(self, texts, tokenizer, block_size, epoch_seed=0):
+        super().__init__()
+        self.texts      = texts
+        self.tokenizer  = tokenizer
+        self.block_size = block_size
+        self.epoch_seed = epoch_seed
+    def __iter__(self):
+        worker     = torch.utils.data.get_worker_info()
+        rank, ws   = get_rank(), get_world_size()
+        if worker is None:
+            shard_mod, shard_id = ws, rank
+        else:
+            shard_mod = worker.num_workers * ws
+            shard_id  = rank * worker.num_workers + worker.id
+        rng     = random.Random(self.epoch_seed)
+        indices = list(range(len(self.texts)))
+        rng.shuffle(indices)
+        bos, eos = self.tokenizer.bos_token_id, self.tokenizer.eos_token_id
+        buf: list[int] = []
+        for li, ti in enumerate(indices):
+            if li % shard_mod != shard_id:
+                continue
+            ids = self.tokenizer.encode(self.texts[ti], add_special_tokens=False)
+            if not ids:
+                continue
+            buf.extend([bos] + ids + [eos])
+            while len(buf) >= self.block_size + 1:
+                chunk = buf[: self.block_size + 1]
+                buf   = buf[self.block_size + 1 :]
+                yield {
+                    "input_ids": torch.tensor(chunk[:-1], dtype=torch.long),
+                    "labels":    torch.tensor(chunk[1:],  dtype=torch.long),
+                }
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  TOKENIZER                                                                  ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+def tokenizer_ready() -> bool:
+    return (TOKENIZER_DIR / "tokenizer.json").exists() and (TOKENIZER_DIR / "tokenizer_config.json").exists()
+def train_tokenizer_once() -> None:
+    """Train a byte-level BPE tokenizer and save it into ``TOKENIZER_DIR``.
+
+    Two artefacts are written: the raw ``tokenizer.json`` and the files
+    produced by ``PreTrainedTokenizerFast.save_pretrained``.
+    """
+    TOKENIZER_DIR.mkdir(parents=True, exist_ok=True)
+    # Pipeline: NFKC normalisation, byte-level pre-tokenisation and decoding.
+    tok = Tokenizer(models.BPE(unk_token=UNK_TOKEN))
+    tok.normalizer    = normalizers.Sequence([normalizers.NFKC()])
+    tok.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
+    tok.decoder       = decoders.ByteLevel()
+    # The progress bar is shown on the main process only.
+    trainer = trainers.BpeTrainer(
+        vocab_size=VOCAB_SIZE, min_frequency=2, show_progress=is_main(),
+        special_tokens=SPECIAL_TOKENS, initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
+    )
+    tok.train_from_iterator(tokenizer_training_iterator(), trainer=trainer)
+    # The special-token ids are only known once training is done.
+    bos_id, eos_id = tok.token_to_id(BOS_TOKEN), tok.token_to_id(EOS_TOKEN)
+    # Template that wraps sequences in BOS/EOS when special tokens are requested.
+    tok.post_processor = processors.TemplateProcessing(
+        single=f"{BOS_TOKEN} $A {EOS_TOKEN}",
+        pair=f"{BOS_TOKEN} $A {EOS_TOKEN} $B:1 {EOS_TOKEN}:1",
+        special_tokens=[(BOS_TOKEN, bos_id), (EOS_TOKEN, eos_id)],
+    )
+    tok.save(str(TOKENIZER_DIR / "tokenizer.json"))
+    # Re-wrap the saved file as a transformers tokenizer with named special tokens.
+    fast = PreTrainedTokenizerFast(
+        tokenizer_file=str(TOKENIZER_DIR / "tokenizer.json"),
+        bos_token=BOS_TOKEN, eos_token=EOS_TOKEN, unk_token=UNK_TOKEN, pad_token=PAD_TOKEN,
+    )
+    fast.save_pretrained(str(TOKENIZER_DIR))
+def train_or_load_tokenizer() -> PreTrainedTokenizerFast:
+    """Return the tokenizer stored in ``TOKENIZER_DIR``, training it first if needed.
+
+    In distributed runs only the main rank checks the disk and trains; every
+    rank then meets at a barrier before loading. The barrier is unconditional
+    on purpose: if each rank decided on its own whether the tokenizer is ready,
+    a rank that starts late could find the files already written, skip the
+    barrier, and leave the main rank waiting in it.
+
+    Returns:
+        The ``PreTrainedTokenizerFast`` loaded from ``TOKENIZER_DIR``.
+    """
+    TOKENIZER_DIR.mkdir(parents=True, exist_ok=True)
+    if is_distributed():
+        if is_main() and not tokenizer_ready():
+            print("Entraînement tokenizer 32k…")
+            train_tokenizer_once()
+        # Reached by every rank, whether or not training was necessary.
+        dist.barrier()
+    elif not tokenizer_ready():
+        print("Entraînement tokenizer 32k…")
+        train_tokenizer_once()
+    return PreTrainedTokenizerFast.from_pretrained(str(TOKENIZER_DIR))
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  MODÈLE                                                                     ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+@dataclass
+class GPTConfig:
+    """Hyper-parameters of the GPT model; defaults come from module-level constants."""
+    # Number of rows of the token embedding / columns of the output projection.
+    vocab_size: int   = VOCAB_SIZE
+    # Sequence length in tokens; main() forwards it to the packed datasets.
+    block_size: int   = BLOCK_SIZE
+    # Width of the residual stream.
+    d_model:    int   = D_MODEL
+    # Number of attention heads; must divide d_model (asserted in CausalSelfAttention).
+    n_heads:    int   = N_HEADS
+    # Number of stacked transformer blocks.
+    n_layers:   int   = N_LAYERS
+    # Hidden width of the SwiGLU feed-forward.
+    d_ff:       int   = D_FF
+    # Dropout probability applied inside attention during training.
+    dropout:    float = DROPOUT
+    # When true, GPT.forward recomputes block activations during training.
+    use_checkpointing: bool = USE_CHECKPOINTING
+class RMSNorm(nn.Module):
+    """Root-mean-square normalisation over the last dimension with a learned gain."""
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps    = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+    def forward(self, x):
+        """Scale ``x`` by ``weight / sqrt(mean(x**2) + eps)`` along the last axis."""
+        mean_square = x.pow(2).mean(-1, keepdim=True)
+        inv_rms = torch.rsqrt(mean_square + self.eps)
+        gained = self.weight * x
+        return gained * inv_rms
+class RotaryEmbedding(nn.Module):
+    """Precomputed cos/sin tables for rotary position embeddings (RoPE).
+
+    Args:
+        dim: Per-head feature size; each frequency is repeated twice so the
+            tables have ``dim`` columns when ``dim`` is even.
+        base: Base of the geometric frequency progression.
+        max_seq: Number of positions stored in the tables.
+    """
+    def __init__(self, dim: int, base: int = 10_000, max_seq: int = 4_096):
+        super().__init__()
+        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        t     = torch.arange(max_seq).float()
+        freqs = torch.outer(t, inv_freq)
+        # Non-persistent buffers: they follow .to(device) but are not saved in state_dict.
+        self.register_buffer("cos_cache", torch.repeat_interleave(freqs.cos(), 2, dim=-1), persistent=False)
+        self.register_buffer("sin_cache", torch.repeat_interleave(freqs.sin(), 2, dim=-1), persistent=False)
+    def forward(self, seq_len: int, dtype: torch.dtype):
+        """Return the first ``seq_len`` rows of the cos and sin tables cast to ``dtype``.
+
+        Raises:
+            ValueError: if ``seq_len`` exceeds the number of cached positions.
+                Slicing would otherwise return a shorter table silently and
+                fail later with an opaque broadcasting error.
+        """
+        cached = self.cos_cache.size(0)
+        if seq_len > cached:
+            raise ValueError(
+                f"RotaryEmbedding: seq_len={seq_len} exceeds the cached max_seq={cached}"
+            )
+        return self.cos_cache[:seq_len].to(dtype), self.sin_cache[:seq_len].to(dtype)
+def rotate_half(x):
+    """Map each adjacent pair ``(a, b)`` of the last dimension to ``(-b, a)``."""
+    even_lanes = x[..., 0::2]
+    odd_lanes = x[..., 1::2]
+    paired = torch.stack([-odd_lanes, even_lanes], dim=-1)
+    return paired.flatten(start_dim=-2)
+def apply_rope(x, cos, sin):
+    """Rotate ``x`` by the given cos/sin tables.
+
+    ``cos`` and ``sin`` receive two leading singleton axes so they broadcast
+    against ``x``; in the attention module ``x`` is (batch, heads, time, head_dim)
+    and the tables are (time, head_dim).
+    """
+    cos_b = cos[None, None]
+    sin_b = sin[None, None]
+    return x * cos_b + rotate_half(x) * sin_b
+class CausalSelfAttention(nn.Module):
+    """Multi-head causal self-attention with rotary position embeddings.
+
+    Uses ``flash_attn_func`` when ``HAS_FLASH`` is set and PyTorch's
+    ``scaled_dot_product_attention`` otherwise. Attention dropout is only
+    active in training mode.
+    """
+    def __init__(self, cfg: GPTConfig):
+        super().__init__()
+        assert cfg.d_model % cfg.n_heads == 0
+        self.n_heads   = cfg.n_heads
+        self.head_dim  = cfg.d_model // cfg.n_heads
+        self.qkv       = nn.Linear(cfg.d_model, 3 * cfg.d_model, bias=False)
+        self.proj      = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
+        self.dropout_p = cfg.dropout
+        self.rope      = RotaryEmbedding(self.head_dim)
+    def forward(self, x):
+        """Attend over ``x`` of shape (batch, time, d_model); the output has the same shape."""
+        b, t, c = x.shape
+        q, k, v = self.qkv(x).split(c, dim=-1)
+        q = q.view(b, t, self.n_heads, self.head_dim).transpose(1, 2)
+        k = k.view(b, t, self.n_heads, self.head_dim).transpose(1, 2)
+        v = v.view(b, t, self.n_heads, self.head_dim).transpose(1, 2)
+        cos, sin = self.rope(t, x.dtype)
+        # When autocast makes the projection output lower precision than x
+        # (e.g. bf16 vs fp32), multiplying by cos/sin in x.dtype promotes q and k
+        # while v keeps the projection dtype. Cast the rotated tensors back so
+        # q, k and v share one dtype, which flash_attn_func requires.
+        q = apply_rope(q, cos, sin).to(v.dtype)
+        k = apply_rope(k, cos, sin).to(v.dtype)
+        attn_dropout = self.dropout_p if self.training else 0.0
+        if HAS_FLASH:
+            # flash_attn_func expects (b, t, n_heads, head_dim)
+            q = q.transpose(1, 2)
+            k = k.transpose(1, 2)
+            v = v.transpose(1, 2)
+            y = flash_attn_func(q, k, v, dropout_p=attn_dropout, causal=True)
+            y = y.reshape(b, t, c)
+        else:
+            y = F.scaled_dot_product_attention(q, k, v, dropout_p=attn_dropout, is_causal=True)
+            y = y.transpose(1, 2).contiguous().view(b, t, c)
+        return self.proj(y)
+class SwiGLU(nn.Module):
+    """Gated feed-forward layer: ``w3(silu(w1(x)) * w2(x))``, all projections bias-free."""
+    def __init__(self, cfg: GPTConfig):
+        super().__init__()
+        model_dim, hidden_dim = cfg.d_model, cfg.d_ff
+        self.w1 = nn.Linear(model_dim, hidden_dim, bias=False)
+        self.w2 = nn.Linear(model_dim, hidden_dim, bias=False)
+        self.w3 = nn.Linear(hidden_dim, model_dim, bias=False)
+    def forward(self, x):
+        """Project up twice (gate and value), multiply, and project back down."""
+        gate = F.silu(self.w1(x))
+        value = self.w2(x)
+        return self.w3(gate * value)
+class Block(nn.Module):
+    """Pre-norm transformer layer: attention then SwiGLU, each with a residual connection."""
+    def __init__(self, cfg: GPTConfig):
+        super().__init__()
+        self.ln1  = RMSNorm(cfg.d_model)
+        self.attn = CausalSelfAttention(cfg)
+        self.ln2  = RMSNorm(cfg.d_model)
+        self.ff   = SwiGLU(cfg)
+    def forward(self, x):
+        """Apply both residual sub-layers to ``x`` and return the result."""
+        attn_out = self.attn(self.ln1(x))
+        hidden = x + attn_out
+        ff_out = self.ff(self.ln2(hidden))
+        return hidden + ff_out
+class GPT(nn.Module):
+    """Decoder-only language model: embedding, stacked blocks, final norm, tied output head."""
+    def __init__(self, cfg: GPTConfig):
+        super().__init__()
+        self.cfg     = cfg
+        self.tok_emb = nn.Embedding(cfg.vocab_size, cfg.d_model)
+        layers       = [Block(cfg) for _ in range(cfg.n_layers)]
+        self.blocks  = nn.ModuleList(layers)
+        self.ln_f    = RMSNorm(cfg.d_model)
+        self.lm_head = nn.Linear(cfg.d_model, cfg.vocab_size, bias=False)
+        # Weight tying: the output projection shares the embedding matrix.
+        self.lm_head.weight = self.tok_emb.weight
+        self.apply(self._init_weights)
+    @staticmethod
+    def _init_weights(m):
+        """Initialise Linear/Embedding weights from N(0, 0.02) and zero any Linear bias."""
+        is_linear = isinstance(m, nn.Linear)
+        if not (is_linear or isinstance(m, nn.Embedding)):
+            return
+        nn.init.normal_(m.weight, 0.0, 0.02)
+        if is_linear and m.bias is not None:
+            nn.init.zeros_(m.bias)
+    def forward(self, input_ids, labels=None):
+        """Return ``(logits, loss)``; ``loss`` is None unless ``labels`` is given.
+
+        Label positions equal to -100 are ignored by the cross-entropy. Logits
+        and labels are compared position by position, without any shift here.
+        """
+        hidden = self.tok_emb(input_ids)
+        recompute = self.cfg.use_checkpointing and self.training
+        for layer in self.blocks:
+            if recompute:
+                # Trade compute for memory: activations are rebuilt during backward.
+                hidden = torch.utils.checkpoint.checkpoint(layer, hidden, use_reentrant=False)
+            else:
+                hidden = layer(hidden)
+        logits = self.lm_head(self.ln_f(hidden))
+        if labels is None:
+            return logits, None
+        flat_logits = logits.reshape(-1, logits.size(-1))
+        loss = F.cross_entropy(flat_logits, labels.reshape(-1), ignore_index=-100)
+        return logits, loss
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  QLORA                                                                      ║
+# ║                                                                             ║
+# ║  CORRECTIF CLÉ : apply_qlora() DOIT être appelé APRÈS model.to(device).   ║
+# ║  LoRALinear détecte automatiquement le device du base_layer et y crée      ║
+# ║  lora_A / lora_B directement, sans besoin de .to() séparé.                ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+class LoRALinear(nn.Module):
+    """
+    LoRA adapter wrapped around an existing ``nn.Linear``.
+    The ``lora_A`` and ``lora_B`` sub-modules are created on the same device
+    as the first parameter of ``base_layer`` (CPU when it has no parameters),
+    which is the fix for the 'cuda:0 vs cpu' bug of v1. All parameters of the
+    wrapped layer are frozen.
+    """
+    def __init__(self, base_layer: nn.Linear, r: int = LORA_R, alpha: int = LORA_ALPHA, dropout: float = LORA_DROPOUT):
+        super().__init__()
+        self.base  = base_layer
+        self.r     = r
+        self.scale = alpha / r
+        # Take the device from the first parameter rather than from
+        # base_layer.weight, which may be a Params4bit without a direct .device.
+        first_param = next(base_layer.parameters(), None)
+        target_device = first_param.device if first_param is not None else torch.device("cpu")
+        # Build the adapters directly on that device.
+        self.lora_A = nn.Linear(base_layer.in_features, r, bias=False, device=target_device)
+        self.lora_B = nn.Linear(r, base_layer.out_features, bias=False, device=target_device)
+        self.drop   = nn.Dropout(dropout)
+        # Standard LoRA initialisation: B starts at zero, so the adapter is a no-op at first.
+        nn.init.kaiming_uniform_(self.lora_A.weight, a=math.sqrt(5))
+        nn.init.zeros_(self.lora_B.weight)
+        # Freeze the wrapped layer.
+        for frozen in self.base.parameters():
+            frozen.requires_grad = False
+    def forward(self, x):
+        """Return the frozen projection plus the scaled low-rank update."""
+        frozen_out = self.base(x)
+        low_rank = self.lora_B(self.lora_A(self.drop(x)))
+        return frozen_out + low_rank * self.scale
+def apply_qlora(model: GPT, device: torch.device) -> GPT:
+    """
+    Replace every targeted ``nn.Linear`` of ``model`` with a ``LoRALinear`` wrapper.
+    Must be called after ``model.to(device)``. A module is targeted when the last
+    component of its qualified name is in ``LORA_TARGET_MODULES``. ``device`` is
+    only used in the log message. Returns ``model`` (modified in place), or the
+    untouched model when ``USE_QLORA`` is false.
+    """
+    if not USE_QLORA:
+        return model
+    # Snapshot the targets first so the module tree is not mutated while iterating.
+    targets = [
+        (qualified_name, layer)
+        for qualified_name, layer in model.named_modules()
+        if qualified_name.split(".")[-1] in LORA_TARGET_MODULES and isinstance(layer, nn.Linear)
+    ]
+    for qualified_name, layer in targets:
+        *owner_path, attr_name = qualified_name.split(".")
+        owner = model
+        for step in owner_path:
+            owner = getattr(owner, step)
+        setattr(owner, attr_name, LoRALinear(layer))
+    replaced = len(targets)
+    if is_main():
+        print(f"QLoRA : {replaced} couches remplacées (device={device}, NF4={HAS_BNB})")
+    return model
+def freeze_base_weights(model: GPT) -> None:
+    """Leave only parameters whose name contains ``lora_A`` or ``lora_B`` trainable."""
+    adapter_tags = ("lora_A", "lora_B")
+    for param_name, param in model.named_parameters():
+        param.requires_grad = any(tag in param_name for tag in adapter_tags)
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  OPTIMIZER                                                                  ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+def build_optimizer(model: nn.Module) -> torch.optim.Optimizer:
+    """Build the AdamW optimizer over the trainable parameters of ``model``.
+
+    Parameters with at least two dimensions and ``"weight"`` in their name get
+    ``WEIGHT_DECAY``; all other trainable parameters get no weight decay. The
+    two groups are always passed in that order, even when one is empty.
+
+    Returns:
+        ``bnb.optim.PagedAdamW8bit`` when ``HAS_BNB`` is set, otherwise
+        ``torch.optim.AdamW``.
+    """
+    decay, no_decay = [], []
+    for name, p in unwrap_model(model).named_parameters():
+        if not p.requires_grad:
+            continue
+        (decay if p.ndim >= 2 and "weight" in name else no_decay).append(p)
+    groups = [
+        {"params": decay,    "weight_decay": WEIGHT_DECAY},
+        {"params": no_decay, "weight_decay": 0.0},
+    ]
+    if HAS_BNB:
+        return bnb.optim.PagedAdamW8bit(groups, lr=LEARNING_RATE, betas=(0.9, 0.95), eps=1e-8)
+    # Decide on the fused kernel from where the parameters actually live: a
+    # machine can have CUDA available while this model sits on another device.
+    trainable = decay + no_decay
+    use_fused = bool(trainable) and all(p.is_cuda for p in trainable)
+    return torch.optim.AdamW(groups, lr=LEARNING_RATE, betas=(0.9, 0.95), eps=1e-8, fused=use_fused)
+def cosine_lr(step: int, total_steps: int) -> float:
+    """Learning rate at ``step``: linear warm-up, then cosine decay to ``MIN_LR``.
+
+    During the first ``WARMUP_STEPS`` steps the rate grows linearly from 0 to
+    ``LEARNING_RATE``. Afterwards it follows half a cosine period that reaches
+    ``MIN_LR`` at ``total_steps`` and stays there.
+    """
+    in_warmup = step < WARMUP_STEPS
+    if in_warmup:
+        return LEARNING_RATE * step / max(1, WARMUP_STEPS)
+    decay_span = max(1, total_steps - WARMUP_STEPS)
+    progress = min(1.0, (step - WARMUP_STEPS) / decay_span)
+    cosine_term = 1 + math.cos(math.pi * progress)
+    return MIN_LR + 0.5 * (LEARNING_RATE - MIN_LR) * cosine_term
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  CHECKPOINT                                                                 ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+def save_checkpoint(model, optimizer, epoch, step, best_loss, path):
+    """Atomically write a training checkpoint to ``path``.
+
+    The payload holds the unwrapped model weights (keys normalised), the
+    optimizer state, the epoch/step counters, the best loss so far and the
+    model config as a plain dict.
+
+    The data is first written to a sibling ``<name>.tmp`` file and then moved
+    over ``path`` with ``os.replace``, so an interrupted save never leaves a
+    truncated checkpoint where the resume logic expects a valid one.
+    """
+    import os  # local import: the file-level import block is not visible from here
+    raw = unwrap_model(model)
+    payload = {
+        "model": normalize_state_dict_keys(raw.state_dict()),
+        "optimizer": optimizer.state_dict(),
+        "epoch": epoch, "step": step, "best_loss": best_loss,
+        "config": asdict(raw.cfg),
+    }
+    final_path = Path(path)
+    tmp_path = final_path.with_name(final_path.name + ".tmp")
+    try:
+        torch.save(payload, tmp_path)
+        os.replace(tmp_path, final_path)
+    finally:
+        # Only true when the save or the rename failed; do not leave debris behind.
+        if tmp_path.exists():
+            tmp_path.unlink()
+def maybe_load_base_checkpoint(model, device):
+    """Load weights from ``BASE_CHECKPOINT`` into ``model`` when that file exists.
+
+    Does nothing when ``BASE_CHECKPOINT`` is None or missing. Loading is
+    non-strict, so keys that do not line up are skipped; they are reported
+    here because a silent mismatch (for instance after layers were wrapped and
+    their parameter names changed) would leave those weights at their initial
+    values without any visible sign.
+    """
+    if BASE_CHECKPOINT is None or not Path(BASE_CHECKPOINT).exists():
+        return
+    ckpt = torch.load(BASE_CHECKPOINT, map_location=device)
+    result = unwrap_model(model).load_state_dict(normalize_state_dict_keys(ckpt["model"]), strict=False)
+    missing, unexpected = list(result.missing_keys), list(result.unexpected_keys)
+    if missing or unexpected:
+        print(
+            f"[warn] Checkpoint de base chargé partiellement : "
+            f"{len(missing)} clé(s) manquante(s) {missing[:3]}, "
+            f"{len(unexpected)} clé(s) inattendue(s) {unexpected[:3]}"
+        )
+def load_resume_checkpoint(model, optimizer, path, device):
+    """Restore model (strictly) and optimizer (best effort) from ``path``.
+
+    Returns:
+        ``(epoch, step, best_loss)`` read from the checkpoint, defaulting to
+        ``0``, ``0`` and ``1e9`` for absent entries.
+    """
+    state = torch.load(path, map_location=device)
+    target = unwrap_model(model)
+    target.load_state_dict(normalize_state_dict_keys(state["model"]), strict=True)
+    # A mismatching optimizer state is tolerated: training resumes with fresh moments.
+    try:
+        optimizer.load_state_dict(state["optimizer"])
+    except Exception as e:
+        print(f"[warn] Optimizer state non repris: {e}")
+    resumed_epoch = int(state.get("epoch", 0))
+    resumed_step = int(state.get("step", 0))
+    resumed_best = float(state.get("best_loss", 1e9))
+    return resumed_epoch, resumed_step, resumed_best
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  ÉVALUATION                                                                 ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+@torch.no_grad()
+def evaluate(model, loader, device, max_batches=200) -> float:
+    """Return the mean loss of ``model`` over at most ``max_batches`` batches.
+
+    The model is put in eval mode for the duration of the call and its
+    previous train/eval mode is restored afterwards, even if a batch raises.
+
+    Returns:
+        The average per-batch loss, or ``float("inf")`` when the loader yields
+        no batch. Returning 0.0 in that case would look like a perfect score
+        to callers that keep the checkpoint with the lowest loss.
+    """
+    was_training = model.training
+    model.eval()
+    losses = []
+    try:
+        for i, batch in enumerate(loader):
+            if i >= max_batches:
+                break
+            inp = batch["input_ids"].to(device, non_blocking=True)
+            lbl = batch["labels"].to(device, non_blocking=True)
+            with autocast_context(device):
+                _, loss = model(inp, lbl)
+            losses.append(loss.item())
+    finally:
+        model.train(was_training)
+    if not losses:
+        return float("inf")
+    return sum(losses) / len(losses)
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  DATALOADER                                                                 ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+def make_loader(dataset, batch_size, num_workers, is_cuda):
+    """Wrap ``dataset`` in a ``DataLoader``.
+
+    Memory pinning follows ``is_cuda``. Persistent workers and
+    ``PREFETCH_FACTOR`` are only set when worker processes are used.
+    """
+    loader_options = {
+        "batch_size": batch_size,
+        "num_workers": num_workers,
+        "pin_memory": is_cuda,
+    }
+    if num_workers > 0:
+        loader_options.update(persistent_workers=True, prefetch_factor=PREFETCH_FACTOR)
+    return torch.utils.data.DataLoader(dataset, **loader_options)
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  MAIN                                                                       ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+def main() -> None:
+    """Entry point: set up device, tokenizer, model and data, then run training.
+
+    Steps, in order: distributed/device initialisation and VRAM cap, tokenizer,
+    model creation, optional LoRA wrapping, optional base-checkpoint load,
+    optional ``torch.compile``, optional DDP wrapping, optimizer, datasets,
+    resume from ``STATE_FILE``, and the epoch loop with periodic logging,
+    evaluation and checkpointing on the main rank.
+    """
+    ddp_device = init_distributed()
+    set_seed(SEED + get_rank())
+    device  = get_device(ddp_device)
+    is_cuda = device.type == "cuda"
+    cuda_idx      = None
+    vram_fraction = None
+    if is_cuda:
+        torch.backends.cuda.matmul.allow_tf32 = True
+        torch.backends.cudnn.allow_tf32       = True
+        torch.set_float32_matmul_precision("high")
+        cuda_idx = current_cuda_index(device)
+        _, total = torch.cuda.mem_get_info(cuda_idx)
+        # Cap this process at TARGET_VRAM_GIB (never quite 100% of the device).
+        vram_fraction = min(TARGET_VRAM_GIB * (1024**3) / total, 0.999)
+        torch.cuda.memory.set_per_process_memory_fraction(vram_fraction, device=cuda_idx)
+    if is_main():
+        print("=" * 72)
+        print(" GPT ~1B | H100 80 Go | QLoRA + BF16 + TF32 | v2 (device fix)")
+        print("=" * 72)
+        print(f"Device  : {device} | World: {get_world_size()} GPU(s)")
+        print(f"Flash-2 : {HAS_FLASH} | BNB 4-bit: {HAS_BNB} | QLoRA: {USE_QLORA}")
+        print(f"Grad ckpt: {USE_CHECKPOINTING} | Compile: {USE_COMPILE} ({COMPILE_MODE})")
+        if is_cuda:
+            free, total = torch.cuda.mem_get_info(cuda_idx)
+            print(f"GPU     : {torch.cuda.get_device_name(cuda_idx)}")
+            print(f"VRAM    : {total/1024**3:.1f} GiB | libre: {free/1024**3:.1f} GiB")
+    tokenizer = train_or_load_tokenizer()
+    cfg       = GPTConfig(vocab_size=len(tokenizer))
+    if is_main():
+        CONFIG_FILE.write_text(json.dumps(asdict(cfg), indent=2, ensure_ascii=False), encoding="utf-8")
+    # ── 1. Build the model ────────────────────────────────────────────────────
+    model = GPT(cfg).to(device)
+    # ── 2. Apply QLoRA AFTER .to(device) ─────────────────────────────────────
+    #    so that lora_A/lora_B are created on the same device as the base layers
+    if USE_QLORA:
+        model = apply_qlora(model, device)
+        freeze_base_weights(model)
+    # NOTE(review): the base checkpoint is loaded non-strictly AFTER the LoRA
+    # wrapping, which moves the wrapped weights under a ".base." name. Confirm
+    # that normalize_state_dict_keys maps plain checkpoint keys accordingly;
+    # otherwise those weights are skipped and stay at their initial values.
+    maybe_load_base_checkpoint(model, device)
+    # ── 3. torch.compile (only when USE_CHECKPOINTING=False) ─────────────────
+    #    compile + checkpointing + the custom LoRALinear is reported unstable
+    #    with torch.dynamo on PyTorch 2.x, so only one of the two is enabled.
+    if USE_COMPILE and not USE_CHECKPOINTING and hasattr(torch, "compile"):
+        try:
+            model = torch.compile(model, mode=COMPILE_MODE)
+            if is_main():
+                print(f"torch.compile activé ({COMPILE_MODE})")
+        except Exception as e:
+            if is_main():
+                print(f"[warn] torch.compile échoué ({e}) — poursuite sans compile")
+    # ── 4. DDP ────────────────────────────────────────────────────────────────
+    if is_distributed():
+        model = DDP(model, device_ids=[device.index])
+    optimizer = build_optimizer(model)
+    # ── Datasets ──────────────────────────────────────────────────────────────
+    eval_texts  = build_eval_texts()
+    eval_ds     = PackedTextList(eval_texts, tokenizer, cfg.block_size, SEED + 999)
+    eval_loader = make_loader(eval_ds, BATCH_SIZE, EVAL_NUM_WORKERS, is_cuda)
+    init_texts         = build_epoch_train_texts(0)
+    # global_step advances once per GRAD_ACCUM_STEPS micro-batches, so the
+    # schedule horizon is expressed in optimizer steps, not in micro-batches.
+    steps_per_epoch    = max(1, len(init_texts) // (BATCH_SIZE * max(1, GRAD_ACCUM_STEPS)))
+    total_steps_est    = steps_per_epoch * NUM_EPOCHS
+    # ── Resume ────────────────────────────────────────────────────────────────
+    start_epoch, start_step, best_eval = 0, 0, 1e9
+    if STATE_FILE.exists():
+        try:
+            if is_main(): print(f"Reprise depuis {STATE_FILE}")
+            start_epoch, start_step, best_eval = load_resume_checkpoint(model, optimizer, STATE_FILE, device)
+        except Exception as e:
+            if is_main():
+                bad = STATE_FILE.with_suffix(".corrupt.pt")
+                print(f"[warn] Checkpoint illisible: {e}")
+                # Best effort: set the unreadable file aside and start from scratch.
+                try: STATE_FILE.rename(bad)
+                except Exception: pass
+            start_epoch, start_step, best_eval = 0, 0, 1e9
+    if is_main():
+        raw     = unwrap_model(model)
+        n_total = count_parameters(raw, False)
+        n_train = count_parameters(raw, True)
+        print(f"Paramètres totaux    : {n_total/1e9:.3f}B")
+        print(f"Paramètres entraînés : {n_train/1e6:.1f}M ({100*n_train/max(1,n_total):.2f}%)")
+        print(f"Batch size   : {BATCH_SIZE} | Grad accum: {GRAD_ACCUM_STEPS} | Effective: {BATCH_SIZE*GRAD_ACCUM_STEPS}")
+        print(f"Steps estimés: {total_steps_est} | Eval texts: {len(eval_texts)}")
+        print()
+        print("── Conseil VRAM ────────────────────────────────────────────────")
+        print("  Surveille 'max_reserved=XX GiB' à step 50.")
+        print("  Augmente BATCH_SIZE par +2 jusqu'à ~77 Go réservés.")
+        print("  Si OOM : BATCH_SIZE -= 1 ou USE_CHECKPOINTING=True.")
+        print("────────────────────────────────────────────────────────────────")
+    # ── Training loop ─────────────────────────────────────────────────────────
+    model.train()
+    optimizer.zero_grad(set_to_none=True)
+    global_step      = start_step
+    t0               = time.time()
+    log_loss_sum     = 0.0
+    log_loss_count   = 0
+    tokens_since_log = 0
+    last_log         = time.time()
+    if is_cuda:
+        torch.cuda.reset_peak_memory_stats(cuda_idx)
+    for epoch in range(start_epoch, NUM_EPOCHS):
+        if is_main():
+            print(f"\n{'='*20} Epoch {epoch+1}/{NUM_EPOCHS} {'='*20}")
+        train_texts  = build_epoch_train_texts(epoch)
+        train_ds     = PackedTextList(train_texts, tokenizer, cfg.block_size, SEED + epoch)
+        train_loader = make_loader(train_ds, BATCH_SIZE, TRAIN_NUM_WORKERS, is_cuda)
+        for micro_step, batch in enumerate(train_loader):
+            inp = batch["input_ids"].to(device, non_blocking=True)
+            lbl = batch["labels"].to(device, non_blocking=True)
+            with autocast_context(device):
+                _, loss = model(inp, lbl)
+            (loss / GRAD_ACCUM_STEPS).backward()
+            log_loss_sum     += loss.item()
+            log_loss_count   += 1
+            tokens_since_log += inp.numel()
+            # Keep accumulating until a full group of micro-batches is in.
+            if (micro_step + 1) % GRAD_ACCUM_STEPS != 0:
+                continue
+            lr = cosine_lr(global_step, total_steps_est)
+            for group in optimizer.param_groups:
+                group["lr"] = lr
+            torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
+            optimizer.step()
+            optimizer.zero_grad(set_to_none=True)
+            global_step += 1
+            if global_step % 50 == 0 and is_main():
+                now      = time.time()
+                elapsed  = max(1e-6, now - last_log)
+                tok_s    = tokens_since_log / elapsed
+                avg_loss = log_loss_sum / max(1, log_loss_count)
+                print(
+                    f"ep {epoch+1}/{NUM_EPOCHS} | step={global_step:5d} | "
+                    f"loss={avg_loss:.4f} | lr={lr:.2e} | {tok_s:,.0f} tok/s"
+                )
+                if is_cuda:
+                    alloc     = torch.cuda.memory_allocated(cuda_idx)   / 1024**3
+                    reserved  = torch.cuda.memory_reserved(cuda_idx)    / 1024**3
+                    max_alloc = torch.cuda.max_memory_allocated(cuda_idx) / 1024**3
+                    max_res   = torch.cuda.max_memory_reserved(cuda_idx)  / 1024**3
+                    print(
+                        f"GPU mem | alloc={alloc:.2f} | reserved={reserved:.2f} | "
+                        f"max_alloc={max_alloc:.2f} | max_reserved={max_res:.2f}  (GiB)"
+                    )
+                last_log         = now
+                tokens_since_log = 0
+                log_loss_sum     = 0.0
+                log_loss_count   = 0
+            if global_step % EVAL_EVERY == 0 and is_main():
+                # Only the main rank evaluates, so run the bare module: going
+                # through the DDP wrapper here could issue collectives (such as
+                # buffer broadcasts) that the other ranks never join.
+                val_loss = evaluate(unwrap_model(model), eval_loader, device)
+                print(f"[eval] step {global_step:5d} | val_loss={val_loss:.4f}")
+                if val_loss < best_eval:
+                    best_eval = val_loss
+                    save_checkpoint(model, optimizer, epoch, global_step, best_eval, BEST_MODEL_FILE)
+                    print(f"✓ Meilleur modèle → {BEST_MODEL_FILE}")
+            if global_step % SAVE_EVERY == 0 and is_main():
+                save_checkpoint(model, optimizer, epoch, global_step, best_eval, STATE_FILE)
+                save_checkpoint(model, optimizer, epoch, global_step, best_eval, MODEL_FILE)
+                print(f"✓ Checkpoint → {MODEL_FILE}")
+        if is_main():
+            # End of epoch: the stored epoch index points at the next epoch to run.
+            save_checkpoint(model, optimizer, epoch + 1, global_step, best_eval, STATE_FILE)
+            ckpt = OUT_DIR / f"model_epoch_{epoch+1:02d}.pt"
+            save_checkpoint(model, optimizer, epoch + 1, global_step, best_eval, ckpt)
+            print(f"✓ Fin epoch {epoch+1}/{NUM_EPOCHS} → {ckpt}")
+    if is_main():
+        save_checkpoint(model, optimizer, NUM_EPOCHS, global_step, best_eval, MODEL_FILE)
+        save_checkpoint(model, optimizer, NUM_EPOCHS, global_step, best_eval, STATE_FILE)
+        total_min = (time.time() - t0) / 60
+        print(f"\nModèle final    → {MODEL_FILE}")
+        print(f"Meilleur modèle → {BEST_MODEL_FILE}")
+        print(f"Temps total     : {total_min:.1f} min | Steps: {global_step}")
+    if is_distributed():
+        dist.destroy_process_group()
+if __name__ == "__main__":
+    main()

train_aramix_h100_full.py ADDED Viewed

	@@ -0,0 +1,1055 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Entraînement LLM Multi-Domaine — H100 80 Go  [CORRIGÉ]
+════════════════════════════════════════════════════════
+Architecture  : GPT causal ~435M
+               RMSNorm · RoPE · SwiGLU · Flash Attention (SDPA)
+Précision     : BF16 natif + TF32 + fused AdamW
+Compilation   : torch.compile(mode="reduce-overhead")
+Dataset       : 10 domaines en streaming HF, échantillonnage pondéré
+Correctifs v2 :
+  ✓ trust_remote_code=True supprimé (déprécié datasets>=3.x)
+  ✓ wikipedia → datasets.load_dataset sans script legacy
+  ✓ Datasets remplacés par leurs équivalents Parquet/modernes
+  ✓ MAX_STEPS = 5 000  (2 epochs estimées sur corpus réduit)
+  ✓ pubmed_abstracts → pubmed_qa (Parquet natif)
+  ✓ RedPajama CC → allenai/c4 (en/fr/ar)
+  ✓ pile-of-law → joelniklaus/pile_of_law (Parquet)
+  ✓ Gestion propre de StopIteration dans le DataLoader
+Usage mono-GPU :
+    python train_aramix_h100_full.py
+"""
+from __future__ import annotations
+import copy
+import gc
+import json
+import math
+import os
+import random
+import time
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+from typing import Dict, Iterator, List, Optional, Tuple
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.utils.data import DataLoader, IterableDataset
+from datasets import load_dataset
+from tokenizers import (
+    Tokenizer,
+    decoders,
+    models,
+    normalizers,
+    pre_tokenizers,
+    processors,
+    trainers,
+)
+from transformers import PreTrainedTokenizerFast
+# ══════════════════════════════════════════════════════════════════
+# §1  CONFIGURATION GLOBALE
+# ══════════════════════════════════════════════════════════════════
+OUT_DIR = Path("./aramix_h100")
+OUT_DIR.mkdir(parents=True, exist_ok=True)
+SEED = 42
+# ── Tokenizer ─────────────────────────────────────────────────────
+TOKENIZER_DIR         = OUT_DIR / "tokenizer_32k"
+TOKENIZER_VOCAB       = 32_000
+TOKENIZER_SAMPLE_DOCS = 80_000   # reduced to speed things up
+TOKENIZER_CHAR_LIMIT  = 2_000
+SPECIAL_TOKENS = ["<pad>", "<bos>", "<eos>", "<unk>"]
+PAD_TOKEN, BOS_TOKEN, EOS_TOKEN, UNK_TOKEN = SPECIAL_TOKENS
+# ── Architecture (~435M) ──────────────────────────────────────────
+VOCAB_SIZE  = 32_000   # updated after the tokenizer is loaded
+BLOCK_SIZE  = 1024
+D_MODEL     = 1024
+# NOTE(review): D_MODEL (1024) is not a multiple of 18 — confirm the attention
+# module of this script does not require d_model % n_heads == 0.
+N_HEADS     = 18
+N_LAYERS    = 24
+D_FF        = 4096
+DROPOUT     = 0.1
+# ── Training ──────────────────────────────────────────────────────
+LEARNING_RATE    = 3e-4
+MIN_LR           = 3e-5
+WEIGHT_DECAY     = 0.1
+WARMUP_STEPS     = 200          # reduced proportionally (5k steps)
+MAX_STEPS        = 5_000        # ← 5,000 steps / ~2 epochs
+MAX_GRAD_NORM    = 1.0
+BATCH_SIZE          = 32        # H100 80 GB: 32×1024 in BF16 ≈ 26 GB
+GRAD_ACCUM_STEPS    = 1
+EVAL_EVERY          = 500
+SAVE_EVERY          = 1_000
+MAX_EVAL_DOCS_DOM   = 300
+TRAIN_CHAR_LIMIT    = 4_000
+# ── Precision & compilation ───────────────────────────────────────
+DTYPE          = torch.bfloat16
+USE_COMPILE    = True
+USE_CHECKPOINTING = False       # unnecessary with 80 GB of VRAM
+# ── Files ─────────────────────────────────────────────────────────
+MODEL_FILE      = OUT_DIR / "model.pt"
+BEST_MODEL_FILE = OUT_DIR / "model_best.pt"
+STATE_FILE      = OUT_DIR / "train_state.pt"
+CONFIG_FILE     = OUT_DIR / "config.json"
+LOG_FILE        = OUT_DIR / "train_log.jsonl"
+# ══════════════════════════════════════════════════════════════════
+# §2  REGISTRE MULTI-DOMAINES  (tous compatibles Parquet / sans script)
+# ══════════════════════════════════════════════════════════════════
+@dataclass
+class DomainConfig:
+    """Description of one Hugging Face dataset used as a training domain."""
+    # Short identifier of the domain; select_domains() matches on it.
+    name:        str
+    # Dataset path handed to load_dataset().
+    hf_path:     str
+    # Second positional argument of load_dataset() (None when not needed).
+    hf_subset:   Optional[str]
+    # Split name handed to load_dataset().
+    hf_split:    str
+    # Presumably the record key that holds the raw text — TODO confirm in domain_text_stream.
+    text_field:  str
+    # Sampling weight; the weights of DOMAINS are asserted to sum to ~1.0.
+    weight:      float
+    # Presumably the maximum number of characters kept per document — TODO confirm.
+    char_limit:  int
+    # Presumably the minimum document length in characters — TODO confirm.
+    min_chars:   int  = 80
+    # Usage not visible in this part of the file — TODO confirm.
+    lang_filter: Optional[str] = None
+    # ── NEW: some datasets need a nested field ──
+    extra_kwargs: dict = field(default_factory=dict)
+DOMAINS: List[DomainConfig] = [
+    # ── Code / Dev ────────────────────────────────────────────────
+    # bigcode/starcoderdata utilise un script MAIS accepte désormais
+    # le paramètre trust_remote_code=False via HF_DATASETS_TRUST_REMOTE_CODE=0
+    # → on passe par sa version Parquet directe sur HF hub
+    DomainConfig(
+        name="code_python",
+        hf_path="bigcode/the-stack-dedup",
+        hf_subset="data/python",
+        hf_split="train", text_field="content",
+        weight=0.12, char_limit=6_000, min_chars=100,
+    ),
+    DomainConfig(
+        name="code_js",
+        hf_path="bigcode/the-stack-dedup",
+        hf_subset="data/javascript",
+        hf_split="train", text_field="content",
+        weight=0.06, char_limit=5_000, min_chars=100,
+    ),
+    DomainConfig(
+        name="code_csharp",
+        hf_path="bigcode/the-stack-dedup",
+        hf_subset="data/c-sharp",
+        hf_split="train", text_field="content",
+        weight=0.04, char_limit=5_000, min_chars=100,
+    ),
+    # ── Médical ───────────────────────────────────────────────────
+    DomainConfig(
+        name="medical_flashcards",
+        hf_path="medalpaca/medical_meadow_medical_flashcards",
+        hf_subset=None, hf_split="train", text_field="output",
+        weight=0.06, char_limit=3_000, min_chars=60,
+    ),
+    # pubmed_abstracts → pubmed_qa (natif Parquet)
+    DomainConfig(
+        name="medical_pubmed",
+        hf_path="qiaojin/PubMedQA",
+        hf_subset="pqa_labeled",
+        hf_split="train", text_field="long_answer",
+        weight=0.06, char_limit=4_000, min_chars=100,
+    ),
+    # ── Français ──────────────────────────────────────────────────
+    # wikipedia sans script : utiliser la version datasets>=2.14 qui
+    # charge directement les Parquet sans script .py
+    DomainConfig(
+        name="french_wiki",
+        hf_path="wikimedia/wikipedia",
+        hf_subset="20231101.fr",
+        hf_split="train", text_field="text",
+        weight=0.08, char_limit=5_000, min_chars=100,
+    ),
+    DomainConfig(
+        name="french_culture",
+        hf_path="lyon-nlp/corpus-france-culture-inter-2023",
+        hf_subset=None, hf_split="train", text_field="text",
+        weight=0.04, char_limit=4_000, min_chars=80,
+    ),
+    DomainConfig(
+        name="french_news",
+        hf_path="mlsum", hf_subset="fr",
+        hf_split="train", text_field="text",
+        weight=0.03, char_limit=3_000, min_chars=80,
+    ),
+    # ── Arabe ─────────────────────────────────────────────────────
+    DomainConfig(
+        name="arabic_aramix",
+        hf_path="AdaMLLab/AraMix", hf_subset="matched",
+        hf_split="train", text_field="text",
+        weight=0.10, char_limit=4_000, min_chars=80,
+    ),
+    DomainConfig(
+        name="arabic_wiki",
+        hf_path="wikimedia/wikipedia",
+        hf_subset="20231101.ar",
+        hf_split="train", text_field="text",
+        weight=0.05, char_limit=4_000, min_chars=80,
+    ),
+    # OSCAR-2301 → oscar-corpus/OSCAR-2301 reste supporté sans script
+    DomainConfig(
+        name="arabic_oscar",
+        hf_path="oscar-corpus/OSCAR-2301", hf_subset="ar",
+        hf_split="train", text_field="content",
+        weight=0.04, char_limit=3_000, min_chars=80,
+    ),
+    # ── Créatif ───────────────────────────────────────────────────
+    DomainConfig(
+        name="creative_writing",
+        hf_path="ajibawa-2023/creative-writing-40k",
+        hf_subset=None, hf_split="train", text_field="output",
+        weight=0.05, char_limit=5_000, min_chars=100,
+    ),
+    DomainConfig(
+        name="stories",
+        hf_path="roneneldan/TinyStories",
+        hf_subset=None, hf_split="train", text_field="text",
+        weight=0.04, char_limit=3_000, min_chars=80,
+    ),
+    DomainConfig(
+        name="reddit_posts",
+        hf_path="webis/tldr-17",
+        hf_subset=None, hf_split="train", text_field="content",
+        weight=0.03, char_limit=3_000, min_chars=80,
+    ),
+    # ── Mathématiques ─────────────────────────────────────────────
+    DomainConfig(
+        name="math_stackexchange",
+        hf_path="math-ai/StackMathQA",
+        hf_subset=None, hf_split="train", text_field="A",
+        weight=0.04, char_limit=4_000, min_chars=80,
+    ),
+    DomainConfig(
+        name="math_problems",
+        hf_path="lighteval/MATH",
+        hf_subset=None, hf_split="train", text_field="solution",
+        weight=0.03, char_limit=3_000, min_chars=60,
+    ),
+    # ── Juridique ─────────────────────────────────────────────────
+    # joelniklaus/pile_of_law = version Parquet de pile-of-law
+    DomainConfig(
+        name="legal_en",
+        hf_path="joelniklaus/pile_of_law",
+        hf_subset="courtlistener_opinions",
+        hf_split="train", text_field="text",
+        weight=0.04, char_limit=5_000, min_chars=100,
+    ),
+    DomainConfig(
+        name="legal_fr",
+        hf_path="antoinelouis/french-legal-corpus",
+        hf_subset=None, hf_split="train", text_field="text",
+        weight=0.02, char_limit=4_000, min_chars=80,
+    ),
+    # ── Science ───────────────────────────────────────────────────
+    # RedPajama arxiv → allenai/peS2o (semantic scholar, Parquet)
+    DomainConfig(
+        name="science_arxiv",
+        hf_path="allenai/peS2o",
+        hf_subset=None,
+        hf_split="train", text_field="text",
+        weight=0.05, char_limit=6_000, min_chars=100,
+    ),
+    # ── Multilingue général ───────────────────────────────────────
+    # RedPajama CC → allenai/c4 multilingual (Parquet)
+    DomainConfig(
+        name="multilingual_cc",
+        hf_path="allenai/c4",
+        hf_subset="multilingual",
+        hf_split="train", text_field="text",
+        weight=0.02, char_limit=3_000, min_chars=80,
+    ),
+]
+# Import-time sanity check: the domain sampling weights must sum to ~1.0
+# (absolute tolerance 0.01); otherwise the module fails to import.
+_wsum = sum(d.weight for d in DOMAINS)
+assert abs(_wsum - 1.0) < 0.01, f"Somme des poids = {_wsum:.4f} ≠ 1.0"
+def select_domains(*names: str) -> List[DomainConfig]:
+    """Sous-sélection + renormalisation automatique des poids."""
+    selected = [d for d in DOMAINS if d.name in names]
+    if not selected:
+        raise ValueError(f"Aucun domaine parmi : {names}")
+    total = sum(d.weight for d in selected)
+    out = []
+    for d in selected:
+        dc = copy.copy(d)
+        dc.weight = round(d.weight / total, 6)
+        out.append(dc)
+    return out
+def print_domain_summary(domains: Optional[List[DomainConfig]] = None) -> None:
+    if domains is None:
+        domains = DOMAINS
+    print(f"\n{'Domaine':<25} {'Dataset HF':<45} {'Poids':>6}")
+    print("─" * 80)
+    for d in sorted(domains, key=lambda x: -x.weight):
+        sub = f"/{d.hf_subset}" if d.hf_subset else ""
+        print(f"{d.name:<25} {d.hf_path + sub:<45} {d.weight:>6.1%}")
+    print(f"{'TOTAL':<25} {'':<45} {sum(d.weight for d in domains):>6.1%}\n")
+# ══════════════════════════════════════════════════════════════════
+# §3  STREAMING DATASET MULTI-DOMAINES
+# ══════════════════════════════════════════════════════════════════
+def domain_text_stream(
+    domain: DomainConfig,
+    max_docs: Optional[int] = None,
+) -> Iterator[str]:
+    """
+    Charge, filtre et nettoie le texte brut d'un domaine HF en streaming.
+    CORRECTIF : trust_remote_code supprimé, gestion d'erreur par domaine
+    pour éviter de planter tout l'entraînement si un dataset échoue.
+    """
+    try:
+        ds = load_dataset(
+            domain.hf_path,
+            domain.hf_subset,
+            split=domain.hf_split,
+            streaming=True,
+            # trust_remote_code=True  ← SUPPRIMÉ (déprécié datasets>=3.x)
+        )
+    except Exception as e:
+        print(f"[WARN] Domaine '{domain.name}' impossible à charger : {e}")
+        return   # domaine ignoré proprement
+    n = 0
+    for row in ds:
+        text = row.get(domain.text_field, "")
+        if not text or not isinstance(text, str):
+            continue
+        if domain.lang_filter:
+            lang = row.get("lang", row.get("language", ""))
+            if lang and lang != domain.lang_filter:
+                continue
+        text = " ".join(text.strip().split())
+        if len(text) < domain.min_chars:
+            continue
+        yield text[: domain.char_limit]
+        n += 1
+        if max_docs and n >= max_docs:
+            break
+def interleaved_text_stream(
+    domains: List[DomainConfig],
+    max_docs_per_domain: Optional[int] = None,
+    seed: int = 42,
+) -> Iterator[Tuple[str, str]]:
+    """
+    Mélange stochastique pondéré des domaines.
+    Yield : (domain_name, text)
+    CORRECTIF : quand tous les domaines sont épuisés → StopIteration propre.
+    """
+    rng = random.Random(seed)
+    iters = {d.name: domain_text_stream(d, max_docs=max_docs_per_domain) for d in domains}
+    exhausted: set = set()
+    while len(exhausted) < len(domains):
+        active = [d for d in domains if d.name not in exhausted]
+        if not active:
+            break
+        chosen = rng.choices(active, weights=[d.weight for d in active], k=1)[0]
+        try:
+            yield chosen.name, next(iters[chosen.name])
+        except StopIteration:
+            exhausted.add(chosen.name)
+def packed_block_stream(
+    tokenizer: PreTrainedTokenizerFast,
+    domains: List[DomainConfig],
+    block_size: int,
+    max_docs_per_domain: Optional[int] = None,
+    seed: int = 42,
+) -> Iterator[Dict]:
+    """
+    Tokenise et pack les textes en blocs denses de block_size tokens.
+    Yield : {"input_ids": list[int], "labels": list[int], "domain": str}
+    """
+    bos, eos = tokenizer.bos_token_id, tokenizer.eos_token_id
+    buffer:        List[int] = []
+    buffer_domain: List[str] = []
+    for domain_name, text in interleaved_text_stream(domains, max_docs_per_domain, seed):
+        ids = tokenizer.encode(text, add_special_tokens=False)
+        if not ids:
+            continue
+        seq = [bos] + ids + [eos]
+        buffer.extend(seq)
+        buffer_domain.extend([domain_name] * len(seq))
+        while len(buffer) >= block_size + 1:
+            chunk        = buffer[:block_size + 1]
+            chunk_domain = buffer_domain[:block_size + 1]
+            buffer        = buffer[block_size + 1:]
+            buffer_domain = buffer_domain[block_size + 1:]
+            majority = max(set(chunk_domain), key=chunk_domain.count)
+            yield {"input_ids": chunk[:-1], "labels": chunk[1:], "domain": majority}
+class MultiDomainPackedDataset(IterableDataset):
+    """
+    IterableDataset multi-domaines avec sharding inter-workers.
+    Compatible DataLoader(num_workers=N).
+    """
+    def __init__(
+        self,
+        tokenizer: PreTrainedTokenizerFast,
+        domains: List[DomainConfig],
+        block_size: int,
+        max_docs_per_domain: Optional[int] = None,
+        seed: int = 42,
+    ):
+        super().__init__()
+        self.tokenizer           = tokenizer
+        self.domains             = domains
+        self.block_size          = block_size
+        self.max_docs_per_domain = max_docs_per_domain
+        self.seed                = seed
+    def __iter__(self):
+        worker    = torch.utils.data.get_worker_info()
+        wid       = worker.id          if worker else 0
+        n_workers = worker.num_workers if worker else 1
+        for idx, block in enumerate(packed_block_stream(
+            self.tokenizer, self.domains, self.block_size,
+            self.max_docs_per_domain, seed=self.seed + wid,
+        )):
+            if idx % n_workers != wid:
+                continue
+            yield {
+                "input_ids": torch.tensor(block["input_ids"], dtype=torch.long),
+                "labels":    torch.tensor(block["labels"],    dtype=torch.long),
+                "domain":    block["domain"],
+            }
+def build_dataloaders(
+    tokenizer: PreTrainedTokenizerFast,
+    domains: List[DomainConfig],
+    block_size: int,
+    train_batch_size: int,
+    eval_batch_size: int = 16,
+    max_eval_docs_per_dom: int = 300,
+    num_workers: int = 4,
+    seed: int = 42,
+) -> Tuple[DataLoader, DataLoader]:
+    train_ds = MultiDomainPackedDataset(
+        tokenizer, domains, block_size,
+        max_docs_per_domain=None,
+        seed=seed,
+    )
+    eval_ds = MultiDomainPackedDataset(
+        tokenizer, domains, block_size,
+        max_docs_per_domain=max_eval_docs_per_dom,
+        seed=seed + 9999,
+    )
+    def collate_fn(batch):
+        return {
+            "input_ids": torch.stack([b["input_ids"] for b in batch]),
+            "labels":    torch.stack([b["labels"]    for b in batch]),
+            "domain":    [b["domain"] for b in batch],
+        }
+    train_loader = DataLoader(
+        train_ds, batch_size=train_batch_size,
+        num_workers=num_workers, pin_memory=True,
+        prefetch_factor=2, collate_fn=collate_fn,
+    )
+    eval_loader = DataLoader(
+        eval_ds, batch_size=eval_batch_size,
+        num_workers=max(1, num_workers // 2), pin_memory=True,
+        prefetch_factor=2, collate_fn=collate_fn,
+    )
+    return train_loader, eval_loader
+# ══════════════════════════════════════════════════════════════════
+# §4  TOKENIZER BPE 32k
+# ══════════════════════════════════════════════════════════════════
+def train_or_load_tokenizer(
+    domains: List[DomainConfig],
+) -> PreTrainedTokenizerFast:
+    TOKENIZER_DIR.mkdir(parents=True, exist_ok=True)
+    tok_json = TOKENIZER_DIR / "tokenizer.json"
+    tok_cfg  = TOKENIZER_DIR / "tokenizer_config.json"
+    if tok_json.exists() and tok_cfg.exists():
+        print("Tokenizer existant chargé depuis le cache.")
+        return PreTrainedTokenizerFast.from_pretrained(str(TOKENIZER_DIR))
+    if is_main():
+        print("Entraînement tokenizer BPE 32k…")
+    def _iter_sample() -> Iterator[str]:
+        n_per_domain = max(1, TOKENIZER_SAMPLE_DOCS // len(domains))
+        for domain in domains:
+            for text in domain_text_stream(domain, max_docs=n_per_domain):
+                yield text
+    tok = Tokenizer(models.BPE(unk_token=UNK_TOKEN))
+    tok.normalizer    = normalizers.Sequence([normalizers.NFKC()])
+    tok.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
+    tok.decoder       = decoders.ByteLevel()
+    trainer = trainers.BpeTrainer(
+        vocab_size=TOKENIZER_VOCAB,
+        min_frequency=2,
+        show_progress=is_main(),
+        special_tokens=SPECIAL_TOKENS,
+        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
+    )
+    tok.train_from_iterator(_iter_sample(), trainer=trainer)
+    bos_id = tok.token_to_id(BOS_TOKEN)
+    eos_id = tok.token_to_id(EOS_TOKEN)
+    tok.post_processor = processors.TemplateProcessing(
+        single=f"{BOS_TOKEN} $A {EOS_TOKEN}",
+        pair=f"{BOS_TOKEN} $A {EOS_TOKEN} $B:1 {EOS_TOKEN}:1",
+        special_tokens=[(BOS_TOKEN, bos_id), (EOS_TOKEN, eos_id)],
+    )
+    tok.save(str(tok_json))
+    fast = PreTrainedTokenizerFast(
+        tokenizer_file=str(tok_json),
+        bos_token=BOS_TOKEN, eos_token=EOS_TOKEN,
+        unk_token=UNK_TOKEN, pad_token=PAD_TOKEN,
+    )
+    fast.save_pretrained(str(TOKENIZER_DIR))
+    smap = TOKENIZER_DIR / "special_tokens_map.json"
+    if not smap.exists():
+        smap.write_text(json.dumps({
+            "bos_token": BOS_TOKEN, "eos_token": EOS_TOKEN,
+            "unk_token": UNK_TOKEN, "pad_token": PAD_TOKEN,
+        }, indent=2), encoding="utf-8")
+    return PreTrainedTokenizerFast.from_pretrained(str(TOKENIZER_DIR))
+# ══════════════════════════════════════════════════════════════════
+# §5  ARCHITECTURE GPT
+# ══════════════════════════════════════════════════════════════════
+@dataclass
+class GPTConfig:
+    """Hyperparameters of the GPT model; defaults come from module-level constants."""
+    # Number of rows of the token embedding / width of the output projection.
+    vocab_size:        int   = VOCAB_SIZE
+    # Context length in tokens (stored in the config; the model classes below
+    # do not read it directly).
+    block_size:        int   = BLOCK_SIZE
+    # Width of the residual stream.
+    d_model:           int   = D_MODEL
+    # Attention heads; d_model must be divisible by n_heads.
+    n_heads:           int   = N_HEADS
+    # Number of transformer blocks.
+    n_layers:          int   = N_LAYERS
+    # Hidden width of the SwiGLU feed-forward layer.
+    d_ff:              int   = D_FF
+    # Attention dropout probability (applied only in training mode).
+    dropout:           float = DROPOUT
+    # Recompute block activations in the backward pass (training mode only).
+    use_checkpointing: bool  = USE_CHECKPOINTING
+class RMSNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(dim))
+        self.eps = eps
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.weight * x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+class RotaryEmbedding(nn.Module):
+    def __init__(self, dim: int, base: int = 10_000, max_seq: int = 4096):
+        super().__init__()
+        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        t = torch.arange(max_seq).float()
+        freqs = torch.outer(t, inv_freq)
+        self.register_buffer("cos_cache", torch.repeat_interleave(freqs.cos(), 2, dim=-1), persistent=False)
+        self.register_buffer("sin_cache", torch.repeat_interleave(freqs.sin(), 2, dim=-1), persistent=False)
+    def forward(self, seq_len: int, dtype: torch.dtype) -> Tuple[torch.Tensor, torch.Tensor]:
+        return self.cos_cache[:seq_len].to(dtype), self.sin_cache[:seq_len].to(dtype)
+def rotate_half(x: torch.Tensor) -> torch.Tensor:
+    return torch.stack((-x[..., 1::2], x[..., ::2]), dim=-1).flatten(-2)
+def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
+    return x * cos.unsqueeze(0).unsqueeze(0) + rotate_half(x) * sin.unsqueeze(0).unsqueeze(0)
+class CausalSelfAttention(nn.Module):
+    def __init__(self, cfg: GPTConfig):
+        super().__init__()
+        assert cfg.d_model % cfg.n_heads == 0
+        self.n_heads  = cfg.n_heads
+        self.head_dim = cfg.d_model // cfg.n_heads
+        self.qkv      = nn.Linear(cfg.d_model, 3 * cfg.d_model, bias=False)
+        self.proj     = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
+        self.dropout_p = cfg.dropout
+        self.rope      = RotaryEmbedding(self.head_dim)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        b, t, c = x.shape
+        q, k, v = self.qkv(x).split(c, dim=-1)
+        q = q.view(b, t, self.n_heads, self.head_dim).transpose(1, 2)
+        k = k.view(b, t, self.n_heads, self.head_dim).transpose(1, 2)
+        v = v.view(b, t, self.n_heads, self.head_dim).transpose(1, 2)
+        cos, sin = self.rope(t, x.dtype)
+        q, k = apply_rope(q, cos, sin), apply_rope(k, cos, sin)
+        # Flash Attention via SDPA (PyTorch ≥2.0, natif H100)
+        y = F.scaled_dot_product_attention(
+            q, k, v,
+            dropout_p=self.dropout_p if self.training else 0.0,
+            is_causal=True,
+        )
+        return self.proj(y.transpose(1, 2).contiguous().view(b, t, c))
+class SwiGLU(nn.Module):
+    def __init__(self, cfg: GPTConfig):
+        super().__init__()
+        self.w1 = nn.Linear(cfg.d_model, cfg.d_ff, bias=False)
+        self.w2 = nn.Linear(cfg.d_model, cfg.d_ff, bias=False)
+        self.w3 = nn.Linear(cfg.d_ff,   cfg.d_model, bias=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.w3(F.silu(self.w1(x)) * self.w2(x))
+class Block(nn.Module):
+    def __init__(self, cfg: GPTConfig):
+        super().__init__()
+        self.ln1  = RMSNorm(cfg.d_model)
+        self.attn = CausalSelfAttention(cfg)
+        self.ln2  = RMSNorm(cfg.d_model)
+        self.ff   = SwiGLU(cfg)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x + self.attn(self.ln1(x))
+        x = x + self.ff(self.ln2(x))
+        return x
+class GPT(nn.Module):
+    def __init__(self, cfg: GPTConfig):
+        super().__init__()
+        self.cfg     = cfg
+        self.tok_emb = nn.Embedding(cfg.vocab_size, cfg.d_model)
+        self.blocks  = nn.ModuleList([Block(cfg) for _ in range(cfg.n_layers)])
+        self.ln_f    = RMSNorm(cfg.d_model)
+        self.lm_head = nn.Linear(cfg.d_model, cfg.vocab_size, bias=False)
+        self.lm_head.weight = self.tok_emb.weight   # weight tying
+        self.apply(self._init_weights)
+    @staticmethod
+    def _init_weights(m: nn.Module) -> None:
+        if isinstance(m, (nn.Linear, nn.Embedding)):
+            nn.init.normal_(m.weight, mean=0.0, std=0.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.zeros_(m.bias)
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        labels: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        x = self.tok_emb(input_ids)
+        for block in self.blocks:
+            if self.cfg.use_checkpointing and self.training:
+                x = torch.utils.checkpoint.checkpoint(block, x, use_reentrant=False)
+            else:
+                x = block(x)
+        logits = self.lm_head(self.ln_f(x))
+        loss = None
+        if labels is not None:
+            loss = F.cross_entropy(
+                logits.reshape(-1, logits.size(-1)),
+                labels.reshape(-1),
+                ignore_index=-100,
+            )
+        return logits, loss
+# ══════════════════════════════════════════════════════════════════
+# §6  OPTIMIZER & LR SCHEDULE
+# ══════════════════════════════════════════════════════════════════
+def build_optimizer(model: nn.Module) -> torch.optim.Optimizer:
+    decay, no_decay = [], []
+    for name, p in model.named_parameters():
+        if not p.requires_grad:
+            continue
+        (decay if p.ndim >= 2 and "weight" in name else no_decay).append(p)
+    return torch.optim.AdamW(
+        [
+            {"params": decay,    "weight_decay": WEIGHT_DECAY},
+            {"params": no_decay, "weight_decay": 0.0},
+        ],
+        lr=LEARNING_RATE, betas=(0.9, 0.95), eps=1e-8,
+        fused=True,   # kernel unique GPU → +10-15% sur H100
+    )
+def cosine_lr(step: int) -> float:
+    if step < WARMUP_STEPS:
+        return LEARNING_RATE * step / max(1, WARMUP_STEPS)
+    p = min(1.0, (step - WARMUP_STEPS) / max(1, MAX_STEPS - WARMUP_STEPS))
+    return MIN_LR + 0.5 * (LEARNING_RATE - MIN_LR) * (1.0 + math.cos(math.pi * p))
+# ══════════════════════════════════════════════════════════════════
+# §7  CHECKPOINT
+# ══════════════════════════════════════════════════════════════════
+def save_checkpoint(
+    model: nn.Module,
+    optimizer: torch.optim.Optimizer,
+    step: int,
+    best_loss: float,
+    path: Path,
+) -> None:
+    raw = model.module if isinstance(model, DDP) else model
+    torch.save({
+        "model":     raw.state_dict(),
+        "optimizer": optimizer.state_dict(),
+        "step":      step,
+        "best_loss": best_loss,
+        "config":    asdict(raw.cfg),
+    }, path)
+def load_checkpoint(
+    model: nn.Module,
+    optimizer: torch.optim.Optimizer,
+    path: Path,
+    device: torch.device,
+) -> Tuple[int, float]:
+    ckpt = torch.load(path, map_location=device)
+    raw  = model.module if isinstance(model, DDP) else model
+    raw.load_state_dict(ckpt["model"])
+    optimizer.load_state_dict(ckpt["optimizer"])
+    return int(ckpt.get("step", 0)), float(ckpt.get("best_loss", 1e9))
+# ══════════════════════════════════════════════════════════════════
+# §8  DDP HELPERS
+# ══════════════════════════════════════════════════════════════════
+def init_distributed() -> Optional[torch.device]:
+    local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if local_rank == -1:
+        return None
+    dist.init_process_group("nccl")
+    torch.cuda.set_device(local_rank)
+    return torch.device(f"cuda:{local_rank}")
+def is_distributed() -> bool:
+    return dist.is_available() and dist.is_initialized()
+def get_rank() -> int:
+    return dist.get_rank() if is_distributed() else 0
+def get_world_size() -> int:
+    return dist.get_world_size() if is_distributed() else 1
+def is_main() -> bool:
+    return get_rank() == 0
+# ══════════════════════════════════════════════════════════════════
+# §9  ÉVALUATION
+# ══════════════════════════════════════════════════════════════════
+@torch.no_grad()
+def evaluate(
+    model: nn.Module,
+    loader: DataLoader,
+    device: torch.device,
+    max_batches: int = 80,
+) -> Tuple[float, Dict[str, float]]:
+    model.eval()
+    total_loss, total_n = 0.0, 0
+    domain_losses: Dict[str, list] = {}
+    for i, batch in enumerate(loader):
+        if i >= max_batches:
+            break
+        inp = batch["input_ids"].to(device)
+        lbl = batch["labels"].to(device)
+        domains_batch = batch["domain"]
+        with torch.autocast("cuda", dtype=DTYPE):
+            _, loss = model(inp, lbl)
+        lv = loss.item()
+        total_loss += lv
+        total_n    += 1
+        for dom in domains_batch:
+            domain_losses.setdefault(dom, []).append(lv)
+    model.train()
+    global_loss = total_loss / max(1, total_n)
+    per_domain  = {k: sum(v) / len(v) for k, v in domain_losses.items()}
+    return global_loss, per_domain
+# ══════════════════════════════════════════════════════════════════
+# §10  LOGGING
+# ══════════════════════════════════════════════════════════════════
+def log_jsonl(path: Path, record: dict) -> None:
+    with open(path, "a", encoding="utf-8") as f:
+        f.write(json.dumps(record, ensure_ascii=False) + "\n")
+# ══════════════════════════════════════════════════════════════════
+# §11  MAIN
+# ══════════════════════════════════════════════════════════════════
+def main() -> None:
+    # ── DDP init ──────────────────────────────────────────────────
+    ddp_device = init_distributed()
+    set_seed_fn(SEED + get_rank())
+    device = ddp_device if ddp_device else torch.device(
+        "cuda" if torch.cuda.is_available() else "cpu"
+    )
+    # ── Optimisations H100 ────────────────────────────────────────
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.allow_tf32       = True
+    if is_main():
+        print("=" * 60)
+        print(" LLM Multi-Domaine — H100 Training  [v2 CORRIGÉ]")
+        print("=" * 60)
+        gpu = torch.cuda.get_device_name(device) if device.type == "cuda" else "CPU"
+        print(f"Device  : {device}  ({gpu})")
+        print(f"GPUs    : {get_world_size()}")
+        print(f"Steps   : {MAX_STEPS} (~2 epochs sur corpus réduit)")
+        print_domain_summary()
+    # ── Tokenizer ─────────────────────────────────────────────────
+    tokenizer = train_or_load_tokenizer(DOMAINS)
+    vocab_size = len(tokenizer)
+    if is_main():
+        print(f"Tokenizer : {TOKENIZER_DIR} | vocab={vocab_size}")
+    # ── Modèle ────────────────────────────────────────────────────
+    cfg = GPTConfig(vocab_size=vocab_size)
+    if is_main():
+        CONFIG_FILE.write_text(
+            json.dumps(asdict(cfg), indent=2, ensure_ascii=False),
+            encoding="utf-8",
+        )
+    model = GPT(cfg).to(device)
+    if USE_COMPILE and device.type == "cuda":
+        model = torch.compile(model, mode="reduce-overhead")
+        if is_main():
+            print("torch.compile : reduce-overhead ✓")
+    if is_distributed():
+        model = DDP(model, device_ids=[device.index])
+    optimizer = build_optimizer(model)
+    # ── Reprise depuis checkpoint ──────────────────────────────────
+    start_step, best_eval = 0, 1e9
+    if STATE_FILE.exists():
+        if is_main():
+            print(f"Reprise depuis {STATE_FILE}")
+        start_step, best_eval = load_checkpoint(model, optimizer, STATE_FILE, device)
+        if is_main():
+            print(f"  → reprise à step {start_step}, best_loss={best_eval:.4f}")
+    # ── DataLoaders ───────────────────────────────────────────────
+    train_loader, eval_loader = build_dataloaders(
+        tokenizer             = tokenizer,
+        domains               = DOMAINS,
+        block_size            = BLOCK_SIZE,
+        train_batch_size      = BATCH_SIZE,
+        eval_batch_size       = max(1, BATCH_SIZE // 2),
+        max_eval_docs_per_dom = MAX_EVAL_DOCS_DOM,
+        num_workers           = 4,
+        seed                  = SEED,
+    )
+    # ── Résumé ────────────────────────────────────────────────────
+    if is_main():
+        raw = model.module if isinstance(model, DDP) else model
+        # comptage sans modules compilés
+        try:
+            n_params = sum(p.numel() for p in raw.parameters() if p.requires_grad)
+        except Exception:
+            n_params = -1
+        eff_batch = BATCH_SIZE * GRAD_ACCUM_STEPS * get_world_size()
+        print(f"Paramètres    : {n_params/1e6:.1f}M" if n_params > 0 else "Paramètres    : N/A (compilé)")
+        print(f"Architecture  : d={D_MODEL} | heads={N_HEADS} | layers={N_LAYERS} | ctx={BLOCK_SIZE}")
+        print(f"Batch effectif: {eff_batch} séq × {BLOCK_SIZE} tok = {eff_batch*BLOCK_SIZE:,} tok/step")
+        print(f"Dtype         : {DTYPE} | Steps : {MAX_STEPS} | Warmup : {WARMUP_STEPS}")
+        print("=" * 60)
+    # ── Boucle d'entraînement ─────────────────────────────────────
+    model.train()
+    optimizer.zero_grad(set_to_none=True)
+    train_iter      = iter(train_loader)
+    step            = start_step
+    t0              = time.time()
+    log_loss_sum    = 0.0
+    log_loss_n      = 0
+    tokens_log      = 0
+    last_log        = time.time()
+    while step < MAX_STEPS:
+        # ── gradient accumulation ──────────────────────────────────
+        for micro in range(GRAD_ACCUM_STEPS):
+            try:
+                batch = next(train_iter)
+            except StopIteration:
+                # ← CORRECTIF : relance l'itérateur proprement
+                train_iter = iter(train_loader)
+                try:
+                    batch = next(train_iter)
+                except StopIteration:
+                    print("[WARN] Dataset entièrement épuisé avant MAX_STEPS.")
+                    break
+            inp = batch["input_ids"].to(device, non_blocking=True)
+            lbl = batch["labels"].to(device, non_blocking=True)
+            with torch.autocast("cuda", dtype=DTYPE):
+                _, loss = model(inp, lbl)
+            (loss / GRAD_ACCUM_STEPS).backward()
+            log_loss_sum += loss.item()
+            log_loss_n   += 1
+            tokens_log   += inp.numel()
+        # ── optimizer step ────────────────────────────────────────
+        lr = cosine_lr(step)
+        for g in optimizer.param_groups:
+            g["lr"] = lr
+        torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
+        optimizer.step()
+        optimizer.zero_grad(set_to_none=True)
+        step += 1
+        # ── logging toutes les 50 steps ───────────────────────────
+        if step % 50 == 0 and is_main():
+            now     = time.time()
+            elapsed = max(1e-6, now - last_log)
+            tok_s   = tokens_log / elapsed
+            avg_l   = log_loss_sum / max(1, log_loss_n)
+            ppl     = math.exp(min(avg_l, 20))
+            print(
+                f"step {step:5d}/{MAX_STEPS} | "
+                f"loss={avg_l:.4f} | ppl={ppl:.1f} | "
+                f"lr={lr:.2e} | {tok_s:,.0f} tok/s"
+            )
+            log_jsonl(LOG_FILE, {
+                "step": step, "loss": avg_l, "ppl": ppl,
+                "lr": lr, "tok_s": tok_s, "time": now - t0,
+            })
+            last_log     = now
+            tokens_log   = 0
+            log_loss_sum = 0.0
+            log_loss_n   = 0
+        # ── évaluation ────────────────────────────────────────────
+        if step % EVAL_EVERY == 0 and is_main():
+            val_loss, per_dom = evaluate(model, eval_loader, device)
+            ppl_val = math.exp(min(val_loss, 20))
+            print(f"\n[eval] step {step} | val_loss={val_loss:.4f} | ppl={ppl_val:.1f}")
+            print("  Perplexité par domaine :")
+            for dom, dl in sorted(per_dom.items(), key=lambda x: -x[1]):
+                print(f"    {dom:<25} loss={dl:.4f}  ppl={math.exp(min(dl,20)):.1f}")
+            print()
+            log_jsonl(LOG_FILE, {
+                "step": step, "val_loss": val_loss, "val_ppl": ppl_val,
+                "per_domain": per_dom,
+            })
+            if val_loss < best_eval:
+                best_eval = val_loss
+                save_checkpoint(model, optimizer, step, best_eval, BEST_MODEL_FILE)
+                print(f"  ✓ Meilleur modèle → {BEST_MODEL_FILE}\n")
+        # ── checkpoint périodique ─────────────────────────────────
+        if step % SAVE_EVERY == 0 and is_main():
+            save_checkpoint(model, optimizer, step, best_eval, STATE_FILE)
+            save_checkpoint(model, optimizer, step, best_eval, MODEL_FILE)
+            print(f"  ✓ Checkpoint step {step} → {MODEL_FILE}")
+    # ── Fin ───────────────────────────────────────────────────────
+    if is_main():
+        save_checkpoint(model, optimizer, step, best_eval, MODEL_FILE)
+        save_checkpoint(model, optimizer, step, best_eval, STATE_FILE)
+        total_min = (time.time() - t0) / 60
+        print(f"\n{'='*60}")
+        print(f"Modèle final   → {MODEL_FILE}")
+        print(f"Meilleur modèle→ {BEST_MODEL_FILE}")
+        print(f"Steps réalisés : {step}")
+        print(f"Temps total    : {total_min:.1f} min")
+        print(f"{'='*60}")
+    if is_distributed():
+        dist.destroy_process_group()
+# ══════════════════════════════════════════════════════════════════
+# §12  UTILS
+# ══════════════════════════════════════════════════════════════════
+def set_seed_fn(seed: int) -> None:
+    random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+# Script entry point: run the training job only when executed directly.
+if __name__ == "__main__":
+    main()

train_nlp_h100_maxvram_v6.py ADDED Viewed

	@@ -0,0 +1,1046 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+train_nlp_h100_maxvram.py  — v6 (démarrage sûr sur H100 80G)
+============================================================
+Corrections principales vs v5
+-----------------------------
+1. Le log v5 montre que, même sans cap VRAM logiciel,
+   BATCH_SIZE=32 provoque un vrai OOM matériel dès le premier
+   forward compilé sur H100 80G.
+2. On abaisse donc le réglage de départ à :
+     BATCH_SIZE = 28
+   pour démarrer dans une zone sûre, puis remonter ensuite si le
+   log montre encore de la marge.
+3. Le cap logiciel reste désactivé par défaut :
+     TARGET_VRAM_GIB = None
+   afin d'éviter tout faux OOM dû à PyTorch.
+4. Le mode torch.compile reste sur :
+     COMPILE_MODE = "max-autotune-no-cudagraphs"
+   qui garde un bon compromis perf / mémoire sans surcoût CUDA graphs.
+5. BASE_CHECKPOINT est chargé AVANT l'injection LoRA,
+   et l'estimation des steps reste corrigée pour le scheduler LR.
+Conseils de réglage
+-------------------
+- Démarre avec :
+    BATCH_SIZE = 28
+    TARGET_VRAM_GIB = None
+    COMPILE_MODE = "max-autotune-no-cudagraphs"
+- Si stable et max_reserved < 72 GiB après quelques logs :
+    BATCH_SIZE += 2
+- Si vrai OOM matériel :
+    BATCH_SIZE -= 2 puis relance
+- Si tu veux encore plus de marge au premier essai :
+    BATCH_SIZE = 24
+"""
+from __future__ import annotations
+import itertools
+import json
+import math
+import os
+import random
+import time
+from collections import OrderedDict
+from contextlib import nullcontext
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Iterator, Optional
+# A définir AVANT import torch
+os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+import torch.nn.functional as F
+try:
+    import bitsandbytes as bnb
+    HAS_BNB = True
+except ImportError:
+    HAS_BNB = False
+    print("[warn] bitsandbytes non disponible")
+try:
+    from flash_attn import flash_attn_func
+    HAS_FLASH = True
+except ImportError:
+    HAS_FLASH = False
+    print("[warn] flash-attn non disponible – fallback SDPA (toujours fusionné sur H100)")
+from datasets import load_dataset
+from torch.nn.parallel import DistributedDataParallel as DDP
+from tokenizers import (
+    Tokenizer, decoders, models, normalizers,
+    pre_tokenizers, processors, trainers,
+)
+from transformers import PreTrainedTokenizerFast
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  CHEMINS                                                                    ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+# Output locations. The output directory is created at import time.
+OUT_DIR         = Path("./nlp_1b_h100_maxvram")
+OUT_DIR.mkdir(parents=True, exist_ok=True)
+TOKENIZER_DIR   = OUT_DIR / "tokenizer_32k"
+CONFIG_FILE     = OUT_DIR / "config.json"
+MODEL_FILE      = OUT_DIR / "model.pt"
+BEST_MODEL_FILE = OUT_DIR / "model_best.pt"
+STATE_FILE      = OUT_DIR / "train_state.pt"
+# Optional base checkpoint; per the module docstring it is loaded BEFORE LoRA injection.
+BASE_CHECKPOINT: Optional[Path] = None
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  HYPERPARAMÈTRES — H100                                                     ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+SEED            = 42
+TARGET_VRAM_GIB = None   # None = no software cap; avoids false OOMs caused by the PyTorch cap
+# ── Architecture ~1B ──────────────────────────────────────────────────────────
+BLOCK_SIZE = 2048
+VOCAB_SIZE = 32_000
+D_MODEL    = 1536
+N_HEADS    = 24
+N_LAYERS   = 24
+D_FF       = 6144
+DROPOUT    = 0.0
+# ── LoRA / "QLoRA" if bitsandbytes is available on the optimizer side ─────────
+USE_QLORA           = True
+LORA_R              = 64
+LORA_ALPHA          = 128
+LORA_DROPOUT        = 0.05
+LORA_TARGET_MODULES = ["qkv", "proj", "w1", "w2", "w3"]
+# ── Training ──────────────────────────────────────────────────────────────────
+NUM_EPOCHS    = 10
+LEARNING_RATE = 3e-4
+MIN_LR        = 3e-5
+WEIGHT_DECAY  = 0.1
+WARMUP_STEPS  = 500
+# Recommended starting setting (see the tuning advice in the module docstring)
+BATCH_SIZE       = 28
+GRAD_ACCUM_STEPS = 1
+MAX_GRAD_NORM    = 1.0
+EVAL_EVERY       = 500
+SAVE_EVERY       = 1_000
+DTYPE = torch.bfloat16
+# Compile: variant chosen to be more robust at start-up
+USE_CHECKPOINTING = False
+USE_COMPILE       = True
+COMPILE_MODE      = "max-autotune-no-cudagraphs"
+# ── DataLoader ────────────────────────────────────────────────────────────────
+TRAIN_NUM_WORKERS = 8
+EVAL_NUM_WORKERS  = 4
+PREFETCH_FACTOR   = 4
+# ── Text ──────────────────────────────────────────────────────────────────────
+TOKENIZER_SAMPLE_DOCS_PER_SOURCE = 15_000
+TOKENIZER_CHAR_LIMIT             = 2_000
+TEXT_CHAR_LIMIT                  = 8_000
+# The order matters: the four names below are unpacked positionally from this list.
+SPECIAL_TOKENS = ["<pad>", "<bos>", "<eos>", "<unk>"]
+PAD_TOKEN, BOS_TOKEN, EOS_TOKEN, UNK_TOKEN = SPECIAL_TOKENS
+WIKI_CONFIGS               = ["20231101.en", "20231101.fr", "20231101.ar"]
+FINEWEB_CONFIG             = "sample-10BT"
+DEV_DOCS_PER_WIKI_CONFIG   = 1_500
+DEV_DOCS_FINEWEB           = 3_000
+TRAIN_DOCS_PER_WIKI_CONFIG = 30_000
+TRAIN_DOCS_FINEWEB         = 60_000
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  DISTRIBUTED                                                                ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+def is_distributed() -> bool:
+    """Return True when torch.distributed is available and a process group is initialized."""
+    if not dist.is_available():
+        return False
+    return dist.is_initialized()
+def get_rank() -> int:
+    """Return this process's global rank, or 0 when not running distributed."""
+    if is_distributed():
+        return dist.get_rank()
+    return 0
+def get_world_size() -> int:
+    """Return the number of processes in the group, or 1 when not running distributed."""
+    if is_distributed():
+        return dist.get_world_size()
+    return 1
+def is_main() -> bool:
+    """Return True on rank 0, which is also the only rank in a non-distributed run."""
+    rank = get_rank()
+    return rank == 0
+def init_distributed() -> Optional[torch.device]:
+    """
+    Initialize the NCCL process group when the LOCAL_RANK environment variable is set.
+
+    Returns the CUDA device bound to the local rank, or None when LOCAL_RANK is
+    absent (or -1), in which case no process group is created.
+    """
+    local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if local_rank != -1:
+        dist.init_process_group("nccl")
+        torch.cuda.set_device(local_rank)
+        return torch.device(f"cuda:{local_rank}")
+    return None
+def set_seed(seed: int) -> None:
+    """Seed Python's `random`, torch's CPU generator and, when CUDA is available, all CUDA generators."""
+    random.seed(seed)
+    torch.manual_seed(seed)
+    if not torch.cuda.is_available():
+        return
+    torch.cuda.manual_seed_all(seed)
+def get_device(ddp=None) -> torch.device:
+    """
+    Choose the device to run on.
+
+    `ddp` is returned unchanged when it is not None; otherwise the current CUDA
+    device is used when CUDA is available, and the CPU as a last resort.
+    """
+    if ddp is not None:
+        return ddp
+    if not torch.cuda.is_available():
+        return torch.device("cpu")
+    active = torch.cuda.current_device()
+    return torch.device(f"cuda:{active}")
+def current_cuda_index(device: torch.device) -> int:
+    """Return `device.index`, falling back to the current CUDA device when the index is None."""
+    if device.index is None:
+        return torch.cuda.current_device()
+    return device.index
+def autocast_context(device: torch.device):
+    """Return a CUDA autocast context using DTYPE, or a no-op context for non-CUDA devices."""
+    if device.type != "cuda":
+        return nullcontext()
+    return torch.autocast("cuda", dtype=DTYPE)
+def unwrap_model(model: nn.Module) -> nn.Module:
+    """
+    Strip the DDP wrapper, then the torch.compile wrapper, and return the inner module.
+
+    A DDP instance is replaced by its `.module`; if the result exposes `_orig_mod`
+    (set by torch.compile), that original module is returned instead.
+    """
+    inner = model
+    if isinstance(inner, DDP):
+        inner = inner.module
+    return getattr(inner, "_orig_mod", inner)
+def count_parameters(model: nn.Module, trainable_only: bool = True) -> int:
+    """
+    Count the scalar parameters of `model`.
+
+    With `trainable_only=True` only parameters with `requires_grad` are counted;
+    otherwise every parameter is counted.
+    """
+    total = 0
+    for param in model.parameters():
+        if trainable_only and not param.requires_grad:
+            continue
+        total += param.numel()
+    return total
+def normalize_state_dict_keys(sd: dict) -> OrderedDict:
+    """
+    Return a copy of `sd` whose keys have the DDP / torch.compile wrapper prefixes removed.
+
+    Leading "module." and "_orig_mod." prefixes are stripped repeatedly, in any
+    order, so keys saved from DDP(compile(m)), compile(DDP(m)) or a doubly
+    wrapped model all map back to the bare parameter names. Key order and
+    values are preserved.
+    """
+    wrapper_prefixes = ("module.", "_orig_mod.")
+    out = OrderedDict()
+    for key, value in sd.items():
+        stripped = True
+        # Keep peeling until no wrapper prefix is left at the front of the key.
+        while stripped:
+            stripped = False
+            for prefix in wrapper_prefixes:
+                if key.startswith(prefix):
+                    key = key[len(prefix):]
+                    stripped = True
+        out[key] = value
+    return out
+def normalize_text(t: str) -> str:
+    """Collapse every run of whitespace into a single space and trim both ends."""
+    words = t.split()
+    return " ".join(words)
+def safe_str(x) -> str:
+    """Return `x` unchanged when it is a str, "" when it is None, and `str(x)` otherwise."""
+    if isinstance(x, str):
+        return x
+    if x is None:
+        return ""
+    return str(x)
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  DATASETS                                                                   ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+def load_wiki_stream(cfg_name: str):
+    """Open the "train" split of wikimedia/wikipedia for configuration `cfg_name` in streaming mode."""
+    return load_dataset(
+        "wikimedia/wikipedia",
+        cfg_name,
+        split="train",
+        streaming=True,
+    )
+def load_fineweb_stream():
+    """Open the "train" split of HuggingFaceFW/fineweb-edu (configuration FINEWEB_CONFIG) in streaming mode."""
+    return load_dataset(
+        "HuggingFaceFW/fineweb-edu",
+        FINEWEB_CONFIG,
+        split="train",
+        streaming=True,
+    )
+def stream_texts(ds, start: int, count: int, char_limit: int) -> Iterator[str]:
+    """
+    Yield normalized texts from rows [start, start + count) of `ds`.
+
+    Each row's "text" field is coerced to str and whitespace-normalized; results
+    shorter than 20 characters are skipped, the others are truncated to
+    `char_limit` characters.
+    """
+    window = itertools.islice(ds, start, start + count)
+    for record in window:
+        cleaned = normalize_text(safe_str(record.get("text", "")))
+        if len(cleaned) < 20:
+            continue
+        yield cleaned[:char_limit]
+def tokenizer_training_iterator() -> Iterator[str]:
+    """
+    Yield the tokenizer training corpus.
+
+    Takes the first TOKENIZER_SAMPLE_DOCS_PER_SOURCE documents of every Wikipedia
+    configuration (in WIKI_CONFIGS order), then of FineWeb-Edu, each truncated to
+    TOKENIZER_CHAR_LIMIT characters. Streams are opened lazily, one at a time.
+    """
+    openers = [(lambda cfg=cfg_name: load_wiki_stream(cfg)) for cfg_name in WIKI_CONFIGS]
+    openers.append(load_fineweb_stream)
+    for open_stream in openers:
+        yield from stream_texts(
+            open_stream(),
+            0,
+            TOKENIZER_SAMPLE_DOCS_PER_SOURCE,
+            TOKENIZER_CHAR_LIMIT,
+        )
+def build_epoch_train_texts(epoch: int) -> list[str]:
+    """
+    Materialize the training texts for `epoch`.
+
+    Each epoch reads a fresh window of every stream: the window starts after the
+    documents reserved for evaluation and advances by one full window per epoch.
+    The combined list is shuffled deterministically with seed SEED + epoch.
+    """
+    wiki_offset = DEV_DOCS_PER_WIKI_CONFIG + epoch * TRAIN_DOCS_PER_WIKI_CONFIG
+    fineweb_offset = DEV_DOCS_FINEWEB + epoch * TRAIN_DOCS_FINEWEB
+    collected: list[str] = []
+    for cfg_name in WIKI_CONFIGS:
+        wiki_docs = stream_texts(
+            load_wiki_stream(cfg_name), wiki_offset, TRAIN_DOCS_PER_WIKI_CONFIG, TEXT_CHAR_LIMIT
+        )
+        collected.extend(wiki_docs)
+    fineweb_docs = stream_texts(
+        load_fineweb_stream(), fineweb_offset, TRAIN_DOCS_FINEWEB, TEXT_CHAR_LIMIT
+    )
+    collected.extend(fineweb_docs)
+    shuffler = random.Random(SEED + epoch)
+    shuffler.shuffle(collected)
+    return collected
+def build_eval_texts() -> list[str]:
+    """
+    Materialize the evaluation texts.
+
+    Uses the first DEV_DOCS_PER_WIKI_CONFIG documents of every Wikipedia
+    configuration followed by the first DEV_DOCS_FINEWEB FineWeb-Edu documents,
+    without shuffling.
+    """
+    collected: list[str] = []
+    for cfg_name in WIKI_CONFIGS:
+        wiki_docs = stream_texts(load_wiki_stream(cfg_name), 0, DEV_DOCS_PER_WIKI_CONFIG, TEXT_CHAR_LIMIT)
+        collected.extend(wiki_docs)
+    fineweb_docs = stream_texts(load_fineweb_stream(), 0, DEV_DOCS_FINEWEB, TEXT_CHAR_LIMIT)
+    collected.extend(fineweb_docs)
+    return collected
+def estimate_steps_per_epoch(
+    texts: list[str],
+    tokenizer: PreTrainedTokenizerFast,
+    block_size: int,
+    batch_size: int,
+    sample_size: int = 512,
+) -> int:
+    """
+    Estimate the number of optimizer steps in one epoch, for the LR scheduler.
+
+    Tokenizes a random sample of at most `sample_size` texts (seeded with SEED),
+    counts len(ids) + 2 tokens per non-empty text (BOS/EOS), and extrapolates to
+    the whole list. One packed example consumes block_size + 1 tokens and one
+    step consumes batch_size examples on each rank. Always returns at least 1.
+    """
+    if not texts:
+        return 1
+    if len(texts) > sample_size:
+        probe = random.Random(SEED).sample(texts, sample_size)
+    else:
+        # Constructing the generator has no side effect, so it is skipped here.
+        probe = texts
+    token_total = 0
+    non_empty = 0
+    for text in probe:
+        encoded = tokenizer.encode(text, add_special_tokens=False)
+        if not encoded:
+            continue
+        token_total += len(encoded) + 2
+        non_empty += 1
+    mean_tokens = token_total / max(1, non_empty)
+    epoch_tokens = mean_tokens * len(texts)
+    step_tokens = (block_size + 1) * batch_size * get_world_size()
+    return max(1, int(epoch_tokens // max(1, step_tokens)))
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  PACKED DATASET                                                             ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+class PackedTextList(torch.utils.data.IterableDataset):
+    """
+    Dense token packing without padding.
+
+    Texts are visited in an order shuffled with `epoch_seed`, sharded across
+    ranks and DataLoader workers, wrapped in BOS/EOS and concatenated. Every
+    block_size + 1 buffered tokens produce one example. Combined with
+    drop_last=True in the DataLoader this gives constant shapes, which helps
+    torch.compile.
+    """
+    def __init__(self, texts, tokenizer, block_size, epoch_seed=0):
+        super().__init__()
+        self.texts, self.tokenizer = texts, tokenizer
+        self.block_size, self.epoch_seed = block_size, epoch_seed
+    def __iter__(self):
+        info = torch.utils.data.get_worker_info()
+        rank, world = get_rank(), get_world_size()
+        # Outside a worker process the dataset behaves like a single worker with id 0.
+        n_workers = 1 if info is None else info.num_workers
+        worker_id = 0 if info is None else info.id
+        n_shards = n_workers * world
+        my_shard = rank * n_workers + worker_id
+        order = list(range(len(self.texts)))
+        random.Random(self.epoch_seed).shuffle(order)
+        bos_id = self.tokenizer.bos_token_id
+        eos_id = self.tokenizer.eos_token_id
+        window = self.block_size + 1
+        pending: list[int] = []
+        for position, text_idx in enumerate(order):
+            if position % n_shards != my_shard:
+                continue
+            token_ids = self.tokenizer.encode(self.texts[text_idx], add_special_tokens=False)
+            if not token_ids:
+                continue
+            pending.append(bos_id)
+            pending.extend(token_ids)
+            pending.append(eos_id)
+            while len(pending) >= window:
+                chunk = pending[:window]
+                del pending[:window]
+                # Inputs and labels are the same window shifted by one token.
+                yield {
+                    "input_ids": torch.tensor(chunk[:-1], dtype=torch.long),
+                    "labels":    torch.tensor(chunk[1:],  dtype=torch.long),
+                }
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  TOKENIZER                                                                  ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+def tokenizer_ready() -> bool:
+    """Return True when both tokenizer.json and tokenizer_config.json exist in TOKENIZER_DIR."""
+    required = ("tokenizer.json", "tokenizer_config.json")
+    return all((TOKENIZER_DIR / filename).exists() for filename in required)
+def train_tokenizer_once() -> None:
+    """
+    Train a byte-level BPE tokenizer and save it into TOKENIZER_DIR.
+
+    The tokenizer uses NFKC normalization, a ByteLevel pre-tokenizer/decoder and
+    a post-processor that wraps sequences in BOS/EOS. It is written first as a
+    raw tokenizer.json, then re-saved through PreTrainedTokenizerFast so that
+    the special-token configuration is stored alongside it.
+    """
+    TOKENIZER_DIR.mkdir(parents=True, exist_ok=True)
+    json_path = str(TOKENIZER_DIR / "tokenizer.json")
+    bpe = Tokenizer(models.BPE(unk_token=UNK_TOKEN))
+    bpe.normalizer    = normalizers.Sequence([normalizers.NFKC()])
+    bpe.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
+    bpe.decoder       = decoders.ByteLevel()
+    bpe_trainer = trainers.BpeTrainer(
+        vocab_size=VOCAB_SIZE,
+        min_frequency=2,
+        show_progress=is_main(),
+        special_tokens=SPECIAL_TOKENS,
+        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
+    )
+    bpe.train_from_iterator(tokenizer_training_iterator(), trainer=bpe_trainer)
+    # Ids are only known after training, hence the post-processor is attached here.
+    special_ids = [
+        (BOS_TOKEN, bpe.token_to_id(BOS_TOKEN)),
+        (EOS_TOKEN, bpe.token_to_id(EOS_TOKEN)),
+    ]
+    bpe.post_processor = processors.TemplateProcessing(
+        single=f"{BOS_TOKEN} $A {EOS_TOKEN}",
+        pair=f"{BOS_TOKEN} $A {EOS_TOKEN} $B:1 {EOS_TOKEN}:1",
+        special_tokens=special_ids,
+    )
+    bpe.save(json_path)
+    wrapped = PreTrainedTokenizerFast(
+        tokenizer_file=json_path,
+        bos_token=BOS_TOKEN,
+        eos_token=EOS_TOKEN,
+        unk_token=UNK_TOKEN,
+        pad_token=PAD_TOKEN,
+    )
+    wrapped.save_pretrained(str(TOKENIZER_DIR))
+def train_or_load_tokenizer() -> PreTrainedTokenizerFast:
+    """
+    Load the tokenizer from TOKENIZER_DIR, training it first when it is missing.
+
+    In a distributed run only rank 0 trains, and EVERY rank then waits on a
+    barrier. The barrier is unconditional on purpose: if each rank decided from
+    its own filesystem check, a rank that looked after rank 0 had already
+    written the files would skip the barrier and desynchronize the collectives.
+
+    Returns:
+        The tokenizer loaded with PreTrainedTokenizerFast.from_pretrained.
+    """
+    TOKENIZER_DIR.mkdir(parents=True, exist_ok=True)
+    if is_distributed():
+        if is_main() and not tokenizer_ready():
+            print("Entraînement tokenizer 32k…")
+            train_tokenizer_once()
+        # All ranks meet here, whether or not training was needed.
+        dist.barrier()
+    elif not tokenizer_ready():
+        print("Entraînement tokenizer 32k…")
+        train_tokenizer_once()
+    return PreTrainedTokenizerFast.from_pretrained(str(TOKENIZER_DIR))
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  MODÈLE GPT                                                                 ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+@dataclass
+class GPTConfig:
+    """Architecture settings of the GPT model; defaults come from the module-level constants."""
+    # Number of rows of the token embedding / output projection.
+    vocab_size: int = VOCAB_SIZE
+    # Sequence length used when packing training examples.
+    block_size: int = BLOCK_SIZE
+    # Width of the residual stream.
+    d_model: int = D_MODEL
+    # Attention heads; must divide d_model.
+    n_heads: int = N_HEADS
+    # Number of transformer blocks.
+    n_layers: int = N_LAYERS
+    # Hidden width of the SwiGLU feed-forward.
+    d_ff: int = D_FF
+    # Attention dropout probability (applied in training mode only).
+    dropout: float = DROPOUT
+    # Recompute block activations during backward when training.
+    use_checkpointing: bool = USE_CHECKPOINTING
+class RMSNorm(nn.Module):
+    """Root-mean-square normalization over the last dimension with a learned per-feature scale."""
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(dim))
+        self.eps    = eps
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        mean_square = x.pow(2).mean(-1, keepdim=True)
+        inv_rms = torch.rsqrt(mean_square + self.eps)
+        # Same multiplication order as (weight * x) * inv_rms to keep rounding identical.
+        scaled = self.weight * x
+        return scaled * inv_rms
+class RotaryEmbedding(nn.Module):
+    """
+    Precomputed rotary position tables.
+
+    The cos/sin tables have shape (max_seq, dim); each frequency is repeated on
+    two adjacent columns, matching the interleaved pairing of `rotate_half`.
+    All buffers are non-persistent, so they are not stored in state dicts.
+    """
+    def __init__(self, dim: int, base: int = 10_000, max_seq: int = 8_192):
+        super().__init__()
+        exponents = torch.arange(0, dim, 2).float() / dim
+        inv_freq = 1.0 / (base ** exponents)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        positions = torch.arange(max_seq).float()
+        angles = torch.outer(positions, inv_freq)
+        cos_table = torch.repeat_interleave(angles.cos(), 2, dim=-1)
+        sin_table = torch.repeat_interleave(angles.sin(), 2, dim=-1)
+        self.register_buffer("cos_cache", cos_table, persistent=False)
+        self.register_buffer("sin_cache", sin_table, persistent=False)
+    def forward(self, seq_len: int, dtype: torch.dtype):
+        """Return the first `seq_len` rows of the (cos, sin) tables cast to `dtype`."""
+        cos = self.cos_cache[:seq_len].to(dtype)
+        sin = self.sin_cache[:seq_len].to(dtype)
+        return cos, sin
+def rotate_half(x: torch.Tensor) -> torch.Tensor:
+    """Map each adjacent pair (a, b) along the last dimension to (-b, a)."""
+    even = x[..., 0::2]
+    odd = x[..., 1::2]
+    pairs = torch.stack([-odd, even], dim=-1)
+    return pairs.flatten(start_dim=-2)
+def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
+    """
+    Apply rotary position embedding to `x`.
+
+    `cos` and `sin` get two leading singleton dimensions so that they broadcast
+    against the leading dimensions of `x`.
+    """
+    cos_b = cos.unsqueeze(0).unsqueeze(0)
+    sin_b = sin.unsqueeze(0).unsqueeze(0)
+    return x * cos_b + rotate_half(x) * sin_b
+class CausalSelfAttention(nn.Module):
+    """
+    Causal multi-head self-attention with rotary position embeddings.
+
+    Uses Flash Attention 2 when available, otherwise PyTorch SDPA.
+    Input and output are (batch, seq, d_model), as fixed by the `view` calls below.
+    """
+    def __init__(self, cfg: GPTConfig):
+        super().__init__()
+        assert cfg.d_model % cfg.n_heads == 0
+        self.n_heads   = cfg.n_heads
+        self.head_dim  = cfg.d_model // cfg.n_heads
+        self.qkv       = nn.Linear(cfg.d_model, 3 * cfg.d_model, bias=False)
+        self.proj      = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
+        self.dropout_p = cfg.dropout
+        self.rope      = RotaryEmbedding(self.head_dim)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        b, t, c = x.shape
+        q, k, v = self.qkv(x).split(c, dim=-1)
+        # (b, t, c) -> (b, n_heads, t, head_dim)
+        q = q.view(b, t, self.n_heads, self.head_dim).transpose(1, 2)
+        k = k.view(b, t, self.n_heads, self.head_dim).transpose(1, 2)
+        v = v.view(b, t, self.n_heads, self.head_dim).transpose(1, 2)
+        cos, sin = self.rope(t, x.dtype)
+        q, k = apply_rope(q, cos, sin), apply_rope(k, cos, sin)
+        # Dropout is applied in training mode only.
+        attn_dropout = self.dropout_p if self.training else 0.0
+        if HAS_FLASH:
+            # Under autocast `x` (hence cos/sin) can be fp32 while the qkv
+            # projection is bf16, so the RoPE multiply promotes q/k to fp32.
+            # flash-attn only accepts fp16/bf16 with q, k, v sharing one dtype,
+            # so q/k are cast back to v's dtype. flash-attn also expects
+            # (b, t, n_heads, head_dim), hence the transposes.
+            q = q.transpose(1, 2).to(v.dtype)
+            k = k.transpose(1, 2).to(v.dtype)
+            v = v.transpose(1, 2)
+            y = flash_attn_func(
+                q,
+                k,
+                v,
+                dropout_p=attn_dropout,
+                causal=True,
+            )
+            y = y.reshape(b, t, c)
+        else:
+            y = F.scaled_dot_product_attention(
+                q,
+                k,
+                v,
+                dropout_p=attn_dropout,
+                is_causal=True,
+            )
+            y = y.transpose(1, 2).contiguous().view(b, t, c)
+        return self.proj(y)
+class SwiGLU(nn.Module):
+    """Gated feed-forward: w3(silu(w1(x)) * w2(x)), all projections without bias."""
+    def __init__(self, cfg: GPTConfig):
+        super().__init__()
+        width, hidden = cfg.d_model, cfg.d_ff
+        self.w1 = nn.Linear(width, hidden, bias=False)
+        self.w2 = nn.Linear(width, hidden, bias=False)
+        self.w3 = nn.Linear(hidden, width, bias=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate = F.silu(self.w1(x))
+        value = self.w2(x)
+        return self.w3(gate * value)
+class Block(nn.Module):
+    """Pre-norm transformer block: attention and feed-forward, each behind RMSNorm with a residual."""
+    def __init__(self, cfg: GPTConfig):
+        super().__init__()
+        self.ln1  = RMSNorm(cfg.d_model)
+        self.attn = CausalSelfAttention(cfg)
+        self.ln2  = RMSNorm(cfg.d_model)
+        self.ff   = SwiGLU(cfg)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        attended = self.attn(self.ln1(x))
+        x = x + attended
+        transformed = self.ff(self.ln2(x))
+        return x + transformed
+class GPT(nn.Module):
+    """
+    Decoder-only transformer language model.
+
+    The output projection shares its weight tensor with the token embedding.
+    `forward` returns (logits, loss); loss is None when no labels are given.
+    """
+    def __init__(self, cfg: GPTConfig):
+        super().__init__()
+        self.cfg     = cfg
+        self.tok_emb = nn.Embedding(cfg.vocab_size, cfg.d_model)
+        layers = []
+        for _ in range(cfg.n_layers):
+            layers.append(Block(cfg))
+        self.blocks  = nn.ModuleList(layers)
+        self.ln_f    = RMSNorm(cfg.d_model)
+        self.lm_head = nn.Linear(cfg.d_model, cfg.vocab_size, bias=False)
+        # Weight tying: must happen before the init pass below.
+        self.lm_head.weight = self.tok_emb.weight
+        self.apply(self._init_weights)
+    @staticmethod
+    def _init_weights(m: nn.Module) -> None:
+        """Initialize Linear/Embedding weights from N(0, 0.02) and zero any Linear bias."""
+        if not isinstance(m, (nn.Linear, nn.Embedding)):
+            return
+        nn.init.normal_(m.weight, 0.0, 0.02)
+        if isinstance(m, nn.Linear) and m.bias is not None:
+            nn.init.zeros_(m.bias)
+    def forward(self, input_ids: torch.Tensor, labels: Optional[torch.Tensor] = None):
+        hidden = self.tok_emb(input_ids)
+        # Activation checkpointing is only used while training.
+        recompute = self.cfg.use_checkpointing and self.training
+        for layer in self.blocks:
+            if recompute:
+                hidden = torch.utils.checkpoint.checkpoint(layer, hidden, use_reentrant=False)
+            else:
+                hidden = layer(hidden)
+        logits = self.lm_head(self.ln_f(hidden))
+        if labels is None:
+            return logits, None
+        # Positions labelled -100 are excluded from the loss.
+        loss = F.cross_entropy(
+            logits.reshape(-1, logits.size(-1)),
+            labels.reshape(-1),
+            ignore_index=-100,
+        )
+        return logits, loss
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  LoRA                                                                       ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+class LoRALinear(nn.Module):
+    """
+    Wrap a frozen nn.Linear with a trainable low-rank update.
+
+    Output is base(x) + lora_B(lora_A(dropout(x))) * (alpha / r). lora_B starts
+    at zero, so the wrapped layer initially reproduces the base layer. The
+    adapters are created on the same device as the base layer's parameters.
+    """
+    def __init__(self, base_layer: nn.Linear, r=LORA_R, alpha=LORA_ALPHA, dropout=LORA_DROPOUT):
+        super().__init__()
+        self.base  = base_layer
+        self.r     = r
+        self.scale = alpha / r
+        first_param = next(base_layer.parameters(), None)
+        target_device = torch.device("cpu") if first_param is None else first_param.device
+        self.lora_A = nn.Linear(base_layer.in_features, r, bias=False, device=target_device)
+        self.lora_B = nn.Linear(r, base_layer.out_features, bias=False, device=target_device)
+        self.drop   = nn.Dropout(dropout)
+        nn.init.kaiming_uniform_(self.lora_A.weight, a=math.sqrt(5))
+        nn.init.zeros_(self.lora_B.weight)
+        # Only the adapters are trained; the wrapped layer is frozen.
+        self.base.requires_grad_(False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        frozen_out = self.base(x)
+        low_rank = self.lora_B(self.lora_A(self.drop(x)))
+        return frozen_out + low_rank * self.scale
+def apply_qlora(model: GPT, device: torch.device) -> GPT:
+    """
+    Replace every targeted nn.Linear of `model` with a LoRALinear wrapper, in place.
+
+    A layer is targeted when the last component of its module path is listed in
+    LORA_TARGET_MODULES. Does nothing when USE_QLORA is False. `device` is only
+    used in the log message printed on the main rank.
+    """
+    if not USE_QLORA:
+        return model
+    # Collect first, replace afterwards: the module tree is not mutated while it is walked.
+    selected = []
+    for path, candidate in model.named_modules():
+        leaf = path.split(".")[-1]
+        if leaf in LORA_TARGET_MODULES and isinstance(candidate, nn.Linear):
+            selected.append((path, candidate))
+    for path, linear in selected:
+        *ancestors, leaf = path.split(".")
+        owner = model
+        for attr in ancestors:
+            owner = getattr(owner, attr)
+        setattr(owner, leaf, LoRALinear(linear))
+    if is_main():
+        print(f"LoRA : {len(selected)} couches remplacées (device={device})")
+    return model
+def freeze_base_weights(model: GPT) -> None:
+    """Make only parameters whose name contains "lora_A" or "lora_B" trainable; freeze the rest."""
+    adapter_tags = ("lora_A", "lora_B")
+    for pname, param in model.named_parameters():
+        param.requires_grad = any(tag in pname for tag in adapter_tags)
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  OPTIMIZER                                                                  ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+def build_optimizer(model: nn.Module) -> torch.optim.Optimizer:
+    """
+    Build the AdamW optimizer over the trainable parameters of `model`.
+
+    Parameters with at least two dimensions and "weight" in their name receive
+    WEIGHT_DECAY; all other trainable parameters receive none. Uses
+    bitsandbytes' PagedAdamW8bit when available, otherwise torch's AdamW
+    (fused when CUDA is available).
+    """
+    with_decay, without_decay = [], []
+    for pname, param in unwrap_model(model).named_parameters():
+        if not param.requires_grad:
+            continue
+        wants_decay = param.ndim >= 2 and "weight" in pname
+        bucket = with_decay if wants_decay else without_decay
+        bucket.append(param)
+    param_groups = [
+        {"params": with_decay,    "weight_decay": WEIGHT_DECAY},
+        {"params": without_decay, "weight_decay": 0.0},
+    ]
+    adam_kwargs = dict(lr=LEARNING_RATE, betas=(0.9, 0.95), eps=1e-8)
+    if not HAS_BNB:
+        return torch.optim.AdamW(param_groups, fused=torch.cuda.is_available(), **adam_kwargs)
+    return bnb.optim.PagedAdamW8bit(param_groups, **adam_kwargs)
+def cosine_lr(step: int, total_steps: int) -> float:
+    """
+    Learning rate at `step`: linear warm-up followed by cosine decay.
+
+    Ramps from 0 to LEARNING_RATE over WARMUP_STEPS, then follows a half cosine
+    down to MIN_LR at `total_steps`; steps beyond `total_steps` stay at MIN_LR.
+    """
+    if step >= WARMUP_STEPS:
+        progress = (step - WARMUP_STEPS) / max(1, total_steps - WARMUP_STEPS)
+        progress = min(1.0, progress)
+        cosine = 1 + math.cos(math.pi * progress)
+        return MIN_LR + 0.5 * (LEARNING_RATE - MIN_LR) * cosine
+    return LEARNING_RATE * step / max(1, WARMUP_STEPS)
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  CHECKPOINT                                                                 ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+def save_checkpoint(model, optimizer, epoch, step, best_loss, path):
+    """
+    Atomically write a training checkpoint to `path`.
+
+    The payload holds the unwrapped model's state dict (with wrapper prefixes
+    removed), the optimizer state, `epoch`, `step`, `best_loss` and the model
+    config as a dict. It is written to a sibling "<name>.tmp" file and then
+    moved over `path` with os.replace, so an interrupted save never leaves a
+    truncated file where the resume logic expects a valid one.
+
+    Args:
+        path: destination file (str or Path).
+    """
+    raw = unwrap_model(model)
+    payload = {
+        "model": normalize_state_dict_keys(raw.state_dict()),
+        "optimizer": optimizer.state_dict(),
+        "epoch": epoch,
+        "step": step,
+        "best_loss": best_loss,
+        "config": asdict(raw.cfg),
+    }
+    target = Path(path)
+    tmp_path = target.with_name(target.name + ".tmp")
+    try:
+        torch.save(payload, tmp_path)
+        os.replace(tmp_path, target)
+    finally:
+        # After a successful replace the temp file is gone; this only cleans up failures.
+        tmp_path.unlink(missing_ok=True)
+def maybe_load_base_checkpoint(model, device):
+    """
+    Load BASE_CHECKPOINT into `model` when it is configured and the file exists.
+
+    Loading is non-strict; the numbers of missing and unexpected keys are
+    reported on the main rank. Does nothing otherwise.
+    """
+    if BASE_CHECKPOINT is None:
+        return
+    if not Path(BASE_CHECKPOINT).exists():
+        return
+    payload = torch.load(BASE_CHECKPOINT, map_location=device)
+    weights = normalize_state_dict_keys(payload["model"])
+    missing, unexpected = model.load_state_dict(weights, strict=False)
+    if not is_main():
+        return
+    print(f"Base checkpoint chargé depuis {BASE_CHECKPOINT}")
+    if missing:
+        print(f"[warn] missing keys base ckpt: {len(missing)}")
+    if unexpected:
+        print(f"[warn] unexpected keys base ckpt: {len(unexpected)}")
+def load_resume_checkpoint(model, optimizer, path, device):
+    """
+    Restore model and optimizer state from the checkpoint at `path`.
+
+    Model weights are loaded strictly into the unwrapped model. A failure to
+    restore the optimizer state is reported and otherwise ignored.
+
+    Returns:
+        (epoch, step, best_loss), defaulting to (0, 0, 1e9) for absent entries.
+    """
+    payload = torch.load(path, map_location=device)
+    weights = normalize_state_dict_keys(payload["model"])
+    unwrap_model(model).load_state_dict(weights, strict=True)
+    try:
+        optimizer.load_state_dict(payload["optimizer"])
+    except Exception as e:
+        print(f"[warn] Optimizer state non repris: {e}")
+    epoch = int(payload.get("epoch", 0))
+    step = int(payload.get("step", 0))
+    best_loss = float(payload.get("best_loss", 1e9))
+    return epoch, step, best_loss
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  ÉVALUATION                                                                 ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+@torch.no_grad()
+def evaluate(model, loader, device, max_batches=100) -> float:
+    """
+    Return the mean loss of `model` over at most `max_batches` batches of `loader`.
+
+    The model is switched to eval mode for the duration and back to train mode
+    afterwards. When `model` is a DDP wrapper the forward pass goes through the
+    wrapped module directly: the caller evaluates on a single rank, and a DDP
+    forward issues collectives (e.g. buffer broadcasts) that the other ranks
+    would never match.
+
+    Returns 0.0 when the loader yields no batch.
+    """
+    model.eval()
+    # Bypass DDP so that a rank-local evaluation launches no collective.
+    net = model.module if isinstance(model, DDP) else model
+    losses = []
+    for i, batch in enumerate(loader):
+        if i >= max_batches:
+            break
+        inp = batch["input_ids"].to(device, non_blocking=True)
+        lbl = batch["labels"].to(device, non_blocking=True)
+        with autocast_context(device):
+            _, loss = net(inp, lbl)
+        losses.append(loss.item())
+    model.train()
+    return sum(losses) / max(1, len(losses))
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  DATALOADER                                                                 ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+def make_loader(dataset, batch_size, num_workers, is_cuda, drop_last=True):
+    """
+    Build a DataLoader over `dataset`.
+
+    Memory pinning follows `is_cuda`. Persistent workers and prefetching
+    (PREFETCH_FACTOR) are only configured when `num_workers` is positive.
+    """
+    worker_options = {}
+    if num_workers > 0:
+        worker_options["persistent_workers"] = True
+        worker_options["prefetch_factor"] = PREFETCH_FACTOR
+    return torch.utils.data.DataLoader(
+        dataset,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        pin_memory=is_cuda,
+        drop_last=drop_last,
+        **worker_options,
+    )
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  CUDA / LOGGING                                                             ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+def maybe_limit_process_memory(device: torch.device) -> tuple[Optional[int], Optional[float]]:
+    """
+    Optionally cap this process's CUDA memory at TARGET_VRAM_GIB.
+
+    Returns:
+        (cuda_index, fraction): (None, None) for non-CUDA devices,
+        (index, None) when no cap is configured, otherwise the index and the
+        applied fraction of total device memory (at most 0.98).
+    """
+    if device.type != "cuda":
+        return None, None
+    cuda_idx = current_cuda_index(device)
+    if TARGET_VRAM_GIB is not None:
+        _free_bytes, total_bytes = torch.cuda.mem_get_info(cuda_idx)
+        requested = TARGET_VRAM_GIB * (1024**3) / total_bytes
+        vram_fraction = min(requested, 0.98)
+        torch.cuda.memory.set_per_process_memory_fraction(vram_fraction, device=cuda_idx)
+        return cuda_idx, vram_fraction
+    return cuda_idx, None
+def sync_if_cuda(device: torch.device) -> None:
+    """Wait for pending CUDA work on `device`; do nothing for non-CUDA devices."""
+    if device.type != "cuda":
+        return
+    torch.cuda.synchronize(current_cuda_index(device))
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  MAIN                                                                       ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+def main() -> None:
+    """
+    Entry point: set up the run, then train for NUM_EPOCHS epochs.
+
+    Order of operations: distributed init and seeding, CUDA flags, tokenizer,
+    model construction, optional base checkpoint, LoRA injection, torch.compile,
+    DDP wrapping, optimizer, datasets, optional resume from STATE_FILE, training
+    loop with periodic logging / evaluation / checkpointing, final save.
+    Logging, evaluation and checkpoint writes happen on the main rank only.
+    """
+    ddp_device = init_distributed()
+    # Each rank gets a distinct seed.
+    set_seed(SEED + get_rank())
+    device  = get_device(ddp_device)
+    is_cuda = device.type == "cuda"
+    if is_cuda:
+        torch.backends.cuda.matmul.allow_tf32 = True
+        torch.backends.cudnn.allow_tf32       = True
+        torch.set_float32_matmul_precision("high")
+        torch.backends.cudnn.benchmark        = True
+    cuda_idx, vram_fraction = maybe_limit_process_memory(device)
+    if is_main():
+        print("=" * 72)
+        print(" GPT ~1B | H100 MAX VRAM | LoRA + BF16 + TF32 + compile | v4")
+        print("=" * 72)
+        print(f"Device      : {device} | World: {get_world_size()} GPU(s)")
+        print(f"Flash-2     : {HAS_FLASH} | BNB: {HAS_BNB} | LoRA: {USE_QLORA}")
+        print(f"Grad ckpt   : {USE_CHECKPOINTING} | Compile: {USE_COMPILE} ({COMPILE_MODE})")
+        print(f"BLOCK_SIZE  : {BLOCK_SIZE} | BATCH_SIZE: {BATCH_SIZE} | GRAD_ACCUM: {GRAD_ACCUM_STEPS}")
+        print(f"Tokens/step : {BLOCK_SIZE * BATCH_SIZE * GRAD_ACCUM_STEPS:,}")
+        if is_cuda:
+            free, total = torch.cuda.mem_get_info(cuda_idx)
+            print(f"GPU         : {torch.cuda.get_device_name(cuda_idx)}")
+            print(f"VRAM        : {total/1024**3:.1f} GiB | libre: {free/1024**3:.1f} GiB")
+            if vram_fraction is None:
+                print("Cap VRAM    : désactivé")
+            else:
+                print(f"Cap VRAM    : {TARGET_VRAM_GIB:.1f} GiB ({100*vram_fraction:.1f}% du device)")
+    tokenizer = train_or_load_tokenizer()
+    # The vocabulary size comes from the loaded tokenizer, not from VOCAB_SIZE.
+    cfg = GPTConfig(vocab_size=len(tokenizer))
+    if is_main():
+        CONFIG_FILE.write_text(json.dumps(asdict(cfg), indent=2, ensure_ascii=False), encoding="utf-8")
+    # 1) Base model on the GPU
+    model = GPT(cfg).to(device)
+    # 2) Load the optional base checkpoint BEFORE LoRA
+    maybe_load_base_checkpoint(model, device)
+    # 3) Then inject LoRA
+    if USE_QLORA:
+        model = apply_qlora(model, device)
+        freeze_base_weights(model)
+    # 4) Then compile the model
+    if USE_COMPILE and hasattr(torch, "compile"):
+        if is_main():
+            print(f"Compilation torch.compile({COMPILE_MODE})…")
+        try:
+            model = torch.compile(model, mode=COMPILE_MODE, fullgraph=False)
+            if is_main():
+                print("torch.compile : OK")
+        except Exception as e:
+            if is_main():
+                print(f"[warn] torch.compile échoué ({e}) — fallback eager")
+    # 5) DDP
+    if is_distributed():
+        model = DDP(model, device_ids=[device.index])
+    optimizer = build_optimizer(model)
+    # ── Datasets ──────────────────────────────────────────────────────────────
+    eval_texts  = build_eval_texts()
+    eval_ds     = PackedTextList(eval_texts, tokenizer, cfg.block_size, SEED + 999)
+    eval_loader = make_loader(eval_ds, BATCH_SIZE, EVAL_NUM_WORKERS, is_cuda, drop_last=False)
+    # Epoch-0 texts are loaded here only to size the LR schedule; the loop reloads them.
+    init_texts       = build_epoch_train_texts(0)
+    steps_per_epoch  = estimate_steps_per_epoch(init_texts, tokenizer, cfg.block_size, BATCH_SIZE * GRAD_ACCUM_STEPS)
+    total_steps_est  = max(steps_per_epoch * NUM_EPOCHS, WARMUP_STEPS + 100)
+    # ── Resume ────────────────────────────────────────────────────────────────
+    start_epoch, start_step, best_eval = 0, 0, 1e9
+    if STATE_FILE.exists():
+        try:
+            if is_main():
+                print(f"Reprise depuis {STATE_FILE}")
+            start_epoch, start_step, best_eval = load_resume_checkpoint(model, optimizer, STATE_FILE, device)
+        except Exception as e:
+            # Unreadable checkpoint: set it aside (best effort) and start from scratch.
+            if is_main():
+                try:
+                    STATE_FILE.rename(STATE_FILE.with_suffix(".corrupt.pt"))
+                except Exception:
+                    pass
+                print(f"[warn] Checkpoint illisible ({e}) — reprise ignorée")
+            start_epoch, start_step, best_eval = 0, 0, 1e9
+    if is_main():
+        raw = unwrap_model(model)
+        n_total = count_parameters(raw, False)
+        n_train = count_parameters(raw, True)
+        effective_bs = BATCH_SIZE * GRAD_ACCUM_STEPS * get_world_size()
+        print(f"\nParamètres totaux    : {n_total/1e9:.3f}B")
+        print(f"Paramètres entraînés : {n_train/1e6:.1f}M ({100*n_train/max(1, n_total):.2f}%)")
+        print(f"Batch effectif       : {effective_bs} ({BATCH_SIZE}×{GRAD_ACCUM_STEPS}×{get_world_size()} GPU)")
+        print(f"Tokens/step          : {BLOCK_SIZE * effective_bs:,}")
+        print(f"Steps estimés        : {total_steps_est:,}")
+        print()
+        print("┌── Pilotage VRAM ──────────────────────────────────────────────┐")
+        print("│  Lis 'max_reserved' après quelques logs :                    │")
+        print("│    < 72 GiB → +2 BS   | 72–77.5 GiB → zone cible           │")
+        print("│    vrai OOM → -2 BS   | puis relance                        │")
+        print("└───────────────────────────────────────────────────────────────┘")
+    # ── Main loop ─────────────────────────────────────────────────────────────
+    model.train()
+    optimizer.zero_grad(set_to_none=True)
+    global_step      = start_step
+    t0               = time.time()
+    # Running statistics, reset after every log line.
+    log_loss_sum     = 0.0
+    log_loss_count   = 0
+    tokens_since_log = 0
+    last_log         = time.time()
+    if is_cuda:
+        torch.cuda.reset_peak_memory_stats(cuda_idx)
+    for epoch in range(start_epoch, NUM_EPOCHS):
+        if is_main():
+            print(f"\n{'='*20} Epoch {epoch+1}/{NUM_EPOCHS} {'='*20}")
+        # A new window of documents and a new loader are built for every epoch.
+        train_texts  = build_epoch_train_texts(epoch)
+        train_ds     = PackedTextList(train_texts, tokenizer, cfg.block_size, SEED + epoch)
+        train_loader = make_loader(train_ds, BATCH_SIZE, TRAIN_NUM_WORKERS, is_cuda, drop_last=True)
+        for micro_step, batch in enumerate(train_loader):
+            inp = batch["input_ids"].to(device, non_blocking=True)
+            lbl = batch["labels"].to(device, non_blocking=True)
+            with autocast_context(device):
+                _, loss = model(inp, lbl)
+            # Scale so that accumulated gradients average over the micro-batches.
+            (loss / GRAD_ACCUM_STEPS).backward()
+            log_loss_sum     += loss.item()
+            log_loss_count   += 1
+            tokens_since_log += inp.numel()
+            # Only every GRAD_ACCUM_STEPS-th micro-batch triggers an optimizer step.
+            if (micro_step + 1) % GRAD_ACCUM_STEPS != 0:
+                continue
+            lr = cosine_lr(global_step, total_steps_est)
+            for group in optimizer.param_groups:
+                group["lr"] = lr
+            torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
+            optimizer.step()
+            optimizer.zero_grad(set_to_none=True)
+            global_step += 1
+            if global_step % 50 == 0 and is_main():
+                sync_if_cuda(device)
+                now      = time.time()
+                elapsed  = max(1e-6, now - last_log)
+                # Throughput counts the tokens seen by this rank only.
+                tok_s    = tokens_since_log / elapsed
+                avg_loss = log_loss_sum / max(1, log_loss_count)
+                print(
+                    f"ep {epoch+1}/{NUM_EPOCHS} | step={global_step:5d} | "
+                    f"loss={avg_loss:.4f} | lr={lr:.2e} | {tok_s:,.0f} tok/s"
+                )
+                if is_cuda:
+                    alloc    = torch.cuda.memory_allocated(cuda_idx)    / 1024**3
+                    reserved = torch.cuda.memory_reserved(cuda_idx)     / 1024**3
+                    max_res  = torch.cuda.max_memory_reserved(cuda_idx) / 1024**3
+                    # Thresholds in GiB on the peak reserved memory.
+                    status = (
+                        "▲ OK"
+                        if max_res < 75.0 else
+                        "⚠ proche limite"
+                        if max_res < 77.5 else
+                        "🔴 DANGER OOM"
+                    )
+                    print(
+                        f"  GPU mem | alloc={alloc:.1f} | reserved={reserved:.1f} | "
+                        f"max_reserved={max_res:.1f} GiB  {status}"
+                    )
+                last_log         = now
+                tokens_since_log = 0
+                log_loss_sum     = 0.0
+                log_loss_count   = 0
+            # NOTE(review): evaluation runs on the main rank only, with the model as wrapped above.
+            if global_step % EVAL_EVERY == 0 and is_main():
+                val_loss = evaluate(model, eval_loader, device)
+                print(f"[eval] step {global_step:5d} | val_loss={val_loss:.4f}")
+                if val_loss < best_eval:
+                    best_eval = val_loss
+                    save_checkpoint(model, optimizer, epoch, global_step, best_eval, BEST_MODEL_FILE)
+                    print(f"✓ Meilleur modèle → {BEST_MODEL_FILE}")
+            if global_step % SAVE_EVERY == 0 and is_main():
+                save_checkpoint(model, optimizer, epoch, global_step, best_eval, STATE_FILE)
+                save_checkpoint(model, optimizer, epoch, global_step, best_eval, MODEL_FILE)
+                print(f"✓ Checkpoint → {MODEL_FILE}")
+        if is_main():
+            # Stored epoch is epoch + 1 so that a resume starts with the next epoch.
+            save_checkpoint(model, optimizer, epoch + 1, global_step, best_eval, STATE_FILE)
+            ckpt = OUT_DIR / f"model_epoch_{epoch+1:02d}.pt"
+            save_checkpoint(model, optimizer, epoch + 1, global_step, best_eval, ckpt)
+            print(f"✓ Fin epoch {epoch+1}/{NUM_EPOCHS} → {ckpt}")
+    if is_main():
+        save_checkpoint(model, optimizer, NUM_EPOCHS, global_step, best_eval, MODEL_FILE)
+        save_checkpoint(model, optimizer, NUM_EPOCHS, global_step, best_eval, STATE_FILE)
+        total_min = (time.time() - t0) / 60
+        print(f"\nModèle final    → {MODEL_FILE}")
+        print(f"Meilleur modèle → {BEST_MODEL_FILE}")
+        print(f"Temps total     : {total_min:.1f} min | Steps: {global_step}")
+    if is_distributed():
+        dist.destroy_process_group()
+if __name__ == "__main__":
+    main()

train_nlp_h100_maxvram_v7.py ADDED Viewed

	@@ -0,0 +1,805 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+train_nlp_h100_maxvram_v7.py  — v3 (fix gated OSCAR → public C4)
+===========================================================
+• Datasets publics seulement (plus de gated error)
+• Toujours ~85 GB de données traitées sur 10 epochs
+"""
+from __future__ import annotations
+import itertools
+import json
+import math
+import os
+import random
+import time
+from collections import OrderedDict
+from contextlib import nullcontext
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Iterator, Optional
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+import torch.nn.functional as F
+try:
+    import bitsandbytes as bnb
+    HAS_BNB = True
+except ImportError:
+    HAS_BNB = False
+    print("[warn] bitsandbytes non disponible – quantification 4-bit désactivée")
+try:
+    from flash_attn import flash_attn_func
+    HAS_FLASH = True
+except ImportError:
+    HAS_FLASH = False
+    print("[warn] flash-attn non disponible – fallback F.scaled_dot_product_attention")
+from datasets import load_dataset
+from torch.nn.parallel import DistributedDataParallel as DDP
+from tokenizers import (
+    Tokenizer, decoders, models, normalizers,
+    pre_tokenizers, processors, trainers,
+)
+from transformers import PreTrainedTokenizerFast
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  CHEMINS                                                                    ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+# Output directory for every artefact of this run; created at import time.
+OUT_DIR        = Path("./nlp_1b_h100_opt")
+OUT_DIR.mkdir(parents=True, exist_ok=True)
+# Tokenizer files (tokenizer.json + tokenizer_config.json) live here.
+TOKENIZER_DIR  = OUT_DIR / "tokenizer_32k"
+# JSON dump of the GPTConfig used for the run.
+CONFIG_FILE    = OUT_DIR / "config.json"
+# Latest periodic/final checkpoint.
+MODEL_FILE     = OUT_DIR / "model.pt"
+# Checkpoint with the lowest validation loss seen so far.
+BEST_MODEL_FILE= OUT_DIR / "model_best.pt"
+# Resume checkpoint; main() reloads it on start-up when it exists.
+STATE_FILE     = OUT_DIR / "train_state.pt"
+# Optional checkpoint loaded (non-strictly) into the model before training; None disables it.
+BASE_CHECKPOINT: Optional[Path] = None
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  HYPERPARAMÈTRES                                                            ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+# Base RNG seed; main() offsets it by the process rank, data shuffling offsets it by the epoch.
+SEED           = 42
+# Cap on per-process CUDA memory, in GiB (converted to a fraction of total VRAM in main()).
+TARGET_VRAM_GIB= 78.0
+# --- Model shape (defaults of GPTConfig) ---
+BLOCK_SIZE = 1024
+VOCAB_SIZE = 32_000
+D_MODEL    = 1536
+N_HEADS    = 24
+N_LAYERS   = 24
+D_FF       = 6144
+DROPOUT    = 0.0
+# --- LoRA adapters (see apply_qlora / LoRALinear) ---
+USE_QLORA           = True
+LORA_R              = 64
+LORA_ALPHA          = 128
+LORA_DROPOUT        = 0.05
+# Attribute names of the nn.Linear layers that get wrapped with a LoRA adapter.
+LORA_TARGET_MODULES = ["qkv", "proj", "w1", "w2", "w3"]
+# --- Optimisation schedule ---
+NUM_EPOCHS       = 3
+LEARNING_RATE    = 3e-4
+MIN_LR           = 3e-5
+WEIGHT_DECAY     = 0.1
+WARMUP_STEPS     = 500
+BATCH_SIZE       = 28
+GRAD_ACCUM_STEPS = 1
+MAX_GRAD_NORM    = 1.0
+# Evaluate / checkpoint every N optimizer steps.
+EVAL_EVERY       = 500
+SAVE_EVERY       = 1_000
+# --- Runtime options ---
+# dtype used by torch.autocast on CUDA.
+DTYPE             = torch.bfloat16
+USE_CHECKPOINTING = False
+# torch.compile is only attempted when USE_CHECKPOINTING is False (see main()).
+USE_COMPILE       = True
+COMPILE_MODE      = "reduce-overhead"
+# --- DataLoader ---
+TRAIN_NUM_WORKERS = 4
+EVAL_NUM_WORKERS  = 2
+PREFETCH_FACTOR   = 2
+# --- Text sampling limits ---
+# Raw rows read per data source to train the tokenizer, and per-document character caps.
+TOKENIZER_SAMPLE_DOCS_PER_SOURCE = 15_000
+TOKENIZER_CHAR_LIMIT             = 2_000
+TEXT_CHAR_LIMIT                  = 4_000
+# Special tokens, in the order they are handed to the BPE trainer.
+SPECIAL_TOKENS = ["<pad>", "<bos>", "<eos>", "<unk>"]
+PAD_TOKEN, BOS_TOKEN, EOS_TOKEN, UNK_TOKEN = SPECIAL_TOKENS
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  DATASETS — PUBLIC + MAX 100 GB (fix gated OSCAR)                          ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+# Streaming data sources. For each source, raw rows [0, dev_docs) feed evaluation and rows
+# [dev_docs + epoch * train_docs_per_epoch, ... + train_docs_per_epoch) feed training epoch `epoch`.
+# NOTE(review): the size estimates below were written for 10 epochs, but NUM_EPOCHS is 3 in this file.
+# NOTE(review): stream_texts_from_source compares row.get("language") with `language_filter`;
+# confirm that the "multilingual" config really exposes a `language` column, otherwise every row
+# of sources 2 and 3 is dropped.
+DATA_SOURCES = [
+    # 1. FineWeb (English, very high quality)
+    {
+        "name": "HuggingFaceFW/fineweb",
+        "config": None,
+        "split": "train",
+        "text_column": "text",
+        "dev_docs": 10_000,
+        "train_docs_per_epoch": 1_200_000,   # author's estimate: ~48 GB over 10 epochs
+        "language_filter": None,
+    },
+    # 2. C4 multilingual, filtered to French
+    {
+        "name": "allenai/c4",
+        "config": "multilingual",
+        "split": "train",
+        "text_column": "text",
+        "dev_docs": 5_000,
+        "train_docs_per_epoch": 400_000,     # author's estimate: ~16 GB over 10 epochs
+        "language_filter": "fr",
+    },
+    # 3. C4 multilingual, filtered to Arabic
+    {
+        "name": "allenai/c4",
+        "config": "multilingual",
+        "split": "train",
+        "text_column": "text",
+        "dev_docs": 5_000,
+        "train_docs_per_epoch": 300_000,     # author's estimate: ~12 GB over 10 epochs
+        "language_filter": "ar",
+    },
+]
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  DISTRIBUTED + UTILS                                                        ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+def is_distributed() -> bool:
+    return dist.is_available() and dist.is_initialized()
+def get_rank() -> int:
+    return dist.get_rank() if is_distributed() else 0
+def get_world_size() -> int:
+    return dist.get_world_size() if is_distributed() else 1
+def is_main() -> bool:
+    return get_rank() == 0
+def init_distributed() -> Optional[torch.device]:
+    local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if local_rank == -1:
+        return None
+    dist.init_process_group("nccl")
+    torch.cuda.set_device(local_rank)
+    return torch.device(f"cuda:{local_rank}")
+def set_seed(seed: int) -> None:
+    random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+def get_device(ddp_device: Optional[torch.device] = None) -> torch.device:
+    if ddp_device is not None:
+        return ddp_device
+    if torch.cuda.is_available():
+        return torch.device(f"cuda:{torch.cuda.current_device()}")
+    return torch.device("cpu")
+def current_cuda_index(device: torch.device) -> int:
+    return device.index if device.index is not None else torch.cuda.current_device()
+def autocast_context(device: torch.device):
+    if device.type == "cuda":
+        return torch.autocast("cuda", dtype=DTYPE)
+    return nullcontext()
+def unwrap_model(model: nn.Module) -> nn.Module:
+    m = model.module if isinstance(model, DDP) else model
+    return m._orig_mod if hasattr(m, "_orig_mod") else m
+def count_parameters(model: nn.Module, trainable_only: bool = True) -> int:
+    return sum(p.numel() for p in model.parameters() if not trainable_only or p.requires_grad)
+def normalize_state_dict_keys(sd: dict) -> OrderedDict:
+    out = OrderedDict()
+    for k, v in sd.items():
+        for prefix in ("module._orig_mod.", "_orig_mod.", "module."):
+            if k.startswith(prefix):
+                k = k[len(prefix):]
+                break
+        out[k] = v
+    return out
+def normalize_text(t: str) -> str:
+    return " ".join(t.strip().split())
+def safe_str(x) -> str:
+    return x if isinstance(x, str) else ("" if x is None else str(x))
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  DATA LOADING (streaming + language filter)                                 ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+def load_hf_stream(repo_id: str, config: str | None = None, split: str = "train"):
+    return load_dataset(repo_id, config, split=split, streaming=True)
+def stream_texts_from_source(source: dict, start: int, count: int, char_limit: int) -> Iterator[str]:
+    ds = load_hf_stream(source["name"], source.get("config"), source.get("split", "train"))
+    col = source["text_column"]
+    for row in itertools.islice(ds, start, start + count):
+        text = normalize_text(safe_str(row.get(col, "")))
+        if len(text) < 20:
+            continue
+        # Filtre langue (pour C4 multilingual)
+        if source.get("language_filter"):
+            if row.get("language") != source["language_filter"]:
+                continue
+        yield text[:char_limit]
+def build_epoch_train_texts(epoch: int) -> list[str]:
+    texts: list[str] = []
+    rng = random.Random(SEED + epoch)
+    for src in DATA_SOURCES:
+        start = src["dev_docs"] + epoch * src["train_docs_per_epoch"]
+        texts.extend(stream_texts_from_source(
+            src, start, src["train_docs_per_epoch"], TEXT_CHAR_LIMIT
+        ))
+    rng.shuffle(texts)
+    return texts
+def build_eval_texts() -> list[str]:
+    texts: list[str] = []
+    for src in DATA_SOURCES:
+        texts.extend(stream_texts_from_source(
+            src, 0, src["dev_docs"], TEXT_CHAR_LIMIT
+        ))
+    return texts
+# ╔═��════════════════════════════════════════════════════════════════════════════╗
+# ║  TOKENIZER                                                                  ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+def tokenizer_ready() -> bool:
+    return (TOKENIZER_DIR / "tokenizer.json").exists() and (TOKENIZER_DIR / "tokenizer_config.json").exists()
+def train_tokenizer_once() -> None:
+    """Train a byte-level BPE tokenizer on the sampled corpus and save it into TOKENIZER_DIR.
+    Writes tokenizer.json, then wraps it in a PreTrainedTokenizerFast and saves that too.
+    """
+    TOKENIZER_DIR.mkdir(parents=True, exist_ok=True)
+    tok = Tokenizer(models.BPE(unk_token=UNK_TOKEN))
+    tok.normalizer    = normalizers.Sequence([normalizers.NFKC()])
+    tok.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
+    tok.decoder       = decoders.ByteLevel()
+    # Only the main process shows the progress bar.
+    trainer = trainers.BpeTrainer(
+        vocab_size=VOCAB_SIZE, min_frequency=2, show_progress=is_main(),
+        special_tokens=SPECIAL_TOKENS, initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
+    )
+    tok.train_from_iterator(tokenizer_training_iterator(), trainer=trainer)
+    # The post-processor wraps encoded sequences in BOS/EOS. PackedTextList encodes with
+    # add_special_tokens=False and inserts BOS/EOS itself.
+    bos_id, eos_id = tok.token_to_id(BOS_TOKEN), tok.token_to_id(EOS_TOKEN)
+    tok.post_processor = processors.TemplateProcessing(
+        single=f"{BOS_TOKEN} $A {EOS_TOKEN}",
+        pair=f"{BOS_TOKEN} $A {EOS_TOKEN} $B:1 {EOS_TOKEN}:1",
+        special_tokens=[(BOS_TOKEN, bos_id), (EOS_TOKEN, eos_id)],
+    )
+    tok.save(str(TOKENIZER_DIR / "tokenizer.json"))
+    # Re-load through transformers so that tokenizer_config.json is written alongside tokenizer.json.
+    fast = PreTrainedTokenizerFast(
+        tokenizer_file=str(TOKENIZER_DIR / "tokenizer.json"),
+        bos_token=BOS_TOKEN, eos_token=EOS_TOKEN, unk_token=UNK_TOKEN, pad_token=PAD_TOKEN,
+    )
+    fast.save_pretrained(str(TOKENIZER_DIR))
+def tokenizer_training_iterator() -> Iterator[str]:
+    for src in DATA_SOURCES:
+        yield from stream_texts_from_source(src, 0, TOKENIZER_SAMPLE_DOCS_PER_SOURCE, TOKENIZER_CHAR_LIMIT)
+def train_or_load_tokenizer() -> PreTrainedTokenizerFast:
+    TOKENIZER_DIR.mkdir(parents=True, exist_ok=True)
+    if not tokenizer_ready():
+        if is_distributed():
+            if is_main():
+                print("Entraînement tokenizer 32k…")
+                train_tokenizer_once()
+            dist.barrier()
+        else:
+            print("Entraînement tokenizer 32k…")
+            train_tokenizer_once()
+    return PreTrainedTokenizerFast.from_pretrained(str(TOKENIZER_DIR))
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  MODÈLE + QLORA + OPTIMIZER + CHECKPOINT + EVAL (inchangés)               ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+# The model, LoRA wrapper, optimizer, checkpoint and evaluation code below is carried over
+# unchanged from the previous version; it is kept in full so that this script is self-contained.
+@dataclass
+class GPTConfig:
+    """Shape and regularisation settings of the GPT model; each default is the module-level constant of the same name."""
+    # Size of the token embedding table and of the LM head output.
+    vocab_size: int   = VOCAB_SIZE
+    # Sequence length the data pipeline packs to (PackedTextList is built with cfg.block_size).
+    block_size: int   = BLOCK_SIZE
+    # Width of the residual stream.
+    d_model:    int   = D_MODEL
+    # Attention heads; d_model must be divisible by it (asserted in CausalSelfAttention).
+    n_heads:    int   = N_HEADS
+    # Number of transformer blocks.
+    n_layers:   int   = N_LAYERS
+    # Hidden width of the SwiGLU feed-forward.
+    d_ff:       int   = D_FF
+    # Attention dropout probability (applied only in training mode).
+    dropout:    float = DROPOUT
+    # When True, GPT.forward recomputes each block's activations during backward (training only).
+    use_checkpointing: bool = USE_CHECKPOINTING
+class RMSNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(dim))
+        self.eps    = eps
+    def forward(self, x):
+        return self.weight * x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+class RotaryEmbedding(nn.Module):
+    def __init__(self, dim: int, base: int = 10_000, max_seq: int = 4_096):
+        super().__init__()
+        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        t     = torch.arange(max_seq).float()
+        freqs = torch.outer(t, inv_freq)
+        self.register_buffer("cos_cache", torch.repeat_interleave(freqs.cos(), 2, dim=-1), persistent=False)
+        self.register_buffer("sin_cache", torch.repeat_interleave(freqs.sin(), 2, dim=-1), persistent=False)
+    def forward(self, seq_len: int, dtype: torch.dtype):
+        return self.cos_cache[:seq_len].to(dtype), self.sin_cache[:seq_len].to(dtype)
+def rotate_half(x):
+    x1, x2 = x[..., ::2], x[..., 1::2]
+    return torch.stack((-x2, x1), dim=-1).flatten(-2)
+def apply_rope(x, cos, sin):
+    return x * cos.unsqueeze(0).unsqueeze(0) + rotate_half(x) * sin.unsqueeze(0).unsqueeze(0)
+class CausalSelfAttention(nn.Module):
+    def __init__(self, cfg: GPTConfig):
+        super().__init__()
+        assert cfg.d_model % cfg.n_heads == 0
+        self.n_heads   = cfg.n_heads
+        self.head_dim  = cfg.d_model // cfg.n_heads
+        self.qkv       = nn.Linear(cfg.d_model, 3 * cfg.d_model, bias=False)
+        self.proj      = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
+        self.dropout_p = cfg.dropout
+        self.rope      = RotaryEmbedding(self.head_dim)
+    def forward(self, x):
+        b, t, c = x.shape
+        q, k, v = self.qkv(x).split(c, dim=-1)
+        q = q.view(b, t, self.n_heads, self.head_dim).transpose(1, 2)
+        k = k.view(b, t, self.n_heads, self.head_dim).transpose(1, 2)
+        v = v.view(b, t, self.n_heads, self.head_dim).transpose(1, 2)
+        cos, sin = self.rope(t, x.dtype)
+        q, k = apply_rope(q, cos, sin), apply_rope(k, cos, sin)
+        if HAS_FLASH:
+            q = q.transpose(1, 2)
+            k = k.transpose(1, 2)
+            v = v.transpose(1, 2)
+            y = flash_attn_func(q, k, v, dropout_p=self.dropout_p if self.training else 0.0, causal=True)
+            y = y.reshape(b, t, c)
+        else:
+            y = F.scaled_dot_product_attention(q, k, v, dropout_p=self.dropout_p if self.training else 0.0, is_causal=True)
+            y = y.transpose(1, 2).contiguous().view(b, t, c)
+        return self.proj(y)
+class SwiGLU(nn.Module):
+    def __init__(self, cfg: GPTConfig):
+        super().__init__()
+        self.w1 = nn.Linear(cfg.d_model, cfg.d_ff, bias=False)
+        self.w2 = nn.Linear(cfg.d_model, cfg.d_ff, bias=False)
+        self.w3 = nn.Linear(cfg.d_ff, cfg.d_model, bias=False)
+    def forward(self, x):
+        return self.w3(F.silu(self.w1(x)) * self.w2(x))
+class Block(nn.Module):
+    def __init__(self, cfg: GPTConfig):
+        super().__init__()
+        self.ln1  = RMSNorm(cfg.d_model)
+        self.attn = CausalSelfAttention(cfg)
+        self.ln2  = RMSNorm(cfg.d_model)
+        self.ff   = SwiGLU(cfg)
+    def forward(self, x):
+        x = x + self.attn(self.ln1(x))
+        x = x + self.ff(self.ln2(x))
+        return x
+class GPT(nn.Module):
+    def __init__(self, cfg: GPTConfig):
+        super().__init__()
+        self.cfg     = cfg
+        self.tok_emb = nn.Embedding(cfg.vocab_size, cfg.d_model)
+        self.blocks  = nn.ModuleList([Block(cfg) for _ in range(cfg.n_layers)])
+        self.ln_f    = RMSNorm(cfg.d_model)
+        self.lm_head = nn.Linear(cfg.d_model, cfg.vocab_size, bias=False)
+        self.lm_head.weight = self.tok_emb.weight
+        self.apply(self._init_weights)
+    @staticmethod
+    def _init_weights(m):
+        if isinstance(m, (nn.Linear, nn.Embedding)):
+            nn.init.normal_(m.weight, 0.0, 0.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.zeros_(m.bias)
+    def forward(self, input_ids, labels=None):
+        x = self.tok_emb(input_ids)
+        for block in self.blocks:
+            if self.cfg.use_checkpointing and self.training:
+                x = torch.utils.checkpoint.checkpoint(block, x, use_reentrant=False)
+            else:
+                x = block(x)
+        logits = self.lm_head(self.ln_f(x))
+        loss   = None
+        if labels is not None:
+            loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), labels.reshape(-1), ignore_index=-100)
+        return logits, loss
+class LoRALinear(nn.Module):
+    def __init__(self, base_layer: nn.Linear, r: int = LORA_R, alpha: int = LORA_ALPHA, dropout: float = LORA_DROPOUT):
+        super().__init__()
+        self.base  = base_layer
+        self.r     = r
+        self.scale = alpha / r
+        in_f, out_f = base_layer.in_features, base_layer.out_features
+        try:
+            dev = next(base_layer.parameters()).device
+        except StopIteration:
+            dev = torch.device("cpu")
+        self.lora_A = nn.Linear(in_f, r, bias=False, device=dev)
+        self.lora_B = nn.Linear(r, out_f, bias=False, device=dev)
+        self.drop   = nn.Dropout(dropout)
+        nn.init.kaiming_uniform_(self.lora_A.weight, a=math.sqrt(5))
+        nn.init.zeros_(self.lora_B.weight)
+        for p in self.base.parameters():
+            p.requires_grad = False
+    def forward(self, x):
+        return self.base(x) + self.lora_B(self.lora_A(self.drop(x))) * self.scale
+def apply_qlora(model: GPT, device: torch.device) -> GPT:
+    if not USE_QLORA:
+        return model
+    replaced = 0
+    targets = []
+    for name, module in model.named_modules():
+        parts = name.split(".")
+        if parts[-1] in LORA_TARGET_MODULES and isinstance(module, nn.Linear):
+            targets.append((name, module))
+    for name, module in targets:
+        parts  = name.split(".")
+        parent = model
+        for part in parts[:-1]:
+            parent = getattr(parent, part)
+        lora_layer = LoRALinear(module)
+        setattr(parent, parts[-1], lora_layer)
+        replaced += 1
+    if is_main():
+        print(f"QLoRA : {replaced} couches remplacées (device={device}, NF4={HAS_BNB})")
+    return model
+def freeze_base_weights(model: GPT) -> None:
+    for name, p in model.named_parameters():
+        p.requires_grad = ("lora_A" in name or "lora_B" in name)
+def build_optimizer(model: nn.Module) -> torch.optim.Optimizer:
+    decay, no_decay = [], []
+    for name, p in unwrap_model(model).named_parameters():
+        if not p.requires_grad: continue
+        (decay if p.ndim >= 2 and "weight" in name else no_decay).append(p)
+    groups = [
+        {"params": decay,    "weight_decay": WEIGHT_DECAY},
+        {"params": no_decay, "weight_decay": 0.0},
+    ]
+    if HAS_BNB:
+        return bnb.optim.PagedAdamW8bit(groups, lr=LEARNING_RATE, betas=(0.9, 0.95), eps=1e-8)
+    return torch.optim.AdamW(groups, lr=LEARNING_RATE, betas=(0.9, 0.95), eps=1e-8, fused=torch.cuda.is_available())
+def cosine_lr(step: int, total_steps: int) -> float:
+    if step < WARMUP_STEPS:
+        return LEARNING_RATE * step / max(1, WARMUP_STEPS)
+    p = min(1.0, (step - WARMUP_STEPS) / max(1, total_steps - WARMUP_STEPS))
+    return MIN_LR + 0.5 * (LEARNING_RATE - MIN_LR) * (1 + math.cos(math.pi * p))
+def save_checkpoint(model, optimizer, epoch, step, best_loss, path):
+    raw = unwrap_model(model)
+    torch.save({
+        "model": normalize_state_dict_keys(raw.state_dict()),
+        "optimizer": optimizer.state_dict(),
+        "epoch": epoch, "step": step, "best_loss": best_loss,
+        "config": asdict(raw.cfg),
+    }, path)
+def maybe_load_base_checkpoint(model, device):
+    if BASE_CHECKPOINT is None or not Path(BASE_CHECKPOINT).exists():
+        return
+    ckpt = torch.load(BASE_CHECKPOINT, map_location=device)
+    unwrap_model(model).load_state_dict(normalize_state_dict_keys(ckpt["model"]), strict=False)
+def load_resume_checkpoint(model, optimizer, path, device):
+    ckpt = torch.load(path, map_location=device)
+    unwrap_model(model).load_state_dict(normalize_state_dict_keys(ckpt["model"]), strict=True)
+    try:
+        optimizer.load_state_dict(ckpt["optimizer"])
+    except Exception as e:
+        print(f"[warn] Optimizer state non repris: {e}")
+    return int(ckpt.get("epoch", 0)), int(ckpt.get("step", 0)), float(ckpt.get("best_loss", 1e9))
+@torch.no_grad()
+def evaluate(model, loader, device, max_batches=200) -> float:
+    model.eval()
+    losses = []
+    for i, batch in enumerate(loader):
+        if i >= max_batches: break
+        inp = batch["input_ids"].to(device, non_blocking=True)
+        lbl = batch["labels"].to(device, non_blocking=True)
+        with autocast_context(device):
+            _, loss = model(inp, lbl)
+        losses.append(loss.item())
+    model.train()
+    return sum(losses) / max(1, len(losses))
+def make_loader(dataset, batch_size, num_workers, is_cuda):
+    kwargs = dict(batch_size=batch_size, num_workers=num_workers, pin_memory=is_cuda)
+    if num_workers > 0:
+        kwargs["persistent_workers"] = True
+        kwargs["prefetch_factor"]    = PREFETCH_FACTOR
+    return torch.utils.data.DataLoader(dataset, **kwargs)
+class PackedTextList(torch.utils.data.IterableDataset):
+    def __init__(self, texts, tokenizer, block_size, epoch_seed=0):
+        super().__init__()
+        self.texts      = texts
+        self.tokenizer  = tokenizer
+        self.block_size = block_size
+        self.epoch_seed = epoch_seed
+    def __iter__(self):
+        worker = torch.utils.data.get_worker_info()
+        rank, ws = get_rank(), get_world_size()
+        if worker is None:
+            shard_mod, shard_id = ws, rank
+        else:
+            shard_mod = worker.num_workers * ws
+            shard_id  = rank * worker.num_workers + worker.id
+        rng = random.Random(self.epoch_seed)
+        indices = list(range(len(self.texts)))
+        rng.shuffle(indices)
+        bos, eos = self.tokenizer.bos_token_id, self.tokenizer.eos_token_id
+        buf: list[int] = []
+        for li, ti in enumerate(indices):
+            if li % shard_mod != shard_id:
+                continue
+            ids = self.tokenizer.encode(self.texts[ti], add_special_tokens=False)
+            if not ids: continue
+            buf.extend([bos] + ids + [eos])
+            while len(buf) >= self.block_size + 1:
+                chunk = buf[:self.block_size + 1]
+                buf = buf[self.block_size + 1:]
+                yield {
+                    "input_ids": torch.tensor(chunk[:-1], dtype=torch.long),
+                    "labels":    torch.tensor(chunk[1:],  dtype=torch.long),
+                }
+# ╔══════════════════════════════════════════════════════════════════════════════╗
+# ║  MAIN                                                                       ║
+# ╚══════════════════════════════════════════════════════════════════════════════╝
+def main() -> None:
+    ddp_device = init_distributed()
+    set_seed(SEED + get_rank())
+    device  = get_device(ddp_device)
+    is_cuda = device.type == "cuda"
+    cuda_idx = None
+    if is_cuda:
+        torch.backends.cuda.matmul.allow_tf32 = True
+        torch.backends.cudnn.allow_tf32       = True
+        torch.set_float32_matmul_precision("high")
+        cuda_idx = current_cuda_index(device)
+        _, total = torch.cuda.mem_get_info(cuda_idx)
+        vram_fraction = min(TARGET_VRAM_GIB * (1024**3) / total, 0.999)
+        torch.cuda.memory.set_per_process_memory_fraction(vram_fraction, device=cuda_idx)
+    if is_main():
+        print("=" * 72)
+        print(" GPT ~1B | H100 80 Go | QLoRA + BF16 + TF32 | MAX 100 GB (public)")
+        print("=" * 72)
+        print(f"Device  : {device} | World: {get_world_size()} GPU(s)")
+        print(f"Flash-2 : {HAS_FLASH} | BNB 4-bit: {HAS_BNB} | QLoRA: {USE_QLORA}")
+        print(f"Grad ckpt: {USE_CHECKPOINTING} | Compile: {USE_COMPILE} ({COMPILE_MODE})")
+        if is_cuda:
+            free, total = torch.cuda.mem_get_info(cuda_idx)
+            print(f"GPU     : {torch.cuda.get_device_name(cuda_idx)}")
+            print(f"VRAM    : {total/1024**3:.1f} GiB | libre: {free/1024**3:.1f} GiB")
+    tokenizer = train_or_load_tokenizer()
+    cfg       = GPTConfig(vocab_size=len(tokenizer))
+    if is_main():
+        CONFIG_FILE.write_text(json.dumps(asdict(cfg), indent=2, ensure_ascii=False), encoding="utf-8")
+    model = GPT(cfg).to(device)
+    if USE_QLORA:
+        model = apply_qlora(model, device)
+        freeze_base_weights(model)
+    maybe_load_base_checkpoint(model, device)
+    if USE_COMPILE and not USE_CHECKPOINTING and hasattr(torch, "compile"):
+        try:
+            model = torch.compile(model, mode=COMPILE_MODE)
+            if is_main():
+                print(f"torch.compile activé ({COMPILE_MODE})")
+        except Exception as e:
+            if is_main():
+                print(f"[warn] torch.compile échoué ({e}) — poursuite sans compile")
+    if is_distributed():
+        model = DDP(model, device_ids=[device.index])
+    optimizer = build_optimizer(model)
+    eval_texts  = build_eval_texts()
+    eval_ds     = PackedTextList(eval_texts, tokenizer, cfg.block_size, SEED + 999)
+    eval_loader = make_loader(eval_ds, BATCH_SIZE, EVAL_NUM_WORKERS, is_cuda)
+    init_texts         = build_epoch_train_texts(0)
+    steps_per_epoch    = max(1, len(init_texts) // BATCH_SIZE)
+    total_steps_est    = steps_per_epoch * NUM_EPOCHS
+    start_epoch, start_step, best_eval = 0, 0, 1e9
+    if STATE_FILE.exists():
+        try:
+            if is_main(): print(f"Reprise depuis {STATE_FILE}")
+            start_epoch, start_step, best_eval = load_resume_checkpoint(model, optimizer, STATE_FILE, device)
+        except Exception as e:
+            if is_main():
+                bad = STATE_FILE.with_suffix(".corrupt.pt")
+                print(f"[warn] Checkpoint illisible: {e}")
+                try: STATE_FILE.rename(bad)
+                except: pass
+            start_epoch, start_step, best_eval = 0, 0, 1e9
+    if is_main():
+        raw     = unwrap_model(model)
+        n_total = count_parameters(raw, False)
+        n_train = count_parameters(raw, True)
+        print(f"Paramètres totaux    : {n_total/1e9:.3f}B")
+        print(f"Paramètres entraînés : {n_train/1e6:.1f}M ({100*n_train/max(1,n_total):.2f}%)")
+        print(f"Batch size   : {BATCH_SIZE} | Grad accum: {GRAD_ACCUM_STEPS} | Effective: {BATCH_SIZE*GRAD_ACCUM_STEPS}")
+        print(f"Steps estimés: {total_steps_est} | Eval texts: {len(eval_texts)}")
+        print("\n── Conseil VRAM ────────────────────────────────────────────────")
+        print("  Surveille max_reserved à step 50.")
+        print("  Si OOM → baisse BATCH_SIZE ou active USE_CHECKPOINTING=True")
+        print("────────────────────────────────────────────────────────────────")
+    model.train()
+    optimizer.zero_grad(set_to_none=True)
+    global_step      = start_step
+    t0               = time.time()
+    log_loss_sum     = 0.0
+    log_loss_count   = 0
+    tokens_since_log = 0
+    last_log         = time.time()
+    if is_cuda:
+        torch.cuda.reset_peak_memory_stats(cuda_idx)
+    for epoch in range(start_epoch, NUM_EPOCHS):
+        if is_main():
+            print(f"\n{'='*20} Epoch {epoch+1}/{NUM_EPOCHS} {'='*20}")
+        train_texts  = build_epoch_train_texts(epoch)
+        train_ds     = PackedTextList(train_texts, tokenizer, cfg.block_size, SEED + epoch)
+        train_loader = make_loader(train_ds, BATCH_SIZE, TRAIN_NUM_WORKERS, is_cuda)
+        for micro_step, batch in enumerate(train_loader):
+            inp = batch["input_ids"].to(device, non_blocking=True)
+            lbl = batch["labels"].to(device, non_blocking=True)
+            with autocast_context(device):
+                _, loss = model(inp, lbl)
+            (loss / GRAD_ACCUM_STEPS).backward()
+            log_loss_sum     += loss.item()
+            log_loss_count   += 1
+            tokens_since_log += inp.numel()
+            if (micro_step + 1) % GRAD_ACCUM_STEPS != 0:
+                continue
+            lr = cosine_lr(global_step, total_steps_est)
+            for group in optimizer.param_groups:
+                group["lr"] = lr
+            torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
+            optimizer.step()
+            optimizer.zero_grad(set_to_none=True)
+            global_step += 1
+            if global_step % 50 == 0 and is_main():
+                now      = time.time()
+                elapsed  = max(1e-6, now - last_log)
+                tok_s    = tokens_since_log / elapsed
+                avg_loss = log_loss_sum / max(1, log_loss_count)
+                print(f"ep {epoch+1}/{NUM_EPOCHS} | step={global_step:5d} | loss={avg_loss:.4f} | lr={lr:.2e} | {tok_s:,.0f} tok/s")
+                if is_cuda:
+                    alloc = torch.cuda.memory_allocated(cuda_idx) / 1024**3
+                    reserved = torch.cuda.memory_reserved(cuda_idx) / 1024**3
+                    max_alloc = torch.cuda.max_memory_allocated(cuda_idx) / 1024**3
+                    max_res = torch.cuda.max_memory_reserved(cuda_idx) / 1024**3
+                    print(f"GPU mem | alloc={alloc:.2f} | reserved={reserved:.2f} | max_reserved={max_res:.2f} GiB")
+                last_log = now
+                tokens_since_log = 0
+                log_loss_sum = 0.0
+                log_loss_count = 0
+            if global_step % EVAL_EVERY == 0 and is_main():
+                val_loss = evaluate(model, eval_loader, device)
+                print(f"[eval] step {global_step:5d} | val_loss={val_loss:.4f}")
+                if val_loss < best_eval:
+                    best_eval = val_loss
+                    save_checkpoint(model, optimizer, epoch, global_step, best_eval, BEST_MODEL_FILE)
+                    print(f"✓ Meilleur modèle → {BEST_MODEL_FILE}")
+            if global_step % SAVE_EVERY == 0 and is_main():
+                save_checkpoint(model, optimizer, epoch, global_step, best_eval, STATE_FILE)
+                save_checkpoint(model, optimizer, epoch, global_step, best_eval, MODEL_FILE)
+                print(f"✓ Checkpoint → {MODEL_FILE}")
+        if is_main():
+            save_checkpoint(model, optimizer, epoch + 1, global_step, best_eval, STATE_FILE)
+            ckpt = OUT_DIR / f"model_epoch_{epoch+1:02d}.pt"
+            save_checkpoint(model, optimizer, epoch + 1, global_step, best_eval, ckpt)
+            print(f"✓ Fin epoch {epoch+1}/{NUM_EPOCHS} → {ckpt}")
+    if is_main():
+        save_checkpoint(model, optimizer, NUM_EPOCHS, global_step, best_eval, MODEL_FILE)
+        save_checkpoint(model, optimizer, NUM_EPOCHS, global_step, best_eval, STATE_FILE)
+        total_min = (time.time() - t0) / 60
+        print(f"\nModèle final    → {MODEL_FILE}")
+        print(f"Meilleur modèle → {BEST_MODEL_FILE}")
+        print(f"Temps total     : {total_min:.1f} min | Steps: {global_step}")
+    if is_distributed():
+        dist.destroy_process_group()
+# Script entry point: start training only when this file is executed directly,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()

upload.py ADDED Viewed

	@@ -0,0 +1,189 @@

+import argparse
+import os
+import re
+import time
+from pathlib import Path
+from huggingface_hub import HfApi
+# Path components (directory or file names) that exclude a path wherever they
+# appear in it.
+IGNORE_NAMES = {".git", "__pycache__", ".ipynb_checkpoints", ".DS_Store"}
+# File suffixes, compared in lowercase, that mark temporary, lock, editor swap
+# or partially written files.
+IGNORE_SUFFIXES = {".tmp", ".lock", ".swp", ".swx", ".part"}
+def sanitize_repo_name(name: str) -> str:
+    """Turn an arbitrary string into a valid Hugging Face repo name.
+
+    The Hub only accepts alphanumerics plus ``-``, ``_`` and ``.``; it rejects
+    ``--`` and ``..``, names that start or end with ``-`` or ``.``, names that
+    end with ``.git``, and names longer than 96 characters.
+
+    Args:
+        name: Raw name, typically a folder name or a user-supplied value.
+
+    Returns:
+        The sanitized name, or ``"model"`` when nothing usable remains.
+    """
+    name = name.strip().replace(" ", "-")
+    # Every run of disallowed characters becomes a single dash.
+    name = re.sub(r"[^A-Za-z0-9._-]+", "-", name)
+    name = re.sub(r"-{2,}", "-", name)
+    name = re.sub(r"\.{2,}", ".", name)
+    # Strip again after truncating: the cut at 96 characters can land right
+    # after a separator and leave a forbidden trailing "-" or ".".
+    name = name.strip("-.")[:96].rstrip("-.")
+    # A ".git" suffix is refused by the Hub; drop it (repeatedly if stacked).
+    while name.endswith(".git"):
+        name = name[:-4].rstrip("-.")
+    return name or "model"
+def should_ignore(path: Path) -> bool:
+    """Tell whether ``path`` must be left out of the upload statistics.
+
+    A path is ignored when any of its components is listed in
+    ``IGNORE_NAMES`` or when its lowercased suffix is in ``IGNORE_SUFFIXES``.
+    """
+    has_ignored_component = not IGNORE_NAMES.isdisjoint(path.parts)
+    return has_ignored_component or path.suffix.lower() in IGNORE_SUFFIXES
+def folder_stats(folder: Path):
+    """Count the files under ``folder`` and sum their sizes.
+
+    Files rejected by ``should_ignore`` are skipped, as are files that vanish
+    or become unreadable while the tree is being walked.
+
+    Args:
+        folder: Root directory to scan recursively.
+
+    Returns:
+        A ``(file_count, total_size)`` tuple, with ``total_size`` in bytes.
+    """
+    file_count = 0
+    total_size = 0
+    for p in folder.rglob("*"):
+        # Judge only the part of the path inside `folder`: an ancestor of the
+        # model directory that happens to be called ".git" or "__pycache__"
+        # must not cause every file to be skipped.
+        if should_ignore(p.relative_to(folder)):
+            continue
+        try:
+            if not p.is_file():
+                continue
+            size = p.stat().st_size
+        except OSError:
+            # Best effort: the file disappeared or cannot be stat'ed.
+            continue
+        total_size += size
+        file_count += 1
+    return file_count, total_size
+def format_bytes(num_bytes: int) -> str:
+    """Render a byte count with two decimals and a binary-scaled unit.
+
+    The value is divided by 1024 until it drops below 1024 or the largest
+    unit (TB) is reached.
+    """
+    units = ("B", "KB", "MB", "GB", "TB")
+    last = len(units) - 1
+    amount = float(num_bytes)
+    idx = 0
+    while idx < last and not (amount < 1024):
+        amount /= 1024
+        idx += 1
+    return f"{amount:.2f} {units[idx]}"
+def format_speed(bytes_per_sec: float) -> str:
+    """Render a throughput as a ``format_bytes`` string followed by ``/s``."""
+    return format_bytes(bytes_per_sec) + "/s"
+def _ignore_patterns(names, suffixes):
+    """Build Hub ``ignore_patterns`` that mirror the local ignore rules.
+
+    huggingface_hub filters repo-relative paths with ``fnmatch``, where ``*``
+    also matches ``/``. A pattern such as ``**/__pycache__/**`` therefore
+    needs a ``/`` before the name and never matches a top-level entry, which
+    is why each name is expanded into explicit top-level and nested forms.
+    """
+    patterns = []
+    for name in sorted(names):
+        patterns += [name, f"{name}/*", f"*/{name}", f"*/{name}/*"]
+    patterns += [f"*{suffix}" for suffix in sorted(suffixes)]
+    return patterns
+def main():
+    """Upload a local model folder to the Hugging Face Hub in one shot.
+
+    Parses the command line, authenticates, prints a summary of what will be
+    sent, creates the target repo if needed, uploads the folder (regular or
+    large-folder mode) and reports duration and average throughput.
+
+    Raises:
+        RuntimeError: If the model folder does not exist or authentication
+            against the Hub fails.
+    """
+    parser = argparse.ArgumentParser(
+        description="Upload one-shot d'un dossier modèle vers Hugging Face Hub."
+    )
+    parser.add_argument(
+        "--model_dir",
+        type=str,
+        required=True,
+        help="Chemin du dossier modèle à uploader",
+    )
+    parser.add_argument(
+        "--namespace",
+        type=str,
+        default="Medyassino",
+        help="Namespace ou username Hugging Face",
+    )
+    parser.add_argument(
+        "--repo_name",
+        type=str,
+        default=None,
+        help="Nom du repo distant. Par défaut: nom du dossier modèle",
+    )
+    parser.add_argument(
+        "--private",
+        action="store_true",
+        help="Créer le repo en privé. Sans ce flag, il sera public",
+    )
+    parser.add_argument(
+        "--large",
+        action="store_true",
+        help="Utiliser upload_large_folder pour les gros dossiers",
+    )
+    parser.add_argument(
+        "--token",
+        type=str,
+        default=None,
+        help="Token HF. Sinon utilise HF_TOKEN ou le login local",
+    )
+    parser.add_argument(
+        "--commit_message",
+        type=str,
+        default="Upload model from local folder",
+        help="Message de commit pour upload_folder",
+    )
+    args = parser.parse_args()
+    model_dir = Path(args.model_dir).expanduser().resolve()
+    if not model_dir.is_dir():
+        raise RuntimeError(f"Dossier modèle introuvable: {model_dir}")
+    repo_name = sanitize_repo_name(args.repo_name or model_dir.name)
+    repo_id = f"{args.namespace}/{repo_name}"
+    # Explicit --token wins over the environment; with neither, HfApi falls
+    # back to the locally stored login.
+    token = args.token or os.environ.get("HF_TOKEN")
+    api = HfApi(token=token)
+    try:
+        who = api.whoami()
+    except Exception as e:
+        raise RuntimeError(
+            "Authentification Hugging Face impossible. "
+            "Fais `hf auth login`, ou passe `--token`, ou définis HF_TOKEN."
+        ) from e
+    file_count, total_size = folder_stats(model_dir)
+    print(f"Authentifié comme: {who.get('name') or who.get('fullname') or who}")
+    print(f"Upload de: {model_dir}")
+    print(f"Repo cible: {repo_id}")
+    print(f"Visibilité : {'privé' if args.private else 'public'}")
+    print(f"Fichiers détectés: {file_count}")
+    print(f"Taille totale   : {format_bytes(total_size)}")
+    api.create_repo(
+        repo_id=repo_id,
+        repo_type="model",
+        private=args.private,
+        exist_ok=True,
+    )
+    # Single source of truth: the same names/suffixes drive both the printed
+    # statistics and what is actually excluded from the upload, in both modes.
+    ignore_patterns = _ignore_patterns(IGNORE_NAMES, IGNORE_SUFFIXES)
+    start_time = time.perf_counter()
+    if args.large:
+        api.upload_large_folder(
+            repo_id=repo_id,
+            repo_type="model",
+            folder_path=str(model_dir),
+            ignore_patterns=ignore_patterns,
+        )
+    else:
+        api.upload_folder(
+            repo_id=repo_id,
+            repo_type="model",
+            folder_path=str(model_dir),
+            commit_message=args.commit_message,
+            ignore_patterns=ignore_patterns,
+        )
+    elapsed = time.perf_counter() - start_time
+    avg_speed = (total_size / elapsed) if elapsed > 0 else 0.0
+    print()
+    print(f"Upload OK -> https://huggingface.co/{repo_id}")
+    print(f"Durée totale    : {elapsed:.2f} s")
+    print(f"Vitesse moyenne : {format_speed(avg_speed)}")
+# Script entry point: run the upload only when this file is executed directly,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()

wikipedia_ar_h100/config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "vocab_size": 32000,
+  "block_size": 1024,
+  "d_model": 1024,
+  "n_heads": 16,
+  "n_layers": 24,
+  "d_ff": 4096,
+  "dropout": 0.0,
+  "use_checkpointing": false
+}

wikipedia_ar_h100/tokenizer_32k/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

wikipedia_ar_h100/tokenizer_32k/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "<bos>",
+  "eos_token": "<eos>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "tokenizer_class": "TokenizersBackend",
+  "unk_token": "<unk>"
+}

wikipedia_ar_h100/train_state.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:09c3c854a6aaa593570f779f0a4c7281db9cc027a3897437dfe3a4cb1075b92e
+size 2322908797

wikipedia_ar_h100_agri_30gb/config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "vocab_size": 32000,
+  "block_size": 1024,
+  "d_model": 1024,
+  "n_heads": 16,
+  "n_layers": 24,
+  "d_ff": 4096,
+  "dropout": 0.0,
+  "use_checkpointing": false
+}

wikipedia_ar_h100_codealpaca/config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "vocab_size": 32000,
+  "block_size": 1024,
+  "d_model": 1024,
+  "n_heads": 16,
+  "n_layers": 24,
+  "d_ff": 4096,
+  "dropout": 0.0,
+  "use_checkpointing": false
+}

wikipedia_ar_h100_env_fr_ar_77gb/config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "vocab_size": 32000,
+  "block_size": 1024,
+  "d_model": 1024,
+  "n_heads": 16,
+  "n_layers": 24,
+  "d_ff": 4096,
+  "dropout": 0.0,
+  "use_checkpointing": false
+}

wikipedia_ar_h100_env_fr_ar_77gb/model_epoch_03.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2c04ad704abccb3f01a61fbec3d09442fd9edc3b8d656addc98ba6b25e01ac28
+size 5225864649

wikipedia_ar_h100_multicode/config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "vocab_size": 32000,
+  "block_size": 1024,
+  "d_model": 1024,
+  "n_heads": 16,
+  "n_layers": 24,
+  "d_ff": 4096,
+  "dropout": 0.0,
+  "use_checkpointing": false
+}

wikipedia_ar_h100_multicode/train_state.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8234246d0aa1dbe4ced74d07266bebd381919bd0aada72b8aafa11621f0846bc
+size 5225862591

wikipedia_ar_h100_multicode_10x2000/config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "vocab_size": 32000,
+  "block_size": 1024,
+  "d_model": 1024,
+  "n_heads": 16,
+  "n_layers": 24,
+  "d_ff": 4096,
+  "dropout": 0.0,
+  "use_checkpointing": false
+}

wikipedia_ar_h100_multicode_10x2000/model_round_06.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c7c64f58ee916183c6f04c84c1e9fbd8b92991375ade718fe438385cea36c090
+size 5225864649