Medyassino committed on May 2

Commit

0b5f348

verified ·

1 Parent(s): f86198c

Add files using upload-large-folder tool

Browse files

Files changed (22) hide show

modeleAIRAG/qa_terminal.py +184 -0
modeleAIRAG/test_rag_doc_interne_100m.py +309 -0
modeleAIRAG/train1.py +780 -0
modeleAIRAG/train2.py +921 -0
modeleAIRAG/train3_200m.py +922 -0
rag_boolq_400m/checkpoints/training_info.json +7 -0
rag_boolq_400m/local_finetuned/README.md +5 -0
rag_boolq_400m/local_finetuned/config.json +50 -0
rag_boolq_400m/local_finetuned/tokenizer/tokenizer.json +0 -0
rag_boolq_400m/local_finetuned/tokenizer/tokenizer_config.json +16 -0
rag_boolq_400m/local_finetuned/tokenizer/training_info.json +7 -0
rag_boolq_400m/local_finetuned/training_info.json +7 -0
rag_boolq_400m/local_finetuned/training_summary.json +50 -0
rag_boolq_400m/models/custom_bpe_v6_2.json +0 -0
rag_boolq_400m/models/tokenizer_fast/tokenizer.json +0 -0
rag_boolq_400m/models/tokenizer_fast/tokenizer_config.json +16 -0
rag_boolq_400m/models/tokenizer_fast/training_info.json +7 -0
rag_boolq_400m/models/training_info.json +7 -0
rag_boolq_400m/summary_v6_2.json +28 -0
rag_v6_2_400m_domains/summary_v6_2.json +33 -0
security/cyber_unified.py +1370 -0
security/sec.py +338 -0

modeleAIRAG/qa_terminal.py ADDED Viewed

	@@ -0,0 +1,184 @@

+import argparse
+import json
+from pathlib import Path
+from test_rag_doc_interne_100m import load_model, encode_texts, search
# Built-in demo passages, used when no corpus file is given on the command line.
DEFAULT_CORPUS = [
    "ARTICLE 12 - Les congés payés sont acquis à raison de 2,5 jours par mois travaillé.",
    "Procédure de validation des notes de frais : transmettre via le portail RH avant le 5 du mois.",
    "La politique RGPD impose un délai de 72h pour notifier une violation de données.",
    "Le télétravail est autorisé jusqu'à 3 jours par semaine sur accord du manager.",
    "Toute facture fournisseur doit être validée par le responsable budget avant paiement.",
    "Formation obligatoire sécurité incendie : 1 fois par an, traçabilité dans le SIRH.",
    "L'accord d'entreprise du 15/03/2024 fixe le taux de prime annuelle à 8% du salaire brut.",
]

# JSONL keys probed for the passage text, in priority order.
_JSONL_TEXT_FIELDS = ("positive", "text", "content", "passage")


def _first_text_field(obj):
    """Return the first non-blank string found under the known JSONL keys, else None."""
    if not isinstance(obj, dict):
        return None
    for key in _JSONL_TEXT_FIELDS:
        value = obj.get(key)
        if isinstance(value, str):
            value = value.strip()
            if value:
                return value
    return None


def load_corpus(path):
    """Load the list of passages to index.

    Accepted formats:
    - ``.txt``  : one passage per line (blank lines are ignored)
    - ``.jsonl``: one JSON object per line; the passage is read from the first
      of ``positive``, ``text``, ``content``, ``passage`` that holds a
      non-blank string

    Args:
        path: Corpus file path (str or Path), or ``None`` for the built-in
            demo corpus.

    Returns:
        A non-empty list of stripped passage strings.

    Raises:
        FileNotFoundError: ``path`` does not exist.
        ValueError: unsupported extension, or no passage could be read.
        json.JSONDecodeError: a non-blank ``.jsonl`` line is not valid JSON.
    """
    if path is None:
        # Hand out a copy so callers cannot mutate the module-level constant.
        return list(DEFAULT_CORPUS)
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"Corpus introuvable : {path}")
    suffix = path.suffix.lower()
    if suffix not in (".txt", ".jsonl"):
        raise ValueError("Format corpus non supporté. Utilise .txt ou .jsonl")
    corpus = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if suffix == ".txt":
                corpus.append(line)
                continue
            # Strip *before* testing for emptiness: a whitespace-only field must
            # neither shadow a later valid field nor yield an empty passage.
            text = _first_text_field(json.loads(line))
            if text is not None:
                corpus.append(text)
    if not corpus:
        raise ValueError("Corpus vide.")
    return corpus
def print_results(results, threshold, margin):
    """Print the best passage (if confident enough) followed by the ranked list.

    Args:
        results: Ranked list of dicts with keys ``"score"`` and ``"text"``,
            best first. May be empty.
        threshold: Minimum top-1 score for the best passage to be shown as
            the answer.
        margin: If the top-1/top-2 score gap is below this value, an
            "uncertain" notice is printed before the answer.
    """
    print("\n================ RÉPONSE ================")
    if not results:
        # Nothing was retrieved (e.g. top_k of 0): report it instead of
        # failing on results[0].
        print("Aucun passage suffisamment pertinent trouvé.")
        return
    top1 = results[0]
    # With a single result there is no runner-up; the gap is measured against 0.
    top2_score = results[1]["score"] if len(results) > 1 else 0.0
    diff = top1["score"] - top2_score
    if top1["score"] < threshold:
        print("Aucun passage suffisamment pertinent trouvé.")
        print(f"Score Top 1 : {top1['score']:.4f}")
    else:
        if diff < margin:
            print("Résultat possible, mais incertain : Top 1 et Top 2 sont proches.")
            print(f"Écart Top1 - Top2 : {diff:.4f}")
        print(f"\nMeilleur passage | score={top1['score']:.4f}")
        print(top1["text"])
    print("\n================ TOP RÉSULTATS ================")
    for i, r in enumerate(results, start=1):
        print(f"\nTop {i} | score={r['score']:.4f}")
        print(r["text"])
def main():
    """Run the interactive retrieval prompt.

    Parses the command line, loads the checkpoint and corpus, encodes the
    corpus once, then answers questions typed on stdin until the user enters
    ``exit``/``quit``/``q`` or closes the input (Ctrl-D / Ctrl-C).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--save_dir",
        type=str,
        default="./checkpoints_rag_doc_100m",
        help="Dossier du checkpoint.",
    )
    parser.add_argument(
        "--corpus",
        type=str,
        default=None,
        help="Corpus .txt ou .jsonl. Si absent, utilise le corpus de test.",
    )
    parser.add_argument(
        "--top_k",
        type=int,
        default=5,
        help="Nombre de passages à retourner.",
    )
    parser.add_argument(
        "--threshold",
        type=float,
        default=0.45,
        help="Score minimal pour accepter une réponse.",
    )
    parser.add_argument(
        "--margin",
        type=float,
        default=0.03,
        help="Écart minimal conseillé entre Top 1 et Top 2.",
    )
    args = parser.parse_args()
    if args.top_k < 1:
        # Reject this before the model is loaded: a non-positive k cannot
        # produce a usable ranking.
        parser.error("--top_k doit être >= 1")
    model, tokenizer, cfg, device = load_model(args.save_dir)
    print(f"[INFO] Modèle chargé depuis : {args.save_dir}")
    print(f"[INFO] Device : {device}")
    corpus = load_corpus(args.corpus)
    print(f"[INFO] Corpus chargé : {len(corpus)} passages")
    print("[INFO] Encodage du corpus...")
    # The corpus is embedded once up front; each question only encodes the query.
    corpus_embeddings = encode_texts(
        model=model,
        tokenizer=tokenizer,
        texts=corpus,
        device=device,
        max_seq_len=cfg.max_seq_len,
    )
    print("\n==============================================")
    print(" QA TERMINAL RAG")
    print(" Tape ta question puis Entrée.")
    print(" Commandes : exit, quit, q")
    print("==============================================")
    while True:
        try:
            query = input("\nQuestion > ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C ends the session cleanly, like "exit".
            print("\nFin du QA.")
            break
        if query.lower() in {"exit", "quit", "q"}:
            print("Fin du QA.")
            break
        if not query:
            continue
        results = search(
            query=query,
            corpus=corpus,
            corpus_embeddings=corpus_embeddings,
            model=model,
            tokenizer=tokenizer,
            cfg=cfg,
            device=device,
            top_k=args.top_k,
        )
        print_results(results, args.threshold, args.margin)


if __name__ == "__main__":
    main()

modeleAIRAG/test_rag_doc_interne_100m.py ADDED Viewed

	@@ -0,0 +1,309 @@

+import argparse
+from dataclasses import dataclass
+from pathlib import Path
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import AutoTokenizer
+# =============================================================================
+# CONFIG identique au modèle entraîné
+# =============================================================================
@dataclass
class Config:
    """Default hyper-parameters used to rebuild the encoder for inference.

    Field order is significant (positional construction is possible), so the
    declarations keep their original sequence.
    """

    # --- Transformer geometry ---
    vocab_size: int = 32000
    hidden_size: int = 768
    num_hidden_layers: int = 12
    num_attention_heads: int = 12
    intermediate_size: int = 3072
    max_position_embeddings: int = 512

    # --- Regularisation / numerics ---
    hidden_dropout_prob: float = 0.1
    attention_probs_dropout_prob: float = 0.1
    layer_norm_eps: float = 1e-12

    # --- Output embedding and LayerScale ---
    embedding_dim: int = 768
    use_layer_scale: bool = True
    layer_scale_init: float = 1e-5

    # --- Runtime ---
    use_grad_checkpointing: bool = False
    max_seq_len: int = 384
    save_dir: str = "./checkpoints_rag_doc_100m"
+# =============================================================================
+# ARCHITECTURE
+# =============================================================================
class TransformerEncoderBlock(nn.Module):
    """Pre-LayerNorm encoder layer: self-attention, then a GELU feed-forward.

    Each branch is added back residually. When ``cfg.use_layer_scale`` is set,
    the branch is first multiplied by a learned per-channel vector
    (``gamma1`` for attention, ``gamma2`` for the feed-forward).
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg.hidden_size
        heads = cfg.num_attention_heads
        self.num_heads = heads
        self.head_dim = width // heads

        # Attention sub-layer: one fused projection produces Q, K and V.
        self.ln1 = nn.LayerNorm(width, eps=cfg.layer_norm_eps)
        self.qkv = nn.Linear(width, 3 * width)
        self.proj = nn.Linear(width, width)
        # Stored from the config; forward() below always passes dropout_p=0.0.
        self.attn_drop_p = cfg.attention_probs_dropout_prob

        # Feed-forward sub-layer.
        self.ln2 = nn.LayerNorm(width, eps=cfg.layer_norm_eps)
        feed_forward = [
            nn.Linear(width, cfg.intermediate_size),
            nn.GELU(),
            nn.Linear(cfg.intermediate_size, width),
            nn.Dropout(cfg.hidden_dropout_prob),
        ]
        self.mlp = nn.Sequential(*feed_forward)
        self.resid_drop = nn.Dropout(cfg.hidden_dropout_prob)

        self.use_ls = cfg.use_layer_scale
        if self.use_ls:
            self.gamma1 = nn.Parameter(cfg.layer_scale_init * torch.ones(width))
            self.gamma2 = nn.Parameter(cfg.layer_scale_init * torch.ones(width))

    def forward(self, x, attn_mask):
        """Apply the layer to ``x`` (batch, tokens, channels).

        ``attn_mask`` is indexed as (batch, tokens); positions whose value is
        falsy are excluded as attention keys.
        """
        batch, seq_len, width = x.shape

        fused = self.qkv(self.ln1(x))
        fused = fused.view(batch, seq_len, 3, self.num_heads, self.head_dim)
        # -> three tensors shaped (batch, heads, tokens, head_dim)
        query, key, value = fused.permute(2, 0, 3, 1, 4).unbind(0)

        # Broadcast the key mask over heads and query positions.
        keep = attn_mask[:, None, None, :].bool()
        context = F.scaled_dot_product_attention(
            query, key, value, attn_mask=keep, dropout_p=0.0, is_causal=False
        )
        context = context.transpose(1, 2).contiguous().view(batch, seq_len, width)

        attn_branch = self.resid_drop(self.proj(context))
        if self.use_ls:
            attn_branch = attn_branch * self.gamma1
        x = x + attn_branch

        ff_branch = self.mlp(self.ln2(x))
        if self.use_ls:
            ff_branch = ff_branch * self.gamma2
        return x + ff_branch
class TextEncoder(nn.Module):
    """Sentence encoder: token + position embeddings, a stack of encoder
    blocks, masked mean pooling, a projection head and L2 normalisation.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        width = cfg.hidden_size

        # Input embeddings (learned absolute positions).
        self.tok_emb = nn.Embedding(cfg.vocab_size, width, padding_idx=0)
        self.pos_emb = nn.Embedding(cfg.max_position_embeddings, width)
        self.emb_ln = nn.LayerNorm(width, eps=cfg.layer_norm_eps)
        self.emb_drop = nn.Dropout(cfg.hidden_dropout_prob)

        # Encoder stack and final normalisation.
        self.blocks = nn.ModuleList(
            TransformerEncoderBlock(cfg) for _ in range(cfg.num_hidden_layers)
        )
        self.ln_f = nn.LayerNorm(width, eps=cfg.layer_norm_eps)

        # Projection from pooled hidden state to the output embedding.
        self.proj_head = nn.Sequential(
            nn.Linear(width, width),
            nn.Tanh(),
            nn.Linear(width, cfg.embedding_dim),
        )

    def encode_backbone(self, ids, mask):
        """Return the per-token hidden states after the final LayerNorm."""
        batch, seq_len = ids.shape
        positions = torch.arange(seq_len, device=ids.device)[None, :].expand(batch, seq_len)
        hidden = self.emb_ln(self.tok_emb(ids) + self.pos_emb(positions))
        hidden = self.emb_drop(hidden)
        for block in self.blocks:
            hidden = block(hidden, mask)
        return self.ln_f(hidden)

    def forward(self, ids, mask):
        """Return one unit-norm embedding per input row."""
        hidden = self.encode_backbone(ids, mask)
        weights = mask.unsqueeze(-1).float()
        # Mean over the positions kept by the mask; the clamp guards against
        # a division by zero when a row keeps no position.
        summed = (hidden * weights).sum(dim=1)
        count = weights.sum(dim=1).clamp(min=1e-6)
        projected = self.proj_head(summed / count)
        return F.normalize(projected, p=2, dim=-1)
+# =============================================================================
+# FONCTIONS TEST
+# =============================================================================
+@torch.no_grad()
+def encode_texts(model, tokenizer, texts, device, max_seq_len=384, batch_size=32):
+    model.eval()
+    all_embeddings = []
+    for i in range(0, len(texts), batch_size):
+        batch = texts[i:i + batch_size]
+        enc = tokenizer(
+            batch,
+            padding=True,
+            truncation=True,
+            max_length=max_seq_len,
+            return_tensors="pt",
+        ).to(device)
+        with torch.autocast(
+            device_type="cuda",
+            dtype=torch.bfloat16,
+            enabled=torch.cuda.is_available(),
+        ):
+            emb = model(enc["input_ids"], enc["attention_mask"])
+        all_embeddings.append(emb.float().cpu())
+    return torch.cat(all_embeddings, dim=0)
def load_model(save_dir):
    """Rebuild the encoder from ``save_dir`` and load its best checkpoint.

    Expects ``save_dir`` to contain ``model_best.pt`` plus the tokenizer files.
    The checkpoint is read as a mapping with a ``"model_state"`` entry and an
    optional ``"config"`` entry; config keys that are not ``Config`` fields
    are ignored.

    Args:
        save_dir: Checkpoint directory (str or Path).

    Returns:
        ``(model, tokenizer, cfg, device)`` with the model in eval mode on
        CUDA when available, otherwise on CPU.

    Raises:
        FileNotFoundError: ``model_best.pt`` is absent from ``save_dir``.
    """
    save_dir = Path(save_dir)
    ckpt_path = save_dir / "model_best.pt"
    if not ckpt_path.exists():
        raise FileNotFoundError(f"Checkpoint introuvable : {ckpt_path}")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained(save_dir)
    # NOTE(security): torch.load unpickles the file; only open checkpoints
    # that come from a trusted source.
    ckpt = torch.load(ckpt_path, map_location=device)
    saved_cfg = ckpt.get("config", {})
    # Keep only real dataclass fields so training-only keys do not break Config().
    known_fields = Config.__dataclass_fields__
    cfg = Config(**{k: v for k, v in saved_cfg.items() if k in known_fields})
    cfg.vocab_size = tokenizer.vocab_size
    cfg.use_grad_checkpointing = False
    model = TextEncoder(cfg).to(device)
    # strict=False tolerates extra/missing entries, but a missing entry means
    # that weight keeps its random initialisation, so surface the report
    # instead of discarding it.
    report = model.load_state_dict(ckpt["model_state"], strict=False)
    if report.missing_keys:
        print(
            f"[WARN] {len(report.missing_keys)} poids absents du checkpoint "
            f"(initialisation aléatoire conservée) : {report.missing_keys[:5]}"
        )
    if report.unexpected_keys:
        print(
            f"[WARN] {len(report.unexpected_keys)} poids du checkpoint ignorés : "
            f"{report.unexpected_keys[:5]}"
        )
    model.eval()
    return model, tokenizer, cfg, device
def search(query, corpus, corpus_embeddings, model, tokenizer, cfg, device, top_k=3):
    """Rank ``corpus`` passages against ``query``.

    The query is embedded with ``encode_texts`` and scored against
    ``corpus_embeddings`` by dot product. Returns up to ``top_k`` dicts
    ``{"score": float, "text": str}``, highest score first.
    """
    query_vec = encode_texts(
        model,
        tokenizer,
        [query],
        device,
        max_seq_len=cfg.max_seq_len,
    )
    similarities = (query_vec @ corpus_embeddings.T).squeeze(0)
    # Never request more hits than there are passages.
    k = min(top_k, len(corpus))
    best = torch.topk(similarities, k=k)
    return [
        {"score": float(value), "text": corpus[int(position)]}
        for value, position in zip(best.values, best.indices)
    ]
+# =============================================================================
+# MAIN
+# =============================================================================
def main():
    """Smoke-test the retriever on a fixed internal-docs corpus and print the
    top passages for a handful of sample questions.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--save_dir",
        type=str,
        default="./checkpoints_rag_doc_100m",
        help="Dossier contenant model_best.pt et le tokenizer.",
    )
    parser.add_argument(
        "--top_k",
        type=int,
        default=3,
        help="Nombre de résultats à retourner.",
    )
    args = parser.parse_args()

    model, tokenizer, cfg, device = load_model(args.save_dir)
    print(f"[INFO] Modèle chargé depuis : {args.save_dir}")
    print(f"[INFO] Device : {device}")

    # Fixed demo passages.
    corpus = [
        "ARTICLE 12 - Les congés payés sont acquis à raison de 2,5 jours par mois travaillé.",
        "Procédure de validation des notes de frais : transmettre via le portail RH avant le 5 du mois.",
        "La politique RGPD impose un délai de 72h pour notifier une violation de données.",
        "Le télétravail est autorisé jusqu'à 3 jours par semaine sur accord du manager.",
        "Toute facture fournisseur doit être validée par le responsable budget avant paiement.",
        "Formation obligatoire sécurité incendie : 1 fois par an, traçabilité dans le SIRH.",
        "L'accord d'entreprise du 15/03/2024 fixe le taux de prime annuelle à 8% du salaire brut.",
    ]
    # Sample questions run against the corpus above.
    queries = [
        "Combien de jours de congés je gagne par mois ?",
        "Comment déclarer mes notes de frais ?",
        "Quel est le quota de télétravail ?",
        "Quel est le délai de notification RGPD ?",
        "Quel est le taux de prime annuelle ?",
    ]

    print("[INFO] Encodage du corpus...")
    corpus_embeddings = encode_texts(
        model,
        tokenizer,
        corpus,
        device,
        max_seq_len=cfg.max_seq_len,
    )

    print("\n================ TEST RAG DOC INTERNE ================")
    for question in queries:
        print(f"\nQuestion : {question}")
        hits = search(
            query=question,
            corpus=corpus,
            corpus_embeddings=corpus_embeddings,
            model=model,
            tokenizer=tokenizer,
            cfg=cfg,
            device=device,
            top_k=args.top_k,
        )
        rank = 0
        for hit in hits:
            rank += 1
            print(f"  Top {rank} | score={hit['score']:.4f}")
            print(f"  -> {hit['text']}")


if __name__ == "__main__":
    main()

modeleAIRAG/train1.py ADDED Viewed

	@@ -0,0 +1,780 @@

+"""
+==============================================================================
+  RAG/NLP encoder ~100M params - SPÉCIALISÉ IT / TECH / CYBERSÉCURITÉ
+  Hardware : NVIDIA H100 80GB
+  Epochs   : 20
+==============================================================================
+Architecture :
+  - Encoder Transformer ~100M params (12 couches, hidden=768, 12 têtes)
+  - Tokenizer : camembert-base (32k FR)  + extension domaine via BPE-suffixe
+  - Tête projection -> embeddings 768d L2-normalisés
+  - Loss : Symmetric MNRL + hard negatives (TF-IDF mining)
+  - MLM pré-entraînement (2 epochs) sur corpus IT FR
+  - EMA, LayerScale, BF16, SDPA (Flash Attention 2 sur H100)
+  - Gradient checkpointing ACTIVÉ (modèle 100M, batch large -> VRAM)
+Datasets (IT / cybersécurité / dev / cloud / data) :
+  - mMARCO-FR (passages techniques)
+  - PIAF + FQuAD2 filtrés "tech"
+  - CodeSearchNet (docstrings -> code, FR/EN)
+  - StackExchange dumps (askubuntu, serverfault, security, stackoverflow)
+  - CVE / NVD descriptions (cybersécurité)
+  - OWASP / RFC-like (RFC corpus, MITRE ATT&CK)
+  - HuggingFace : "lhoestq/demo1", "code_search_net"
+  - Custom JSONL local optionnel (./data/custom_it.jsonl)
+Usage :
+  pip install torch>=2.2 transformers>=4.40 datasets>=2.18 accelerate \\
+              sentencepiece tqdm numpy scikit-learn faiss-cpu
+  python train_rag_it_100m.py
+"""
+import os
+import math
+import json
+import random
+import re
+from dataclasses import dataclass, asdict
+from pathlib import Path
+from typing import List, Dict, Tuple, Optional
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as gc
+from torch.utils.data import Dataset, DataLoader
+from torch.optim import AdamW
+from transformers import AutoTokenizer, get_cosine_schedule_with_warmup
+from datasets import load_dataset, Dataset as HFDataset
+from tqdm.auto import tqdm
+# =============================================================================
+# 1. CONFIG — 100M params, IT/Tech
+# =============================================================================
@dataclass
class Config:
    """Hyper-parameters for the ~100M-parameter IT/Tech retrieval encoder.

    Field order is kept as declared, since positional construction and
    ``asdict`` output depend on it.
    """

    # --- Model (~100M) ---
    vocab_size: int = 32000
    hidden_size: int = 768
    num_hidden_layers: int = 12
    num_attention_heads: int = 12
    intermediate_size: int = 3072
    max_position_embeddings: int = 384      # IT documents are longer
    hidden_dropout_prob: float = 0.1
    attention_probs_dropout_prob: float = 0.1
    layer_norm_eps: float = 1e-12
    embedding_dim: int = 768
    use_layer_scale: bool = True
    layer_scale_init: float = 1e-5
    use_grad_checkpointing: bool = True     # author's note: mandatory at 100M
    tokenizer_name: str = "camembert-base"

    # --- MLM pre-training ---
    do_mlm_pretrain: bool = True
    mlm_epochs: int = 2
    mlm_prob: float = 0.15
    mlm_lr: float = 1e-4

    # --- Contrastive training ---
    epochs: int = 20
    batch_size: int = 96                    # 100M + grad checkpointing -> moderate batch
    grad_accum_steps: int = 4               # effective batch = 96 * 4 = 384
    max_seq_len: int = 192                  # IT documents are longer
    lr: float = 2e-5                        # lower for 100M + 20 epochs
    weight_decay: float = 0.01
    warmup_ratio: float = 0.04
    grad_clip: float = 1.0
    temperature: float = 0.02
    num_workers: int = 6
    seed: int = 42

    # --- Hard negatives ---
    use_hard_negatives: bool = True
    n_hard_neg: int = 1
    hard_neg_pool_size: int = 100_000

    # --- EMA ---
    use_ema: bool = True
    ema_decay: float = 0.9995               # author's note: more aggressive for 20 epochs

    # --- Data ---
    max_samples_per_dataset: int = 300_000
    eval_max_size: int = 5_000

    # --- H100 optimisation / logging / checkpoints ---
    use_bf16: bool = True
    use_compile: bool = True
    compile_mode: str = "default"
    log_every: int = 50
    save_dir: str = "./checkpoints_rag_it_100m"
    save_every_epochs: int = 2              # checkpoint every 2 epochs

    # --- IT domain: optional local custom data ---
    custom_jsonl_path: str = "./data/custom_it.jsonl"
# Global run configuration; the checkpoint directory is created up front.
CFG = Config()
Path(CFG.save_dir).mkdir(parents=True, exist_ok=True)

# Seed the Python, NumPy and PyTorch (CPU + all CUDA devices) generators.
random.seed(CFG.seed)
np.random.seed(CFG.seed)
torch.manual_seed(CFG.seed)
torch.cuda.manual_seed_all(CFG.seed)

# Allow TF32 kernels for matmul and cuDNN.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.set_float32_matmul_precision("high")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"[INFO] Device : {device}")
if torch.cuda.is_available():
    print(f"[INFO] GPU    : {torch.cuda.get_device_name(0)}")
    print(f"[INFO] VRAM   : {torch.cuda.get_device_properties(0).total_memory/1e9:.1f} GB")
+# =============================================================================
+# 2. ARCHITECTURE — 100M avec Gradient Checkpointing
+# =============================================================================
class TransformerEncoderBlock(nn.Module):
    """Pre-LayerNorm encoder layer: self-attention, then a GELU feed-forward.

    Each branch is added back residually, optionally scaled per channel by
    the learned LayerScale vectors ``gamma1`` / ``gamma2``. Attention dropout
    is applied only while the module is in training mode.
    """

    def __init__(self, cfg: Config):
        super().__init__()
        width = cfg.hidden_size
        self.num_heads = cfg.num_attention_heads
        self.head_dim = width // cfg.num_attention_heads

        # Attention sub-layer: one fused projection produces Q, K and V.
        self.ln1 = nn.LayerNorm(width, eps=cfg.layer_norm_eps)
        self.qkv = nn.Linear(width, 3 * width, bias=True)
        self.proj = nn.Linear(width, width, bias=True)
        self.attn_drop_p = cfg.attention_probs_dropout_prob

        # Feed-forward sub-layer.
        self.ln2 = nn.LayerNorm(width, eps=cfg.layer_norm_eps)
        feed_forward = [
            nn.Linear(width, cfg.intermediate_size),
            nn.GELU(),
            nn.Linear(cfg.intermediate_size, width),
            nn.Dropout(cfg.hidden_dropout_prob),
        ]
        self.mlp = nn.Sequential(*feed_forward)
        self.resid_drop = nn.Dropout(cfg.hidden_dropout_prob)

        self.use_ls = cfg.use_layer_scale
        if self.use_ls:
            self.gamma1 = nn.Parameter(cfg.layer_scale_init * torch.ones(width))
            self.gamma2 = nn.Parameter(cfg.layer_scale_init * torch.ones(width))

    def forward(self, x, attn_mask):
        """Apply the layer to ``x`` (batch, tokens, channels).

        ``attn_mask`` is indexed as (batch, tokens); positions whose value is
        falsy are excluded as attention keys.
        """
        batch, seq_len, width = x.shape

        fused = self.qkv(self.ln1(x))
        fused = fused.view(batch, seq_len, 3, self.num_heads, self.head_dim)
        # -> three tensors shaped (batch, heads, tokens, head_dim)
        query, key, value = fused.permute(2, 0, 3, 1, 4).unbind(0)

        keep = attn_mask[:, None, None, :].bool()
        drop_p = self.attn_drop_p if self.training else 0.0
        context = F.scaled_dot_product_attention(
            query, key, value, attn_mask=keep, dropout_p=drop_p, is_causal=False
        )
        context = context.transpose(1, 2).contiguous().view(batch, seq_len, width)

        attn_branch = self.resid_drop(self.proj(context))
        if self.use_ls:
            attn_branch = attn_branch * self.gamma1
        x = x + attn_branch

        ff_branch = self.mlp(self.ln2(x))
        if self.use_ls:
            ff_branch = ff_branch * self.gamma2
        return x + ff_branch
class TextEncoder(nn.Module):
    """Transformer text encoder with two heads.

    ``forward`` returns L2-normalised sentence embeddings (masked mean pooling
    followed by a projection head). ``forward_mlm`` returns per-token
    vocabulary logits through an output layer whose weight is shared with the
    token embedding.
    """

    def __init__(self, cfg: Config):
        super().__init__()
        self.cfg = cfg
        width = cfg.hidden_size

        # Input embeddings (learned absolute positions).
        self.tok_emb = nn.Embedding(cfg.vocab_size, width, padding_idx=0)
        self.pos_emb = nn.Embedding(cfg.max_position_embeddings, width)
        self.emb_ln = nn.LayerNorm(width, eps=cfg.layer_norm_eps)
        self.emb_drop = nn.Dropout(cfg.hidden_dropout_prob)

        # Encoder stack and final normalisation.
        self.blocks = nn.ModuleList(
            TransformerEncoderBlock(cfg) for _ in range(cfg.num_hidden_layers)
        )
        self.ln_f = nn.LayerNorm(width, eps=cfg.layer_norm_eps)

        # Contrastive projection head.
        self.proj_head = nn.Sequential(
            nn.Linear(width, width),
            nn.Tanh(),
            nn.Linear(width, cfg.embedding_dim),
        )

        # MLM head shares its weight matrix with the token embedding.
        self.mlm_head = nn.Linear(width, cfg.vocab_size, bias=False)
        self.mlm_head.weight = self.tok_emb.weight

        self.use_gc = cfg.use_grad_checkpointing
        self.apply(self._init_weights)

    @staticmethod
    def _init_weights(m):
        """Initialise one sub-module: N(0, 0.02) weights for Linear/Embedding,
        zero biases, and identity (weight=1, bias=0) for LayerNorm."""
        if isinstance(m, nn.LayerNorm):
            nn.init.ones_(m.weight)
            nn.init.zeros_(m.bias)
            return
        if isinstance(m, nn.Linear):
            nn.init.normal_(m.weight, std=0.02)
            if m.bias is not None:
                nn.init.zeros_(m.bias)
            return
        if isinstance(m, nn.Embedding):
            nn.init.normal_(m.weight, std=0.02)

    def encode_backbone(self, input_ids, attention_mask):
        """Return the per-token hidden states after the final LayerNorm."""
        batch, seq_len = input_ids.shape
        positions = torch.arange(seq_len, device=input_ids.device)[None, :].expand(batch, seq_len)
        hidden = self.emb_ln(self.tok_emb(input_ids) + self.pos_emb(positions))
        hidden = self.emb_drop(hidden)
        # Activation checkpointing is used only in training mode.
        checkpointed = self.use_gc and self.training
        for block in self.blocks:
            if checkpointed:
                hidden = gc.checkpoint(block, hidden, attention_mask, use_reentrant=False)
            else:
                hidden = block(hidden, attention_mask)
        return self.ln_f(hidden)

    def forward(self, input_ids, attention_mask):
        """Return one unit-norm embedding per input row."""
        hidden = self.encode_backbone(input_ids, attention_mask)
        weights = attention_mask.unsqueeze(-1).float()
        # Mean over the positions kept by the mask; the clamp guards against
        # a division by zero when a row keeps no position.
        summed = (hidden * weights).sum(dim=1)
        count = weights.sum(dim=1).clamp(min=1e-6)
        projected = self.proj_head(summed / count)
        return F.normalize(projected, p=2, dim=-1)

    def forward_mlm(self, input_ids, attention_mask):
        """Return vocabulary logits for every token position."""
        hidden = self.encode_backbone(input_ids, attention_mask)
        return self.mlm_head(hidden)
def count_parameters(model: nn.Module) -> int:
    """Count trainable scalar parameters, skipping any whose name contains
    ``"mlm_head"``."""
    total = 0
    for name, param in model.named_parameters():
        if "mlm_head" in name or not param.requires_grad:
            continue
        total += param.numel()
    return total
+# =============================================================================
+# 3. EMA
+# =============================================================================
class EMA:
    """Exponential moving average of a model's trainable parameters.

    ``shadow`` maps parameter names to their running averages. Use
    ``apply_to`` to swap the averages into the model and ``restore`` to put
    the original values back.
    """

    def __init__(self, model: nn.Module, decay: float = 0.999):
        self.decay = decay
        self.shadow = {}
        for name, param in model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = param.detach().clone()

    @torch.no_grad()
    def update(self, model):
        """Fold the current weights in: shadow = decay * shadow + (1 - decay) * param."""
        keep = self.decay
        for name, param in model.named_parameters():
            if not param.requires_grad or name not in self.shadow:
                continue
            self.shadow[name].mul_(keep).add_(param.detach(), alpha=1.0 - keep)

    @torch.no_grad()
    def apply_to(self, model):
        """Overwrite tracked parameters with their averages; return the
        previous values so ``restore`` can undo the swap."""
        saved = {}
        for name, param in model.named_parameters():
            if name not in self.shadow:
                continue
            saved[name] = param.detach().clone()
            param.copy_(self.shadow[name])
        return saved

    @torch.no_grad()
    def restore(self, model, backup):
        """Copy the values saved by ``apply_to`` back into the model."""
        for name, param in model.named_parameters():
            if name in backup:
                param.copy_(backup[name])
+# =============================================================================
+# 4. CHARGEMENT DES DATASETS — DOMAINE IT / TECH
+# =============================================================================
# Case-insensitive whole-word matcher for IT / security vocabulary.
# "vuln", "chiffr" and "crypto" are word stems, so they are allowed to be
# followed by further word characters (vulnérabilité, chiffrement,
# cryptographie); with a bare trailing \b they could only match the stem
# standing alone as a word.
IT_KEYWORDS = re.compile(
    r"\b("
    r"(?:vuln|chiffr|crypto)\w*|"
    r"api|cloud|docker|kubernetes|server|réseau|network|sécurité|security|"
    r"attaque|attack|cve|owasp|sql|nosql|python|java|javascript|linux|"
    r"windows|firewall|http|tcp|ip|dns|vpn|tls|ssl|iam|oauth|"
    r"jwt|microservice|devops|ci/cd|pipeline|kernel|conteneur|container|"
    r"machine learning|deep learning|llm|nlp|rag|gpu|cuda|pytorch|tensorflow|"
    r"hadoop|spark|bdd|database|données|data|backup|sauvegarde"
    r")\b",
    re.IGNORECASE,
)


def is_it_text(t: str) -> bool:
    """Return True when ``t`` contains at least one IT/Tech keyword.

    Empty strings and ``None`` yield False.
    """
    if not t:
        return False
    return IT_KEYWORDS.search(t) is not None
def load_it_pairs(cfg: Config) -> List[Dict[str, str]]:
    """Collect (anchor, positive) text pairs from public datasets and an
    optional local JSONL file.

    Each public source is loaded best-effort: if it fails for any reason, a
    warning is printed and the source is skipped. Pairs are then de-duplicated
    on the first 200 characters of both sides and shuffled.

    Args:
        cfg: Run configuration; ``max_samples_per_dataset`` caps the mMARCO
            pairs and ``custom_jsonl_path`` points at the optional local file.

    Returns:
        A shuffled list of ``{"anchor": ..., "positive": ...}`` dicts.
    """
    print("\n[DATA] Chargement des datasets IT/Tech...")
    pairs: List[Dict[str, str]] = []
    # 4.1 mMARCO FR, IT-filtered
    try:
        ds = load_dataset("unicamp-dl/mmarco", "french", split="train")
        ds = ds.select(range(min(500_000, len(ds))))
        kept = 0
        for ex in tqdm(ds, desc="mMARCO-FR (IT-filter)"):
            q = (ex.get("query") or "").strip()
            p = (ex.get("positive") or ex.get("passage") or "").strip()
            if q and p and (is_it_text(q) or is_it_text(p)):
                pairs.append({"anchor": q, "positive": p})
                kept += 1
                if kept >= cfg.max_samples_per_dataset:
                    break
    except Exception as e:
        print(f"  [warn] mMARCO FR : {e}")
    # 4.2 PIAF, IT-filtered
    try:
        ds = load_dataset("etalab-ia/piaf", split="train")
        for ex in tqdm(ds, desc="PIAF (IT-filter)"):
            q = (ex.get("question") or "").strip()
            ctx = (ex.get("context") or "").strip()
            if q and ctx and (is_it_text(q) or is_it_text(ctx)):
                pairs.append({"anchor": q, "positive": ctx})
    except Exception as e:
        print(f"  [warn] PIAF : {e}")
    # 4.3 CodeSearchNet: docstring -> code (Python, JS, Java, Go)
    # NOTE(security): trust_remote_code=True runs the dataset's loading script.
    for lang in ["python", "javascript", "java", "go"]:
        try:
            ds = load_dataset("code_search_net", lang, split="train",
                              trust_remote_code=True)
            ds = ds.select(range(min(80_000, len(ds))))
            for ex in tqdm(ds, desc=f"CodeSearchNet-{lang}"):
                doc = (ex.get("func_documentation_string") or "").strip()
                code = (ex.get("func_code_string") or "").strip()
                if doc and code and len(doc) > 20 and len(code) > 30:
                    pairs.append({"anchor": doc, "positive": code[:1500]})
        except Exception as e:
            print(f"  [warn] CodeSearchNet-{lang} : {e}")
    # 4.4 StackExchange: technical Q/A
    for sub in ["security", "serverfault", "askubuntu", "stackoverflow"]:
        try:
            ds = load_dataset("flax-sentence-embeddings/stackexchange_xml",
                              sub, split="train", trust_remote_code=True)
            ds = ds.select(range(min(60_000, len(ds))))
            for ex in tqdm(ds, desc=f"SE-{sub}"):
                title = (ex.get("title_body") or ex.get("title") or "").strip()
                ans = (ex.get("upvoted_answer") or ex.get("answer") or "").strip()
                if title and ans and len(ans) > 50:
                    pairs.append({"anchor": title, "positive": ans[:1500]})
        except Exception as e:
            print(f"  [warn] SE-{sub} : {e}")
    # 4.5 CVE / NVD descriptions (cybersecurity)
    try:
        ds = load_dataset("Iker/CVE-Description-and-Severity", split="train")
        for ex in tqdm(ds, desc="CVE-NVD"):
            cve_id = (ex.get("cve") or "").strip()
            desc = (ex.get("description") or "").strip()
            if cve_id and desc and len(desc) > 30:
                # Synthetic question built from the CVE id, paired with its description.
                pairs.append({
                    "anchor": f"Quelle est la vulnérabilité {cve_id} ?",
                    "positive": desc[:1500],
                })
    except Exception as e:
        print(f"  [warn] CVE : {e}")
    # 4.6 XNLI FR (label 0 only), IT-filtered
    try:
        ds = load_dataset("xnli", "fr", split="train")
        ds = ds.filter(lambda x: x["label"] == 0)
        for ex in tqdm(ds, desc="XNLI-FR (IT)"):
            a = (ex.get("premise") or "").strip()
            b = (ex.get("hypothesis") or "").strip()
            if a and b and (is_it_text(a) or is_it_text(b)):
                pairs.append({"anchor": a, "positive": b})
    except Exception as e:
        print(f"  [warn] XNLI : {e}")
    # 4.7 Optional local custom JSONL
    if Path(cfg.custom_jsonl_path).exists():
        print(f"  [+] Lecture custom : {cfg.custom_jsonl_path}")
        skipped = 0
        with open(cfg.custom_jsonl_path, "r", encoding="utf-8") as f:
            for line in tqdm(f, desc="custom_it.jsonl"):
                if not line.strip():
                    continue
                try:
                    ex = json.loads(line)
                    a = (ex.get("anchor") or ex.get("query") or "").strip()
                    p = (ex.get("positive") or ex.get("passage") or "").strip()
                except (json.JSONDecodeError, AttributeError):
                    # Invalid JSON, a non-object row, or a non-string field:
                    # skip the row but keep count so data loss is visible.
                    skipped += 1
                    continue
                if a and p:
                    pairs.append({"anchor": a, "positive": p})
        if skipped:
            print(f"  [warn] custom : {skipped} ligne(s) ignorée(s) (JSON invalide ou mal formé)")
    else:
        print(f"  [info] Pas de fichier custom à {cfg.custom_jsonl_path}")
    # De-duplicate on a 200-character prefix of each side.
    seen = set()
    uniq = []
    for p in pairs:
        k = (p["anchor"][:200], p["positive"][:200])
        if k not in seen:
            seen.add(k)
            uniq.append(p)
    random.shuffle(uniq)
    print(f"[DATA] Total paires IT uniques : {len(uniq):,}")
    return uniq
+# =============================================================================
+# 5. HARD NEGATIVE MINING
+# =============================================================================
+def mine_hard_negatives(pairs: List[Dict[str, str]], cfg: Config) -> List[Dict[str, str]]:
+    print("\n[HN] Mining hard negatives via TF-IDF...")
+    try:
+        from sklearn.feature_extraction.text import TfidfVectorizer
+        from sklearn.metrics.pairwise import linear_kernel
+    except ImportError:
+        print("  [warn] sklearn manquant"); return pairs
+    n = len(pairs)
+    pool_size = min(cfg.hard_neg_pool_size, n)
+    pool_idx = np.random.choice(n, size=pool_size, replace=False)
+    pool_pass = [pairs[i]["positive"] for i in pool_idx]
+    vec = TfidfVectorizer(max_features=80_000, ngram_range=(1, 2),
+                          lowercase=True, strip_accents="unicode")
+    X_pool = vec.fit_transform(pool_pass)
+    enriched = []
+    batch = 2000
+    anchors = [p["anchor"] for p in pairs]
+    for start in tqdm(range(0, n, batch), desc="HN-mine"):
+        end = min(start + batch, n)
+        Xq = vec.transform(anchors[start:end])
+        sims = linear_kernel(Xq, X_pool)
+        for i_loc, i_glob in enumerate(range(start, end)):
+            true_pos = pairs[i_glob]["positive"]
+            order = np.argsort(-sims[i_loc])
+            picked = None
+            for j in order[:30]:
+                if pool_pass[j] != true_pos:
+                    picked = pool_pass[j]; break
+            if picked is None: picked = pool_pass[order[0]]
+            enriched.append({
+                "anchor": pairs[i_glob]["anchor"],
+                "positive": pairs[i_glob]["positive"],
+                "hard_neg": picked,
+            })
+    return enriched
+# =============================================================================
+# 6. DATASET / COLLATE
+# =============================================================================
+class PairDataset(Dataset):
+    def __init__(self, items, with_hn): self.items, self.with_hn = items, with_hn
+    def __len__(self): return len(self.items)
+    def __getitem__(self, i):
+        ex = self.items[i]
+        if self.with_hn:
+            return ex["anchor"], ex["positive"], ex.get("hard_neg", ex["positive"])
+        return ex["anchor"], ex["positive"]
+def make_collate_fn(tokenizer, max_len, with_hn):
+    def collate(batch):
+        a_list = [b[0] for b in batch]
+        p_list = [b[1] for b in batch]
+        a = tokenizer(a_list, padding=True, truncation=True,
+                      max_length=max_len, return_tensors="pt")
+        p = tokenizer(p_list, padding=True, truncation=True,
+                      max_length=max_len, return_tensors="pt")
+        if with_hn:
+            n_list = [b[2] for b in batch]
+            n = tokenizer(n_list, padding=True, truncation=True,
+                          max_length=max_len, return_tensors="pt")
+            return a, p, n
+        return a, p
+    return collate
+# =============================================================================
+# 7. LOSS
+# =============================================================================
+def symmetric_mnrl_loss(emb_a, emb_p, emb_n=None, temperature=0.02):
+    N = emb_a.size(0)
+    labels = torch.arange(N, device=emb_a.device)
+    if emb_n is not None:
+        targets = torch.cat([emb_p, emb_n], dim=0)
+        sim_a = emb_a @ targets.t() / temperature
+        loss_a2p = F.cross_entropy(sim_a, labels)
+    else:
+        sim_a = emb_a @ emb_p.t() / temperature
+        loss_a2p = F.cross_entropy(sim_a, labels)
+    sim_p = emb_p @ emb_a.t() / temperature
+    loss_p2a = F.cross_entropy(sim_p, labels)
+    loss = 0.5 * (loss_a2p + loss_p2a)
+    with torch.no_grad():
+        acc = (sim_a[:, :N].argmax(dim=1) == labels).float().mean().item()
+    return loss, acc
+# =============================================================================
+# 8. MLM PRÉ-ENTRAÎNEMENT
+# =============================================================================
+def mlm_pretrain(model, tokenizer, texts, cfg: Config):
+    print(f"\n[MLM] Pré-entraînement sur {len(texts):,} textes IT...")
+    class MLMDataset(Dataset):
+        def __init__(self, t): self.t = t
+        def __len__(self): return len(self.t)
+        def __getitem__(self, i): return self.t[i]
+    def mlm_collate(batch):
+        enc = tokenizer(batch, padding=True, truncation=True,
+                        max_length=cfg.max_seq_len, return_tensors="pt")
+        ids = enc["input_ids"].clone()
+        labels = ids.clone()
+        special = torch.zeros_like(ids, dtype=torch.bool)
+        for sid in tokenizer.all_special_ids:
+            special |= (ids == sid)
+        prob = torch.full(ids.shape, cfg.mlm_prob)
+        prob.masked_fill_(special, 0.0)
+        masked = torch.bernoulli(prob).bool()
+        labels[~masked] = -100
+        rand = torch.rand(ids.shape)
+        ids[masked & (rand < 0.8)] = tokenizer.mask_token_id
+        replace_rand = masked & (rand >= 0.8) & (rand < 0.9)
+        rand_tokens = torch.randint(0, tokenizer.vocab_size, ids.shape)
+        ids[replace_rand] = rand_tokens[replace_rand]
+        return ids, enc["attention_mask"], labels
+    loader = DataLoader(MLMDataset(texts), batch_size=cfg.batch_size,
+                        shuffle=True, num_workers=cfg.num_workers,
+                        collate_fn=mlm_collate, pin_memory=True,
+                        drop_last=True, persistent_workers=True)
+    optim = AdamW(model.parameters(), lr=cfg.mlm_lr, weight_decay=0.01,
+                  betas=(0.9, 0.98), eps=1e-6)
+    total_steps = len(loader) * cfg.mlm_epochs
+    sched = get_cosine_schedule_with_warmup(optim, int(total_steps * 0.04), total_steps)
+    model.train()
+    autocast_dtype = torch.bfloat16 if cfg.use_bf16 else torch.float16
+    for ep in range(cfg.mlm_epochs):
+        running = 0.0
+        pbar = tqdm(loader, desc=f"MLM ep{ep+1}/{cfg.mlm_epochs}")
+        for step, (ids, mask, labels) in enumerate(pbar, 1):
+            ids = ids.to(device, non_blocking=True)
+            mask = mask.to(device, non_blocking=True)
+            labels = labels.to(device, non_blocking=True)
+            optim.zero_grad(set_to_none=True)
+            with torch.autocast(device_type="cuda", dtype=autocast_dtype):
+                logits = model.forward_mlm(ids, mask)
+                loss = F.cross_entropy(logits.view(-1, logits.size(-1)),
+                                       labels.view(-1), ignore_index=-100)
+            loss.backward()
+            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+            optim.step(); sched.step()
+            running += loss.item()
+            if step % 50 == 0:
+                pbar.set_postfix(loss=f"{running/step:.4f}",
+                                 ppl=f"{math.exp(min(20, running/step)):.1f}")
+    print("[MLM] Terminé.\n")
+# =============================================================================
+# 9. EVAL
+# =============================================================================
+@torch.no_grad()
+def evaluate_retrieval(model, tokenizer, eval_pairs, cfg: Config):
+    model.eval()
+    autocast_dtype = torch.bfloat16 if cfg.use_bf16 else torch.float16
+    queries = [e["anchor"] for e in eval_pairs]
+    passages = [e["positive"] for e in eval_pairs]
+    def encode(texts):
+        embs = []
+        for i in range(0, len(texts), 64):
+            chunk = texts[i:i+64]
+            enc = tokenizer(chunk, padding=True, truncation=True,
+                            max_length=cfg.max_seq_len, return_tensors="pt").to(device)
+            with torch.autocast(device_type="cuda", dtype=autocast_dtype):
+                e = model(enc["input_ids"], enc["attention_mask"])
+            embs.append(e.float())
+        return torch.cat(embs, dim=0)
+    Q = encode(queries); P = encode(passages)
+    sims = Q @ P.t()
+    N = sims.size(0)
+    targets = torch.arange(N, device=sims.device)
+    ranks = sims.argsort(dim=1, descending=True)
+    pos_in_rank = (ranks == targets.unsqueeze(1)).nonzero()[:, 1]
+    return {
+        "R@1": (pos_in_rank == 0).float().mean().item(),
+        "R@5": (pos_in_rank < 5).float().mean().item(),
+        "R@10": (pos_in_rank < 10).float().mean().item(),
+        "MRR": (1.0 / (pos_in_rank.float() + 1)).mean().item(),
+    }
+# =============================================================================
+# 10. TRAIN
+# =============================================================================
+def train():
+    tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_name)
+    CFG.vocab_size = tokenizer.vocab_size
+    print(f"[TOK ] vocab_size = {CFG.vocab_size}")
+    items_all = load_it_pairs(CFG)
+    n_eval = min(CFG.eval_max_size, max(2000, int(len(items_all) * 0.005)))
+    eval_items = items_all[:n_eval]
+    train_items = items_all[n_eval:]
+    print(f"[DATA] train={len(train_items):,}  eval={len(eval_items):,}")
+    if CFG.use_hard_negatives:
+        train_items = mine_hard_negatives(train_items, CFG)
+    collate = make_collate_fn(tokenizer, CFG.max_seq_len, CFG.use_hard_negatives)
+    train_loader = DataLoader(
+        PairDataset(train_items, CFG.use_hard_negatives),
+        batch_size=CFG.batch_size, shuffle=True,
+        num_workers=CFG.num_workers, collate_fn=collate,
+        pin_memory=True, drop_last=True, persistent_workers=True,
+    )
+    model = TextEncoder(CFG).to(device)
+    n_params = count_parameters(model)
+    print(f"[MODEL] Paramètres entraînables : {n_params/1e6:.2f} M")
+    if CFG.do_mlm_pretrain:
+        mlm_texts = []
+        for it in train_items[:400_000]:
+            mlm_texts.append(it["anchor"]); mlm_texts.append(it["positive"])
+        random.shuffle(mlm_texts)
+        mlm_pretrain(model, tokenizer, mlm_texts, CFG)
+    if CFG.use_compile and hasattr(torch, "compile"):
+        print(f"[MODEL] torch.compile(mode={CFG.compile_mode!r})")
+        model = torch.compile(model, mode=CFG.compile_mode)
+    raw_model = model._orig_mod if hasattr(model, "_orig_mod") else model
+    ema = EMA(raw_model, decay=CFG.ema_decay) if CFG.use_ema else None
+    no_decay = ["bias", "LayerNorm.weight", "ln1", "ln2", "ln_f", "emb_ln",
+                "gamma1", "gamma2"]
+    grouped = [
+        {"params": [p for n, p in model.named_parameters()
+                    if "mlm_head" not in n and not any(nd in n for nd in no_decay)],
+         "weight_decay": CFG.weight_decay},
+        {"params": [p for n, p in model.named_parameters()
+                    if "mlm_head" not in n and any(nd in n for nd in no_decay)],
+         "weight_decay": 0.0},
+    ]
+    optimizer = AdamW(grouped, lr=CFG.lr, betas=(0.9, 0.98), eps=1e-6)
+    steps_per_epoch = len(train_loader) // CFG.grad_accum_steps
+    total_steps = steps_per_epoch * CFG.epochs
+    warmup_steps = int(total_steps * CFG.warmup_ratio)
+    scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
+    print(f"[OPTIM] total_steps={total_steps}  warmup={warmup_steps}")
+    autocast_dtype = torch.bfloat16 if CFG.use_bf16 else torch.float16
+    best_mrr = 0.0
+    history = []
+    for epoch in range(1, CFG.epochs + 1):
+        model.train()
+        running_loss = running_acc = 0.0
+        n_seen = 0
+        pbar = tqdm(train_loader, desc=f"Epoch {epoch}/{CFG.epochs}")
+        optimizer.zero_grad(set_to_none=True)
+        for step, batch in enumerate(pbar, start=1):
+            if CFG.use_hard_negatives:
+                a, p, hn = batch
+                hn = {k: v.to(device, non_blocking=True) for k, v in hn.items()}
+            else:
+                a, p = batch; hn = None
+            a = {k: v.to(device, non_blocking=True) for k, v in a.items()}
+            p = {k: v.to(device, non_blocking=True) for k, v in p.items()}
+            with torch.autocast(device_type="cuda", dtype=autocast_dtype):
+                emb_a = model(a["input_ids"], a["attention_mask"])
+                emb_p = model(p["input_ids"], p["attention_mask"])
+                emb_n = (model(hn["input_ids"], hn["attention_mask"])
+                         if hn is not None else None)
+                loss, acc = symmetric_mnrl_loss(emb_a, emb_p, emb_n, CFG.temperature)
+                loss = loss / CFG.grad_accum_steps
+            loss.backward()
+            if step % CFG.grad_accum_steps == 0:
+                torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.grad_clip)
+                optimizer.step(); scheduler.step()
+                optimizer.zero_grad(set_to_none=True)
+                if ema is not None: ema.update(raw_model)
+            running_loss += loss.item() * CFG.grad_accum_steps
+            running_acc += acc; n_seen += 1
+            if step % CFG.log_every == 0:
+                pbar.set_postfix(loss=f"{running_loss/n_seen:.4f}",
+                                 acc=f"{running_acc/n_seen:.3f}",
+                                 lr=f"{scheduler.get_last_lr()[0]:.2e}")
+        # Eval
+        backup = ema.apply_to(raw_model) if ema is not None else None
+        metrics = evaluate_retrieval(model, tokenizer, eval_items, CFG)
+        if backup is not None: ema.restore(raw_model, backup)
+        print(f"\n[EVAL] epoch {epoch} : R@1={metrics['R@1']:.3f}  "
+              f"R@5={metrics['R@5']:.3f}  R@10={metrics['R@10']:.3f}  "
+              f"MRR={metrics['MRR']:.3f}")
+        history.append({"epoch": epoch, **metrics,
+                        "train_loss": running_loss / max(1, n_seen)})
+        # Sauvegarde
+        is_best = metrics["MRR"] > best_mrr
+        if is_best: best_mrr = metrics["MRR"]
+        if ema is not None: backup = ema.apply_to(raw_model)
+        state = {k: v for k, v in raw_model.state_dict().items() if "mlm_head" not in k}
+        if epoch % CFG.save_every_epochs == 0 or is_best or epoch == CFG.epochs:
+            torch.save({"epoch": epoch, "model_state": state,
+                        "config": asdict(CFG), "metrics": metrics},
+                       Path(CFG.save_dir) / f"model_epoch{epoch}.pt")
+        if is_best:
+            torch.save({"epoch": epoch, "model_state": state,
+                        "config": asdict(CFG), "metrics": metrics},
+                       Path(CFG.save_dir) / "model_best.pt")
+        if ema is not None: ema.restore(raw_model, backup)
+        print(f"[SAVE] epoch {epoch}  best={'oui' if is_best else 'non'}")
+    with open(Path(CFG.save_dir) / "history.json", "w", encoding="utf-8") as f:
+        json.dump(history, f, ensure_ascii=False, indent=2)
+    tokenizer.save_pretrained(CFG.save_dir)
+    print(f"\n[OK] Best MRR = {best_mrr:.3f} -> {CFG.save_dir}/model_best.pt")
+# =============================================================================
+# 11. DÉMO
+# =============================================================================
+@torch.no_grad()
+def demo():
+    tokenizer = AutoTokenizer.from_pretrained(CFG.save_dir)
+    ckpt = torch.load(Path(CFG.save_dir) / "model_best.pt", map_location=device)
+    saved_cfg = ckpt["config"]
+    cfg2 = Config(**{k: v for k, v in saved_cfg.items() if hasattr(Config, k)})
+    cfg2.vocab_size = tokenizer.vocab_size
+    model = TextEncoder(cfg2).to(device).eval()
+    model.load_state_dict(ckpt["model_state"], strict=False)
+    corpus = [
+        "OWASP LLM Top 10 liste les vulnérabilités des modèles de langage.",
+        "La prompt injection consiste à manipuler les instructions d'un LLM.",
+        "Le H100 NVIDIA est un GPU IA avec 80 Go HBM3.",
+        "Docker permet de conteneuriser des applications.",
+        "Kubernetes orchestre des conteneurs à grande échelle.",
+        "Le chiffrement AES-256 est utilisé pour protéger les données.",
+        "Une attaque SQL injection exploite des requêtes mal échappées.",
+        "Le RAG combine retriever vectoriel et LLM générateur.",
+    ]
+    queries = [
+        "Quelles sont les vulnérabilités des LLM ?",
+        "Comment orchestrer des conteneurs ?",
+        "Quel GPU pour entraîner une IA ?",
+    ]
+    enc_corpus = tokenizer(corpus, padding=True, truncation=True,
+                           max_length=cfg2.max_seq_len, return_tensors="pt").to(device)
+    with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
+        c_emb = model(enc_corpus["input_ids"], enc_corpus["attention_mask"])
+    print("\n[DEMO IT-100M]")
+    for q in queries:
+        eq = tokenizer([q], padding=True, truncation=True,
+                       max_length=cfg2.max_seq_len, return_tensors="pt").to(device)
+        with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
+            q_emb = model(eq["input_ids"], eq["attention_mask"])
+        sims = (q_emb @ c_emb.t()).squeeze(0)
+        top = sims.topk(3)
+        print(f"\nQ : {q}")
+        for s, i in zip(top.values, top.indices):
+            print(f"  ({s.item():.3f}) -> {corpus[i.item()]}")
+if __name__ == "__main__":
+    # Script entry point: run the full training, then a best-effort demo.
+    train()
+    try:
+        demo()
+    except Exception as e:
+        # The demo is optional: report its failure without failing the run.
+        print(f"[demo] {e}")

modeleAIRAG/train2.py ADDED Viewed

	@@ -0,0 +1,921 @@

+"""
+==============================================================================
+  RAG/NLP encoder ~100M params - SPÉCIALISÉ DOCUMENTAIRE INTERNE ENTREPRISE
+  (RH, juridique, procédures, comptabilité, qualité, conformité, formation)
+  Hardware : NVIDIA H100 80GB
+  Epochs   : 20
+==============================================================================
+Spécificités vs version IT :
+  - max_seq_len = 384 (documents internes longs : procédures, contrats)
+  - Filtres lexicaux orientés "entreprise / documentation"
+  - Datasets : Common Crawl FR (filtré), Wikipédia FR (catégories doc),
+              FQuAD/PIAF (questions admin/juridique), MultiLegalPile-FR,
+              corpus interne JSONL (priorité absolue)
+  - Augmentation : "title -> contenu" et "section -> paragraphe"
+  - Loss : MNRL symétrique + 2 hard negatives par paire
+  - Pré-entraînement MLM sur corpus interne en priorité
+  - EMA decay 0.9995, LayerScale, BF16, SDPA, Gradient Checkpointing
+  - 20 epochs, batch effectif 384
+Architecture identique 100M params (12L, 768d, 12H, FFN=3072).
+Usage :
+  pip install "torch>=2.2" "transformers>=4.40" "datasets>=2.18" accelerate \\
+              sentencepiece tqdm numpy scikit-learn faiss-cpu beautifulsoup4 \\
+              pypdf python-docx
+  python train_rag_doc_interne_100m.py
+Préparation du corpus interne :
+  Place tes documents dans ./data/corpus_interne/ (PDF/DOCX/TXT/MD)
+  Ou directement un JSONL ./data/custom_doc.jsonl avec {"anchor","positive"}
+"""
+import os
+import math
+import json
+import random
+import re
+import glob
+from dataclasses import dataclass, asdict
+from pathlib import Path
+from typing import List, Dict, Tuple, Optional
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as gc
+from torch.utils.data import Dataset, DataLoader
+from torch.optim import AdamW
+from transformers import AutoTokenizer, get_cosine_schedule_with_warmup
+from datasets import load_dataset, Dataset as HFDataset
+from tqdm.auto import tqdm
+# =============================================================================
+# 1. CONFIG — 100M, Documentaire interne
+# =============================================================================
+@dataclass
+class Config:
+    """Hyper-parameters and paths for the internal-documentation encoder run."""
+    # --- Model (~100M parameters) ---
+    vocab_size: int = 32000
+    hidden_size: int = 768
+    num_hidden_layers: int = 12
+    num_attention_heads: int = 12
+    intermediate_size: int = 3072
+    max_position_embeddings: int = 512      # long documents
+    hidden_dropout_prob: float = 0.1
+    attention_probs_dropout_prob: float = 0.1
+    layer_norm_eps: float = 1e-12
+    embedding_dim: int = 768
+    use_layer_scale: bool = True
+    layer_scale_init: float = 1e-5
+    use_grad_checkpointing: bool = True
+    tokenizer_name: str = "camembert-base"
+    # --- MLM (internal corpus takes priority) ---
+    do_mlm_pretrain: bool = True
+    mlm_epochs: int = 3                     # +1 vs. the IT variant; internal docs are scarcer
+    mlm_prob: float = 0.15
+    mlm_lr: float = 1e-4
+    # --- Contrastive training ---
+    epochs: int = 20
+    batch_size: int = 64                    # seq_len 384 -> smaller batch
+    grad_accum_steps: int = 6               # effective batch = 64 * 6 = 384
+    max_seq_len: int = 384
+    lr: float = 2e-5
+    weight_decay: float = 0.01
+    warmup_ratio: float = 0.05
+    grad_clip: float = 1.0
+    temperature: float = 0.02
+    num_workers: int = 6
+    seed: int = 42
+    # --- Hard negatives (2 per pair for internal docs) ---
+    use_hard_negatives: bool = True
+    n_hard_neg: int = 2                     # stronger setting
+    hard_neg_pool_size: int = 100_000
+    use_ema: bool = True
+    ema_decay: float = 0.9995
+    max_samples_per_dataset: int = 250_000
+    eval_max_size: int = 5_000
+    use_bf16: bool = True
+    use_compile: bool = True
+    compile_mode: str = "default"
+    log_every: int = 50
+    save_dir: str = "./checkpoints_rag_doc_100m"
+    save_every_epochs: int = 2
+    # --- Internal corpus ---
+    custom_jsonl_path: str = "./data/custom_doc.jsonl"
+    custom_corpus_dir: str = "./data/corpus_interne"  # PDF/DOCX/TXT/MD
+    internal_oversample: int = 5            # internal pairs are repeated x5 to boost their weight
+# Global configuration instance shared by the whole script.
+CFG = Config()
+# Make sure the checkpoint directory exists before anything is written to it.
+Path(CFG.save_dir).mkdir(parents=True, exist_ok=True)
+# Seed every random number generator used by the script.
+random.seed(CFG.seed); np.random.seed(CFG.seed)
+torch.manual_seed(CFG.seed); torch.cuda.manual_seed_all(CFG.seed)
+# Allow TF32 for matmuls and cuDNN kernels.
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+torch.set_float32_matmul_precision("high")
+# Use the GPU when available, otherwise fall back to the CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"[INFO] Device : {device}")
+if torch.cuda.is_available():
+    print(f"[INFO] GPU    : {torch.cuda.get_device_name(0)}")
+    print(f"[INFO] VRAM   : {torch.cuda.get_device_properties(0).total_memory/1e9:.1f} GB")
+# =============================================================================
+# 2. ARCHITECTURE
+# =============================================================================
+class TransformerEncoderBlock(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        self.num_heads = cfg.num_attention_heads
+        self.head_dim = cfg.hidden_size // cfg.num_attention_heads
+        self.ln1 = nn.LayerNorm(cfg.hidden_size, eps=cfg.layer_norm_eps)
+        self.qkv = nn.Linear(cfg.hidden_size, 3 * cfg.hidden_size, bias=True)
+        self.proj = nn.Linear(cfg.hidden_size, cfg.hidden_size, bias=True)
+        self.attn_drop_p = cfg.attention_probs_dropout_prob
+        self.ln2 = nn.LayerNorm(cfg.hidden_size, eps=cfg.layer_norm_eps)
+        self.mlp = nn.Sequential(
+            nn.Linear(cfg.hidden_size, cfg.intermediate_size),
+            nn.GELU(),
+            nn.Linear(cfg.intermediate_size, cfg.hidden_size),
+            nn.Dropout(cfg.hidden_dropout_prob),
+        )
+        self.resid_drop = nn.Dropout(cfg.hidden_dropout_prob)
+        self.use_ls = cfg.use_layer_scale
+        if cfg.use_layer_scale:
+            self.gamma1 = nn.Parameter(cfg.layer_scale_init * torch.ones(cfg.hidden_size))
+            self.gamma2 = nn.Parameter(cfg.layer_scale_init * torch.ones(cfg.hidden_size))
+    def forward(self, x, attn_mask):
+        B, T, C = x.shape
+        h = self.ln1(x)
+        qkv = self.qkv(h).view(B, T, 3, self.num_heads, self.head_dim)
+        q, k, v = qkv.permute(2, 0, 3, 1, 4)
+        kpm = attn_mask[:, None, None, :].bool()
+        a = F.scaled_dot_product_attention(
+            q, k, v, attn_mask=kpm,
+            dropout_p=self.attn_drop_p if self.training else 0.0,
+            is_causal=False)
+        a = a.transpose(1, 2).contiguous().view(B, T, C)
+        a = self.resid_drop(self.proj(a))
+        if self.use_ls: a = a * self.gamma1
+        x = x + a
+        m = self.mlp(self.ln2(x))
+        if self.use_ls: m = m * self.gamma2
+        return x + m
+class TextEncoder(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        self.cfg = cfg
+        self.tok_emb = nn.Embedding(cfg.vocab_size, cfg.hidden_size, padding_idx=0)
+        self.pos_emb = nn.Embedding(cfg.max_position_embeddings, cfg.hidden_size)
+        self.emb_ln = nn.LayerNorm(cfg.hidden_size, eps=cfg.layer_norm_eps)
+        self.emb_drop = nn.Dropout(cfg.hidden_dropout_prob)
+        self.blocks = nn.ModuleList([TransformerEncoderBlock(cfg)
+                                     for _ in range(cfg.num_hidden_layers)])
+        self.ln_f = nn.LayerNorm(cfg.hidden_size, eps=cfg.layer_norm_eps)
+        self.proj_head = nn.Sequential(
+            nn.Linear(cfg.hidden_size, cfg.hidden_size),
+            nn.Tanh(),
+            nn.Linear(cfg.hidden_size, cfg.embedding_dim),
+        )
+        self.mlm_head = nn.Linear(cfg.hidden_size, cfg.vocab_size, bias=False)
+        self.mlm_head.weight = self.tok_emb.weight
+        self.use_gc = cfg.use_grad_checkpointing
+        self.apply(self._init_weights)
+    @staticmethod
+    def _init_weights(m):
+        if isinstance(m, nn.Linear):
+            nn.init.normal_(m.weight, std=0.02)
+            if m.bias is not None: nn.init.zeros_(m.bias)
+        elif isinstance(m, nn.Embedding):
+            nn.init.normal_(m.weight, std=0.02)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.ones_(m.weight); nn.init.zeros_(m.bias)
+    def encode_backbone(self, ids, mask):
+        B, T = ids.shape
+        pos = torch.arange(T, device=ids.device).unsqueeze(0).expand(B, T)
+        x = self.tok_emb(ids) + self.pos_emb(pos)
+        x = self.emb_drop(self.emb_ln(x))
+        for blk in self.blocks:
+            if self.use_gc and self.training:
+                x = gc.checkpoint(blk, x, mask, use_reentrant=False)
+            else:
+                x = blk(x, mask)
+        return self.ln_f(x)
+    def forward(self, ids, mask):
+        x = self.encode_backbone(ids, mask)
+        m = mask.unsqueeze(-1).float()
+        pooled = (x * m).sum(dim=1) / m.sum(dim=1).clamp(min=1e-6)
+        emb = self.proj_head(pooled)
+        return F.normalize(emb, p=2, dim=-1)
+    def forward_mlm(self, ids, mask):
+        return self.mlm_head(self.encode_backbone(ids, mask))
+def count_parameters(model):
+    return sum(p.numel() for n, p in model.named_parameters()
+               if p.requires_grad and "mlm_head" not in n)
+# =============================================================================
+# 3. EMA
+# =============================================================================
+class EMA:
+    def __init__(self, model, decay=0.999):
+        self.decay = decay
+        self.shadow = {n: p.detach().clone()
+                       for n, p in model.named_parameters() if p.requires_grad}
+    @torch.no_grad()
+    def update(self, model):
+        for n, p in model.named_parameters():
+            if p.requires_grad and n in self.shadow:
+                self.shadow[n].mul_(self.decay).add_(p.detach(), alpha=1.0 - self.decay)
+    @torch.no_grad()
+    def apply_to(self, model):
+        backup = {}
+        for n, p in model.named_parameters():
+            if n in self.shadow:
+                backup[n] = p.detach().clone(); p.copy_(self.shadow[n])
+        return backup
+    @torch.no_grad()
+    def restore(self, model, backup):
+        for n, p in model.named_parameters():
+            if n in backup: p.copy_(backup[n])
+# =============================================================================
+# 4. EXTRACTION CORPUS INTERNE (PDF / DOCX / TXT / MD)
+# =============================================================================
+def extract_text_from_file(path: Path) -> str:
+    """Extracteur multi-format. Retourne texte brut ou ''."""
+    suffix = path.suffix.lower()
+    try:
+        if suffix in {".txt", ".md"}:
+            return path.read_text(encoding="utf-8", errors="ignore")
+        if suffix == ".pdf":
+            try:
+                from pypdf import PdfReader
+            except ImportError:
+                from PyPDF2 import PdfReader
+            reader = PdfReader(str(path))
+            return "\n".join((p.extract_text() or "") for p in reader.pages)
+        if suffix == ".docx":
+            from docx import Document
+            doc = Document(str(path))
+            return "\n".join(p.text for p in doc.paragraphs)
+        if suffix in {".html", ".htm"}:
+            from bs4 import BeautifulSoup
+            soup = BeautifulSoup(path.read_text(encoding="utf-8", errors="ignore"),
+                                 "html.parser")
+            return soup.get_text(separator="\n")
+    except Exception as e:
+        print(f"  [warn] extract {path.name} : {e}")
+    return ""
+def chunk_document(text: str, chunk_size: int = 1500,
+                   overlap: int = 200) -> List[Tuple[str, str]]:
+    """
+    Découpe un document en (titre/section, contenu) pour générer des paires.
+    Utilise les titres Markdown / numérotation pour détecter les sections.
+    """
+    text = re.sub(r"\n{3,}", "\n\n", text).strip()
+    if not text:
+        return []
+    # Détection sections (Markdown ##, numérotation 1., 1.1, ARTICLE, etc.)
+    section_re = re.compile(
+        r"(?m)^(#{1,4}\s+.+|"                          # markdown
+        r"\d+(?:\.\d+)*\.?\s+[A-ZÀ-Ÿa-zà-ÿ].+|"        # numérotation
+        r"ARTICLE\s+\d+[\s\-:].+|"                     # juridique
+        r"CHAPITRE\s+\d+[\s\-:].+|"                    # juridique
+        r"[A-ZÀ-Ÿ][A-ZÀ-Ÿ\s]{8,}$)"                    # ALL CAPS section
+    )
+    sections = []
+    matches = list(section_re.finditer(text))
+    if matches:
+        for i, m in enumerate(matches):
+            title = m.group(0).strip()
+            start = m.end()
+            end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
+            content = text[start:end].strip()
+            if title and content and len(content) > 80:
+                sections.append((title[:200], content))
+    # Si pas de sections détectées, fallback chunks fixes
+    if not sections:
+        for i in range(0, len(text), chunk_size - overlap):
+            chunk = text[i:i + chunk_size].strip()
+            if len(chunk) > 80:
+                # titre = première phrase
+                first_period = chunk.find(".")
+                title = chunk[:first_period if first_period > 20 else 80].strip()
+                sections.append((title, chunk))
+    return sections
def load_internal_corpus(cfg: Config) -> Tuple[List[Dict[str, str]], List[str]]:
    """Turn the files under ``cfg.custom_corpus_dir`` into training material.

    The directory is scanned recursively for PDF, DOCX, TXT, Markdown and HTML
    files. Each file is passed to ``extract_text_from_file``; results that are
    empty or shorter than 200 characters are ignored. Every remaining text is
    kept as-is for MLM and split with ``chunk_document`` into
    ``(title, content)`` sections, each of which yields two pairs flagged
    ``_internal``: ``title -> content`` and a "where can I find ...?"
    question -> content. Positives are capped at 2500 characters.

    Returns:
        ``(pairs, raw_texts)``; both lists are empty when the directory does
        not exist.
    """
    corpus_dir = Path(cfg.custom_corpus_dir)
    if not corpus_dir.exists():
        print(f"  [info] Dossier corpus interne absent : {corpus_dir}")
        return [], []

    patterns = ("*.pdf", "*.docx", "*.txt", "*.md", "*.html", "*.htm")
    files = [fp for pattern in patterns for fp in corpus_dir.rglob(pattern)]
    print(f"  [+] {len(files)} fichiers internes trouvés")

    pairs = []
    raw_texts = []
    for fp in tqdm(files, desc="corpus_interne"):
        text = extract_text_from_file(fp)
        if not text or len(text) < 200:
            continue
        raw_texts.append(text)
        for title, content in chunk_document(text):
            positive = content[:2500]
            # Plain "section title -> section body" pair.
            pairs.append({
                "anchor": title,
                "positive": positive,
                "_internal": True,
            })
            # Bonus pair phrased as a user question about the same section.
            pairs.append({
                "anchor": f"Où trouver des informations sur : {title} ?",
                "positive": positive,
                "_internal": True,
            })
    return pairs, raw_texts
+# =============================================================================
+# 5. CHARGEMENT DATASETS PUBLICS (DOC GÉNÉRIQUE FR)
+# =============================================================================
# Whole-word, case-insensitive vocabulary (HR, legal, accounting, quality,
# compliance, training, ...) used to keep only "enterprise documentation"-like
# texts from generic public datasets.
DOC_KEYWORDS = re.compile(
    r"\b(article|chapitre|procédure|politique|règlement|directive|note de service|"
    r"manuel|guide|formation|RH|ressources humaines|congé|absence|salaire|paie|"
    r"contrat|CDI|CDD|convention|accord|qualité|conformité|audit|ISO|RGPD|"
    r"comité|conseil|assemblée|direction|département|service|budget|"
    r"facture|comptabilité|comptable|TVA|achat|vente|client|fournisseur|"
    r"juridique|légal|loi|décret|arrêté|jurisprudence|tribunal|"
    r"sécurité|incident|risque|santé|hygiène|formation)\b",
    re.IGNORECASE,
)


def is_doc_text(t: str) -> bool:
    """Return True when *t* contains at least one ``DOC_KEYWORDS`` term.

    Empty strings and ``None`` are treated as non-documentary.
    """
    if not t:
        return False
    return DOC_KEYWORDS.search(t) is not None
def _load_question_context_pairs(dataset_name: str, label: str) -> List[Dict[str, str]]:
    """Load ``question -> context`` pairs from the "train" split of a dataset.

    Best-effort: any failure is reported as a warning and whatever was
    collected before the failure is still returned.
    """
    found: List[Dict[str, str]] = []
    try:
        ds = load_dataset(dataset_name, split="train")
        for ex in tqdm(ds, desc=label):
            q = (ex.get("question") or "").strip()
            ctx = (ex.get("context") or "").strip()
            if q and ctx:
                found.append({"anchor": q, "positive": ctx})
    except Exception as e:
        print(f"  [warn] {label} : {e}")
    return found


def _load_custom_jsonl(path: str) -> List[Dict[str, str]]:
    """Read ``anchor``/``query`` + ``positive``/``passage`` pairs from a JSONL file.

    Blank lines are ignored; lines that are not valid JSON objects with string
    fields are skipped and counted so the user is told about them.
    """
    found: List[Dict[str, str]] = []
    skipped = 0
    with open(path, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc="custom_doc.jsonl"):
            if not line.strip():
                continue
            try:
                ex = json.loads(line)
                a = (ex.get("anchor") or ex.get("query") or "").strip()
                p = (ex.get("positive") or ex.get("passage") or "").strip()
            except (ValueError, AttributeError):
                # Invalid JSON, a non-object value, or a non-string field.
                skipped += 1
                continue
            if a and p:
                found.append({"anchor": a, "positive": p, "_internal": True})
    if skipped:
        print(f"  [warn] custom_doc.jsonl : {skipped} ligne(s) invalide(s) ignorée(s)")
    return found


def _dedup_pairs(pairs: List[Dict[str, str]]) -> List[Dict[str, str]]:
    """Keep the first pair for each (anchor[:200], positive[:200]) key, in order."""
    seen = set()
    unique = []
    for p in pairs:
        key = (p["anchor"][:200], p["positive"][:200])
        if key not in seen:
            seen.add(key)
            unique.append(p)
    return unique


def load_doc_pairs(cfg: Config) -> List[Dict[str, str]]:
    """Gather, de-duplicate and shuffle all (anchor, positive) training pairs.

    Sources, in order: the internal corpus directory, PIAF, FQuAD2, the French
    mMARCO subset filtered with ``is_doc_text``, French Wikipedia, the French
    case-law part of MultiLegalPile, XNLI-fr entailment pairs and the optional
    custom JSONL file. Every public source is best-effort: a failure prints a
    warning and the remaining sources are still loaded.

    Returns:
        A shuffled list of unique dicts with ``anchor`` and ``positive`` keys;
        pairs coming from the internal corpus or the custom JSONL also carry
        ``"_internal": True``.
    """
    print("\n[DATA] Chargement des datasets DOC INTERNE...")
    pairs: List[Dict[str, str]] = []

    # 5.1 Internal corpus.
    # NOTE: repeating these pairs here (cfg.internal_oversample) would be
    # undone by the de-duplication pass at the end of this function, so it is
    # not done; oversampling has to be applied to the de-duplicated list.
    internal_pairs, _ = load_internal_corpus(cfg)
    print(f"  [+] Corpus interne : {len(internal_pairs):,} paires brutes")
    pairs.extend(internal_pairs)

    # 5.2 PIAF + FQuAD (generic French question / context pairs).
    pairs.extend(_load_question_context_pairs("etalab-ia/piaf", "PIAF"))
    pairs.extend(_load_question_context_pairs("manu/fquad2_test", "FQuAD2"))

    # 5.3 mMARCO FR, keeping only "documentary" queries or passages.
    try:
        ds = load_dataset("unicamp-dl/mmarco", "french", split="train")
        ds = ds.select(range(min(500_000, len(ds))))
        kept = 0
        for ex in tqdm(ds, desc="mMARCO-FR (DOC-filter)"):
            q = (ex.get("query") or "").strip()
            p = (ex.get("positive") or ex.get("passage") or "").strip()
            if q and p and (is_doc_text(q) or is_doc_text(p)):
                pairs.append({"anchor": q, "positive": p})
                kept += 1
                if kept >= cfg.max_samples_per_dataset:
                    break
    except Exception as e:
        print(f"  [warn] mMARCO : {e}")

    # 5.4 Wikipedia FR: title -> lead, plus a generic question -> paragraph.
    try:
        ds = load_dataset("wikipedia", "20220301.fr", split="train",
                          trust_remote_code=True)
        ds = ds.select(range(min(100_000, len(ds))))
        for ex in tqdm(ds, desc="Wikipedia-FR"):
            title = (ex.get("title") or "").strip()
            text = (ex.get("text") or "").strip()
            if not title or not text or len(text) < 300:
                continue
            # First 2000 characters act as the positive for the title.
            pairs.append({"anchor": title, "positive": text[:2000]})
            # Up to five following paragraphs, when long enough.
            for para in text.split("\n\n")[1:6]:
                if len(para) > 200:
                    pairs.append({
                        "anchor": f"Que dit l'article '{title}' à propos de cela ?",
                        "positive": para[:2000],
                    })
    except Exception as e:
        print(f"  [warn] Wikipedia FR : {e}")

    # 5.5 MultiLegalPile FR (case law), streamed.
    try:
        ds = load_dataset("joelniklaus/Multi_Legal_Pile", "fr_caselaw",
                          split="train", streaming=True)
        count = 0
        for ex in tqdm(ds, desc="MultiLegalPile-FR", total=50_000):
            text = (ex.get("text") or "").strip()
            if len(text) < 500:
                continue
            # First sentence = anchor, the following 2000 characters = positive.
            first_period = text.find(".")
            if 30 < first_period < 250:
                anchor = text[:first_period + 1]
                positive = text[first_period + 1:first_period + 2001]
                if len(positive) > 100:
                    pairs.append({"anchor": anchor, "positive": positive})
            # Counts every long-enough document, whether or not it gave a pair.
            count += 1
            if count >= 50_000:
                break
    except Exception as e:
        print(f"  [warn] MultiLegalPile : {e}")

    # 5.6 XNLI FR: premise -> hypothesis for rows whose label is 0.
    try:
        ds = load_dataset("xnli", "fr", split="train")
        ds = ds.filter(lambda x: x["label"] == 0)
        ds = ds.select(range(min(80_000, len(ds))))
        for ex in tqdm(ds, desc="XNLI-FR"):
            a = (ex.get("premise") or "").strip()
            b = (ex.get("hypothesis") or "").strip()
            if a and b:
                pairs.append({"anchor": a, "positive": b})
    except Exception as e:
        print(f"  [warn] XNLI : {e}")

    # 5.7 Custom JSONL (treated as internal data).
    if Path(cfg.custom_jsonl_path).exists():
        pairs.extend(_load_custom_jsonl(cfg.custom_jsonl_path))

    uniq = _dedup_pairs(pairs)
    random.shuffle(uniq)
    n_internal = sum(1 for p in uniq if p.get("_internal"))
    print(f"[DATA] Total paires uniques : {len(uniq):,}  (dont interne : {n_internal:,})")
    return uniq
+# =============================================================================
+# 6. HARD NEGATIVE MINING (2 negs par paire)
+# =============================================================================
def mine_hard_negatives_multi(pairs, cfg: Config):
    """Attach ``cfg.n_hard_neg`` TF-IDF hard negatives to every pair.

    A pool of at most ``cfg.hard_neg_pool_size`` positives is sampled without
    replacement and indexed with a unigram+bigram TF-IDF model. For each
    anchor, the 50 most similar pool passages are scanned and the first ones
    that differ from the pair's own positive are kept; if that is not enough,
    the list is completed with random pool passages.

    Args:
        pairs: list of dicts with at least ``anchor`` and ``positive``.
        cfg: object exposing ``n_hard_neg`` and ``hard_neg_pool_size``.

    Returns:
        A new list, aligned with ``pairs``, where each dict is a copy of the
        input pair (extra keys such as ``_internal`` are preserved) plus a
        ``hard_negs`` list. ``pairs`` itself is returned unchanged when it is
        empty or when scikit-learn is not installed.
    """
    print(f"\n[HN] Mining {cfg.n_hard_neg} hard negatives par paire...")
    if not pairs:
        # Nothing to mine; TF-IDF fitting would fail on an empty pool.
        return pairs
    try:
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.metrics.pairwise import linear_kernel
    except ImportError:
        print("  [warn] sklearn manquant")
        return pairs

    n = len(pairs)
    pool_size = min(cfg.hard_neg_pool_size, n)
    pool_idx = np.random.choice(n, size=pool_size, replace=False)
    pool_pass = [pairs[i]["positive"] for i in pool_idx]
    vec = TfidfVectorizer(max_features=80_000, ngram_range=(1, 2),
                          lowercase=True, strip_accents="unicode")
    X_pool = vec.fit_transform(pool_pass)

    enriched = []
    batch = 2000
    anchors = [p["anchor"] for p in pairs]
    for start in tqdm(range(0, n, batch), desc="HN-mine"):
        end = min(start + batch, n)
        sims = linear_kernel(vec.transform(anchors[start:end]), X_pool)
        for row, pair in zip(sims, pairs[start:end]):
            true_pos = pair["positive"]
            picked = []
            for j in np.argsort(-row)[:50]:
                cand = pool_pass[j]
                if cand != true_pos and cand not in picked:
                    picked.append(cand)
                    if len(picked) >= cfg.n_hard_neg:
                        break
            # Random completion. Prefer passages other than the pair's own
            # positive, but give up after a few draws so a pool that contains
            # nothing else cannot loop forever.
            attempts = 0
            while len(picked) < cfg.n_hard_neg:
                cand = pool_pass[random.randint(0, pool_size - 1)]
                attempts += 1
                if cand != true_pos or attempts > 20:
                    picked.append(cand)
            # Copy the pair so flags like "_internal" survive the mining step.
            enriched.append({**pair, "hard_negs": picked})
    return enriched
+# =============================================================================
+# 7. DATASET / COLLATE (multi-hn)
+# =============================================================================
class PairDataset(Dataset):
    """Map-style dataset over (anchor, positive[, hard negatives]) pairs.

    Args:
        items: list of dicts with ``anchor`` and ``positive`` keys and,
            optionally, a ``hard_negs`` list.
        n_hn: number of hard negatives returned per item; ``0`` disables them.
    """

    def __init__(self, items, n_hn):
        self.items, self.n_hn = items, n_hn

    def __len__(self):
        return len(self.items)

    def _random_negatives(self, positive, k):
        """Draw *k* positives of other items, avoiding *positive* when possible."""
        drawn = []
        n_items = len(self.items)
        for _ in range(k):
            cand = positive
            # Bounded retries: a dataset holding a single distinct positive
            # simply falls back to that positive.
            for _ in range(10):
                cand = self.items[random.randrange(n_items)]["positive"]
                if cand != positive:
                    break
            drawn.append(cand)
        return drawn

    def __getitem__(self, i):
        ex = self.items[i]
        if self.n_hn <= 0:
            return ex["anchor"], ex["positive"]
        negs = list(ex.get("hard_negs") or [])[:self.n_hn]
        if len(negs) < self.n_hn:
            # Items without mined negatives used to get their own positive as
            # "negative", which contradicts the contrastive target; use random
            # other positives instead.
            negs.extend(self._random_negatives(ex["positive"], self.n_hn - len(negs)))
        return ex["anchor"], ex["positive"], negs
def make_collate_fn(tokenizer, max_len, n_hn):
    """Build the DataLoader collate function for ``PairDataset`` samples.

    The returned callable tokenizes anchors, then positives, then (when
    ``n_hn > 0``) all hard negatives of the batch flattened sample after
    sample. It returns ``(anchors, positives)`` or
    ``(anchors, positives, negatives)`` accordingly.
    """
    def _tokenize(texts):
        return tokenizer(texts, padding=True, truncation=True,
                         max_length=max_len, return_tensors="pt")

    def collate(batch):
        anchors = _tokenize([sample[0] for sample in batch])
        positives = _tokenize([sample[1] for sample in batch])
        if n_hn <= 0:
            return anchors, positives
        # Layout: [s0_n1, s0_n2, ..., s1_n1, s1_n2, ...]
        flat_negs = [neg for sample in batch for neg in sample[2]]
        return anchors, positives, _tokenize(flat_negs)

    return collate
+# =============================================================================
+# 8. LOSS — Symmetric MNRL avec multi-hard-negatives
+# =============================================================================
def symmetric_mnrl_multi_hn(emb_a, emb_p, emb_neg=None, n_hn=0, temperature=0.02):
    """Symmetric in-batch contrastive loss with optional extra negatives.

    The anchor -> passage direction scores each anchor against all positives
    of the batch, followed by all rows of ``emb_neg`` when it is given and
    ``n_hn > 0``. The passage -> anchor direction only uses the in-batch
    anchors. Both cross-entropies (target = own index) are averaged.

    Returns:
        ``(loss, acc)`` where ``acc`` is the fraction of anchors whose best
        in-batch positive is their own, as a Python float.
    """
    batch = emb_a.size(0)
    gold = torch.arange(batch, device=emb_a.device)

    with_negs = emb_neg is not None and n_hn > 0
    candidates = torch.cat([emb_p, emb_neg], dim=0) if with_negs else emb_p

    logits_a = emb_a @ candidates.t() / temperature
    loss_a2p = F.cross_entropy(logits_a, gold)
    logits_p = emb_p @ emb_a.t() / temperature
    loss_p2a = F.cross_entropy(logits_p, gold)
    loss = 0.5 * (loss_a2p + loss_p2a)

    with torch.no_grad():
        # Accuracy looks at the in-batch positives only (first `batch` columns).
        hits = logits_a[:, :batch].argmax(dim=1) == gold
        acc = hits.float().mean().item()
    return loss, acc
+# =============================================================================
+# 9. MLM PRÉ-ENTRAÎNEMENT (priorité corpus interne)
+# =============================================================================
def mlm_pretrain(model, tokenizer, internal_texts, public_texts, cfg: Config):
    """Warm the encoder up with masked-language-modelling before contrastive training.

    When both text lists are non-empty, the internal texts are repeated so
    that they make up roughly half of the MLM corpus (capped at
    ``len(public_texts)``). When only one list is non-empty, it is used in
    full. Neither input list is modified.

    Masking selects ``cfg.mlm_prob`` of the non-special tokens; of those,
    80 % become the mask token, 10 % a random token id and 10 % stay as-is.

    Args:
        model: encoder exposing ``forward_mlm(ids, mask)``.
        tokenizer: tokenizer providing ``all_special_ids``, ``mask_token_id``
            and ``vocab_size``.
        internal_texts: texts from the internal corpus.
        public_texts: texts from public datasets.
        cfg: training configuration (batch size, MLM lr/epochs, ...).
    """
    if internal_texts and public_texts:
        target_size = len(public_texts)
        repeats = max(1, target_size // len(internal_texts))
        internal_repeated = internal_texts * repeats
        random.shuffle(internal_repeated)
        all_texts = internal_repeated[:target_size] + public_texts
    else:
        # Only one source available: use it entirely. Copy so the final
        # shuffle never reorders the caller's list.
        all_texts = list(internal_texts or public_texts)
    if not all_texts:
        print("[MLM] Aucun texte disponible, pré-entraînement ignoré.")
        return
    random.shuffle(all_texts)
    print(f"\n[MLM] Pré-entraînement sur {len(all_texts):,} textes "
          f"(interne : {len(internal_texts):,})")

    class MLMDataset(Dataset):
        def __init__(self, t):
            self.t = t

        def __len__(self):
            return len(self.t)

        def __getitem__(self, i):
            return self.t[i]

    def mlm_collate(batch):
        enc = tokenizer(batch, padding=True, truncation=True,
                        max_length=cfg.max_seq_len, return_tensors="pt")
        ids = enc["input_ids"].clone()
        labels = ids.clone()
        # Special tokens are never selected for masking.
        special = torch.zeros_like(ids, dtype=torch.bool)
        for sid in tokenizer.all_special_ids:
            special |= (ids == sid)
        prob = torch.full(ids.shape, cfg.mlm_prob)
        prob.masked_fill_(special, 0.0)
        masked = torch.bernoulli(prob).bool()
        labels[~masked] = -100  # loss only on selected positions
        # One uniform draw per token decides 80 % mask / 10 % random / 10 % keep.
        rand = torch.rand(ids.shape)
        ids[masked & (rand < 0.8)] = tokenizer.mask_token_id
        rr = masked & (rand >= 0.8) & (rand < 0.9)
        rt = torch.randint(0, tokenizer.vocab_size, ids.shape)
        ids[rr] = rt[rr]
        return ids, enc["attention_mask"], labels

    loader = DataLoader(MLMDataset(all_texts), batch_size=cfg.batch_size,
                        shuffle=True, num_workers=cfg.num_workers,
                        collate_fn=mlm_collate, pin_memory=True,
                        drop_last=True,
                        # persistent workers are only valid with num_workers > 0
                        persistent_workers=cfg.num_workers > 0)
    optim = AdamW(model.parameters(), lr=cfg.mlm_lr, weight_decay=0.01,
                  betas=(0.9, 0.98), eps=1e-6)
    total_steps = len(loader) * cfg.mlm_epochs
    sched = get_cosine_schedule_with_warmup(optim, int(total_steps * 0.04), total_steps)
    model.train()
    # NOTE(review): with use_bf16=False this runs float16 autocast without a
    # gradient scaler.
    autocast_dtype = torch.bfloat16 if cfg.use_bf16 else torch.float16
    for ep in range(cfg.mlm_epochs):
        running = 0.0
        pbar = tqdm(loader, desc=f"MLM ep{ep+1}/{cfg.mlm_epochs}")
        for step, (ids, mask, labels) in enumerate(pbar, 1):
            ids = ids.to(device, non_blocking=True)
            mask = mask.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)
            optim.zero_grad(set_to_none=True)
            with torch.autocast(device_type="cuda", dtype=autocast_dtype):
                logits = model.forward_mlm(ids, mask)
                loss = F.cross_entropy(logits.view(-1, logits.size(-1)),
                                       labels.view(-1), ignore_index=-100)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optim.step()
            sched.step()
            running += loss.item()
            if step % 50 == 0:
                # Perplexity display is clamped to exp(20) to avoid overflow.
                pbar.set_postfix(loss=f"{running/step:.4f}",
                                 ppl=f"{math.exp(min(20, running/step)):.1f}")
    print("[MLM] Terminé.\n")
+# =============================================================================
+# 10. EVAL
+# =============================================================================
@torch.no_grad()
def evaluate_retrieval(model, tokenizer, eval_pairs, cfg: Config):
    """Score anchor -> positive retrieval inside ``eval_pairs``.

    Every anchor is compared (dot product) with every positive of the
    evaluation set; the positive of the same pair is the only relevant one.
    The model is switched to eval mode and left in that mode.

    Returns:
        Dict with ``R@1``, ``R@5``, ``R@10`` (share of anchors whose own
        positive is ranked in the top 1/5/10) and ``MRR`` (mean reciprocal
        rank).
    """
    model.eval()
    amp_dtype = torch.bfloat16 if cfg.use_bf16 else torch.float16

    def embed(texts, chunk_size=32):
        parts = []
        for lo in range(0, len(texts), chunk_size):
            enc = tokenizer(texts[lo:lo + chunk_size], padding=True, truncation=True,
                            max_length=cfg.max_seq_len, return_tensors="pt").to(device)
            with torch.autocast(device_type="cuda", dtype=amp_dtype):
                out = model(enc["input_ids"], enc["attention_mask"])
            # Back to float32 before the similarity matrix is built.
            parts.append(out.float())
        return torch.cat(parts, dim=0)

    query_emb = embed([pair["anchor"] for pair in eval_pairs])
    passage_emb = embed([pair["positive"] for pair in eval_pairs])

    scores = query_emb @ passage_emb.t()
    gold = torch.arange(scores.size(0), device=scores.device)
    ranking = scores.argsort(dim=1, descending=True)
    # Column index at which each row's own passage appears (0 = ranked first).
    gold_rank = (ranking == gold.unsqueeze(1)).nonzero()[:, 1]

    def recall_at(k):
        return (gold_rank < k).float().mean().item()

    return {
        "R@1": (gold_rank == 0).float().mean().item(),
        "R@5": recall_at(5),
        "R@10": recall_at(10),
        "MRR": (1.0 / (gold_rank.float() + 1)).mean().item(),
    }
+# =============================================================================
+# 11. TRAIN
+# =============================================================================
def train():
    """Run the whole pipeline: data, optional MLM warm-up, contrastive training.

    Steps:
      1. Load the tokenizer and all pairs, and reserve the head of the
         shuffled list for evaluation.
      2. Collect MLM texts and oversample internal pairs on the training
         split, then (optionally) mine hard negatives.
      3. Optionally pre-train with MLM, then train with the symmetric
         contrastive loss, gradient accumulation, cosine schedule and EMA.
      4. After each epoch, evaluate and save with the EMA weights applied;
         ``model_best.pt`` follows the best MRR.

    Raises:
        RuntimeError: if no pair is left for training after the eval split.
    """
    tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_name)
    CFG.vocab_size = tokenizer.vocab_size
    print(f"[TOK ] vocab_size = {CFG.vocab_size}")

    items_all = load_doc_pairs(CFG)
    n_eval = min(CFG.eval_max_size, max(2000, int(len(items_all) * 0.005)))
    eval_items = items_all[:n_eval]
    train_items = items_all[n_eval:]
    if not train_items:
        raise RuntimeError(
            f"Pas assez de paires pour entraîner : {len(items_all):,} chargées, "
            f"{len(eval_items):,} réservées à l'évaluation.")

    # Split MLM texts into internal vs public NOW: hard-negative mining
    # rebuilds the items and may not keep the "_internal" flag.
    internal_texts = []
    public_texts = []
    if CFG.do_mlm_pretrain:
        for it in train_items[:500_000]:
            bucket = internal_texts if it.get("_internal") else public_texts
            bucket.append(it["anchor"])
            bucket.append(it["positive"])

    # Oversample internal pairs on the training split only. load_doc_pairs
    # returns de-duplicated pairs, and repeating them after the eval split
    # keeps copies of evaluation pairs out of the training data.
    extra_copies = max(0, CFG.internal_oversample - 1)
    internal_items = [it for it in train_items if it.get("_internal")]
    if extra_copies and internal_items:
        train_items = train_items + internal_items * extra_copies
        print(f"[DATA] sur-échantillonnage interne x{CFG.internal_oversample} : "
              f"+{len(internal_items) * extra_copies:,} paires")
    print(f"[DATA] train={len(train_items):,}  eval={len(eval_items):,}")

    if CFG.use_hard_negatives:
        train_items = mine_hard_negatives_multi(train_items, CFG)
    n_hn = CFG.n_hard_neg if CFG.use_hard_negatives else 0
    collate = make_collate_fn(tokenizer, CFG.max_seq_len, n_hn)
    train_loader = DataLoader(
        PairDataset(train_items, n_hn),
        batch_size=CFG.batch_size, shuffle=True,
        num_workers=CFG.num_workers, collate_fn=collate,
        pin_memory=True, drop_last=True, persistent_workers=True,
    )

    model = TextEncoder(CFG).to(device)
    n_params = count_parameters(model)
    print(f"[MODEL] Paramètres entraînables : {n_params/1e6:.2f} M")
    if CFG.do_mlm_pretrain:
        mlm_pretrain(model, tokenizer, internal_texts, public_texts, CFG)
    if CFG.use_compile and hasattr(torch, "compile"):
        model = torch.compile(model, mode=CFG.compile_mode)
    # EMA and checkpoints always work on the uncompiled module.
    raw_model = model._orig_mod if hasattr(model, "_orig_mod") else model
    ema = EMA(raw_model, decay=CFG.ema_decay) if CFG.use_ema else None

    # No weight decay on biases, normalisation and LayerScale parameters;
    # the MLM head is excluded from contrastive optimisation.
    no_decay = ["bias", "LayerNorm.weight", "ln1", "ln2", "ln_f", "emb_ln",
                "gamma1", "gamma2"]
    grouped = [
        {"params": [p for n, p in model.named_parameters()
                    if "mlm_head" not in n and not any(nd in n for nd in no_decay)],
         "weight_decay": CFG.weight_decay},
        {"params": [p for n, p in model.named_parameters()
                    if "mlm_head" not in n and any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]
    optimizer = AdamW(grouped, lr=CFG.lr, betas=(0.9, 0.98), eps=1e-6)
    steps_per_epoch = len(train_loader) // CFG.grad_accum_steps
    total_steps = steps_per_epoch * CFG.epochs
    warmup_steps = int(total_steps * CFG.warmup_ratio)
    scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
    print(f"[OPTIM] total_steps={total_steps}  warmup={warmup_steps}")
    autocast_dtype = torch.bfloat16 if CFG.use_bf16 else torch.float16

    best_mrr = 0.0
    history = []
    for epoch in range(1, CFG.epochs + 1):
        model.train()
        running_loss = running_acc = 0.0
        n_seen = 0
        pbar = tqdm(train_loader, desc=f"Epoch {epoch}/{CFG.epochs}")
        optimizer.zero_grad(set_to_none=True)
        for step, batch in enumerate(pbar, start=1):
            if n_hn > 0:
                a, p, neg = batch
                neg = {k: v.to(device, non_blocking=True) for k, v in neg.items()}
            else:
                a, p = batch
                neg = None
            a = {k: v.to(device, non_blocking=True) for k, v in a.items()}
            p = {k: v.to(device, non_blocking=True) for k, v in p.items()}
            with torch.autocast(device_type="cuda", dtype=autocast_dtype):
                emb_a = model(a["input_ids"], a["attention_mask"])
                emb_p = model(p["input_ids"], p["attention_mask"])
                emb_n = (model(neg["input_ids"], neg["attention_mask"])
                         if neg is not None else None)
                loss, acc = symmetric_mnrl_multi_hn(
                    emb_a, emb_p, emb_n, n_hn=n_hn, temperature=CFG.temperature)
                # Scale so accumulated gradients average over micro-batches.
                loss = loss / CFG.grad_accum_steps
            loss.backward()
            if step % CFG.grad_accum_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.grad_clip)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad(set_to_none=True)
                if ema is not None:
                    ema.update(raw_model)
            running_loss += loss.item() * CFG.grad_accum_steps
            running_acc += acc
            n_seen += 1
            if step % CFG.log_every == 0:
                pbar.set_postfix(loss=f"{running_loss/n_seen:.4f}",
                                 acc=f"{running_acc/n_seen:.3f}",
                                 lr=f"{scheduler.get_last_lr()[0]:.2e}")

        # Evaluate with EMA weights, then put the training weights back.
        backup = ema.apply_to(raw_model) if ema is not None else None
        metrics = evaluate_retrieval(model, tokenizer, eval_items, CFG)
        if backup is not None:
            ema.restore(raw_model, backup)
        print(f"\n[EVAL] epoch {epoch} : R@1={metrics['R@1']:.3f}  "
              f"R@5={metrics['R@5']:.3f}  R@10={metrics['R@10']:.3f}  "
              f"MRR={metrics['MRR']:.3f}")
        history.append({"epoch": epoch, **metrics,
                        "train_loss": running_loss / max(1, n_seen)})
        is_best = metrics["MRR"] > best_mrr
        if is_best:
            best_mrr = metrics["MRR"]

        # Checkpoints store the EMA weights (when enabled), without the MLM head.
        if ema is not None:
            backup = ema.apply_to(raw_model)
        state = {k: v for k, v in raw_model.state_dict().items() if "mlm_head" not in k}
        checkpoint = {"epoch": epoch, "model_state": state,
                      "config": asdict(CFG), "metrics": metrics}
        if epoch % CFG.save_every_epochs == 0 or is_best or epoch == CFG.epochs:
            torch.save(checkpoint, Path(CFG.save_dir) / f"model_epoch{epoch}.pt")
        if is_best:
            torch.save(checkpoint, Path(CFG.save_dir) / "model_best.pt")
        if ema is not None:
            ema.restore(raw_model, backup)
        print(f"[SAVE] epoch {epoch}  best={'oui' if is_best else 'non'}")

    with open(Path(CFG.save_dir) / "history.json", "w", encoding="utf-8") as f:
        json.dump(history, f, ensure_ascii=False, indent=2)
    tokenizer.save_pretrained(CFG.save_dir)
    print(f"\n[OK] Best MRR = {best_mrr:.3f} -> {CFG.save_dir}/model_best.pt")
+# =============================================================================
+# 12. DÉMO
+# =============================================================================
@torch.no_grad()
def demo():
    """Load ``model_best.pt`` and print the top-3 passages for a few sample queries.

    The model is rebuilt from the configuration stored in the checkpoint
    (keys unknown to ``Config`` are ignored) and the autocast dtype follows
    the saved ``use_bf16`` flag, like training and evaluation do.
    """
    tokenizer = AutoTokenizer.from_pretrained(CFG.save_dir)
    ckpt = torch.load(Path(CFG.save_dir) / "model_best.pt", map_location=device)
    saved_cfg = ckpt["config"]
    cfg2 = Config(**{k: v for k, v in saved_cfg.items() if hasattr(Config, k)})
    cfg2.vocab_size = tokenizer.vocab_size
    model = TextEncoder(cfg2).to(device).eval()
    # strict=False: checkpoints are saved without the MLM head.
    model.load_state_dict(ckpt["model_state"], strict=False)
    autocast_dtype = torch.bfloat16 if cfg2.use_bf16 else torch.float16

    def encode(texts):
        enc = tokenizer(texts, padding=True, truncation=True,
                        max_length=cfg2.max_seq_len, return_tensors="pt").to(device)
        with torch.autocast(device_type="cuda", dtype=autocast_dtype):
            return model(enc["input_ids"], enc["attention_mask"])

    corpus = [
        "ARTICLE 12 - Les congés payés sont acquis à raison de 2,5 jours par mois travaillé.",
        "Procédure de validation des notes de frais : transmettre via le portail RH avant le 5 du mois.",
        "La politique RGPD impose un délai de 72h pour notifier une violation de données.",
        "Le télétravail est autorisé jusqu'à 3 jours par semaine sur accord du manager.",
        "Toute facture fournisseur doit être validée par le responsable budget avant paiement.",
        "Formation obligatoire sécurité incendie : 1 fois par an, traçabilité dans le SIRH.",
        "L'accord d'entreprise du 15/03/2024 fixe le taux de prime annuelle à 8% du salaire brut.",
    ]
    queries = [
        "Combien de jours de congés je gagne par mois ?",
        "Comment déclarer mes notes de frais ?",
        "Quel est le quota de télétravail ?",
        "Quel taux de prime annuelle ?",
    ]
    c_emb = encode(corpus)
    print("\n[DEMO DOC-INTERNE-100M]")
    for q in queries:
        q_emb = encode([q])
        sims = (q_emb @ c_emb.t()).squeeze(0)
        top = sims.topk(3)
        print(f"\nQ : {q}")
        for s, i in zip(top.values, top.indices):
            print(f"  ({s.item():.3f}) -> {corpus[i.item()]}")
def _main():
    """Train the model, then run the demo as a best-effort smoke test."""
    train()
    try:
        demo()
    except Exception as e:
        # The demo is optional: report the problem and exit normally.
        print(f"[demo] {e}")


if __name__ == "__main__":
    _main()

modeleAIRAG/train3_200m.py ADDED Viewed

	@@ -0,0 +1,922 @@

+"""
+==============================================================================
+  RAG/NLP encoder ~200M params - SPÉCIALISÉ DOCUMENTAIRE INTERNE ENTREPRISE
+  (RH, juridique, procédures, comptabilité, qualité, conformité, formation)
+  Hardware : NVIDIA H100 80GB
+  Epochs   : 12
+==============================================================================
+Spécificités vs version IT :
+  - max_seq_len = 384 (documents internes longs : procédures, contrats)
+  - Filtres lexicaux orientés "entreprise / documentation"
+  - Datasets : Common Crawl FR (filtré), Wikipédia FR (catégories doc),
+              FQuAD/PIAF (questions admin/juridique), MultiLegalPile-FR,
+              corpus interne JSONL (priorité absolue)
+  - Augmentation : "title -> contenu" et "section -> paragraphe"
+  - Loss : MNRL symétrique + 2 hard negatives par paire
+  - Pré-entraînement MLM sur corpus interne en priorité
+  - EMA decay 0.9995, LayerScale, BF16, SDPA, Gradient Checkpointing
+  - 12 epochs, batch effectif 384
+Architecture ~200M params (16L, 1024d, 16H, FFN=4096).
+Usage :
+  pip install torch>=2.2 transformers>=4.40 datasets>=2.18 accelerate \\
+              sentencepiece tqdm numpy scikit-learn faiss-cpu beautifulsoup4
+  python train3_200m.py
+Préparation du corpus interne :
+  Place tes documents dans ./data/corpus_interne/ (PDF/DOCX/TXT/MD)
+  Ou directement un JSONL ./data/custom_doc.jsonl avec {"anchor","positive"}
+"""
+import os
+import math
+import json
+import random
+import re
+import glob
+from dataclasses import dataclass, asdict
+from pathlib import Path
+from typing import List, Dict, Tuple, Optional
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as gc
+from torch.utils.data import Dataset, DataLoader
+from torch.optim import AdamW
+from transformers import AutoTokenizer, get_cosine_schedule_with_warmup
+from datasets import load_dataset, Dataset as HFDataset
+from tqdm.auto import tqdm
+# =============================================================================
+# 1. CONFIG — ~200M, Documentaire interne
+# =============================================================================
@dataclass
class Config:
    """Hyper-parameters of the run: encoder size, MLM warm-up, contrastive
    training, hard negatives, EMA, checkpointing and internal-corpus paths."""
    # --- Model: 16 layers, hidden size 1024, 16 heads, FFN 4096 ---
    vocab_size: int = 32000
    hidden_size: int = 1024
    num_hidden_layers: int = 16
    num_attention_heads: int = 16
    intermediate_size: int = 4096
    max_position_embeddings: int = 512      # long documents
    hidden_dropout_prob: float = 0.1
    attention_probs_dropout_prob: float = 0.1
    layer_norm_eps: float = 1e-12
    embedding_dim: int = 1024
    use_layer_scale: bool = True
    layer_scale_init: float = 1e-5
    use_grad_checkpointing: bool = True
    tokenizer_name: str = "camembert-base"
    # --- MLM warm-up (internal corpus first) ---
    do_mlm_pretrain: bool = True
    mlm_epochs: int = 2                     # one more than the "IT" variant: internal docs are scarcer
    mlm_prob: float = 0.15
    mlm_lr: float = 8e-5
    # --- Contrastive training ---
    epochs: int = 12
    batch_size: int = 32                    # sequences of 384 tokens -> smaller batch
    grad_accum_steps: int = 12               # effective batch = 32 * 12 = 384
    max_seq_len: int = 384
    lr: float = 1.5e-5
    weight_decay: float = 0.01
    warmup_ratio: float = 0.06
    grad_clip: float = 1.0
    temperature: float = 0.02
    num_workers: int = 6
    seed: int = 42
    # --- Hard negatives (2 per pair for internal documentation) ---
    use_hard_negatives: bool = True
    n_hard_neg: int = 2                     # stronger contrastive signal
    hard_neg_pool_size: int = 200_000
    use_ema: bool = True
    ema_decay: float = 0.9995
    max_samples_per_dataset: int = 250_000
    eval_max_size: int = 5_000
    use_bf16: bool = True
    use_compile: bool = True
    compile_mode: str = "default"
    log_every: int = 50
    save_dir: str = "./checkpoints_rag_doc_200m"
    save_every_epochs: int = 2
    # --- Internal corpus ---
    custom_jsonl_path: str = "./data/custom_doc.jsonl"
    custom_corpus_dir: str = "./data/corpus_interne"  # PDF/DOCX/TXT/MD
    internal_oversample: int = 8            # repeat factor meant to boost learning on internal pairs
# Single configuration instance shared by the whole script; the checkpoint
# directory is created up front.
CFG = Config()
Path(CFG.save_dir).mkdir(parents=True, exist_ok=True)

# Seed Python, NumPy and PyTorch (CPU and every CUDA device) with the same value.
random.seed(CFG.seed)
np.random.seed(CFG.seed)
torch.manual_seed(CFG.seed)
torch.cuda.manual_seed_all(CFG.seed)

# Allow TF32 kernels for float32 matmuls and cuDNN.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.set_float32_matmul_precision("high")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"[INFO] Device : {device}")
if torch.cuda.is_available():
    print(f"[INFO] GPU    : {torch.cuda.get_device_name(0)}")
    print(f"[INFO] VRAM   : {torch.cuda.get_device_properties(0).total_memory/1e9:.1f} GB")
+# =============================================================================
+# 2. ARCHITECTURE
+# =============================================================================
class TransformerEncoderBlock(nn.Module):
    """Pre-LayerNorm Transformer encoder layer with optional LayerScale.

    Self-attention and the feed-forward network each read a LayerNorm'd copy
    of the input and are added back as residuals; when
    ``cfg.use_layer_scale`` is set, each branch is multiplied by a learned
    per-channel vector (``gamma1`` / ``gamma2``) before the addition.
    """

    def __init__(self, cfg):
        super().__init__()
        hidden = cfg.hidden_size
        self.num_heads = cfg.num_attention_heads
        self.head_dim = hidden // cfg.num_attention_heads
        self.ln1 = nn.LayerNorm(hidden, eps=cfg.layer_norm_eps)
        # Fused query/key/value projection.
        self.qkv = nn.Linear(hidden, 3 * hidden, bias=True)
        self.proj = nn.Linear(hidden, hidden, bias=True)
        self.attn_drop_p = cfg.attention_probs_dropout_prob
        self.ln2 = nn.LayerNorm(hidden, eps=cfg.layer_norm_eps)
        self.mlp = nn.Sequential(
            nn.Linear(hidden, cfg.intermediate_size),
            nn.GELU(),
            nn.Linear(cfg.intermediate_size, hidden),
            nn.Dropout(cfg.hidden_dropout_prob),
        )
        self.resid_drop = nn.Dropout(cfg.hidden_dropout_prob)
        self.use_ls = cfg.use_layer_scale
        if self.use_ls:
            self.gamma1 = nn.Parameter(cfg.layer_scale_init * torch.ones(hidden))
            self.gamma2 = nn.Parameter(cfg.layer_scale_init * torch.ones(hidden))

    def forward(self, x, attn_mask):
        """Apply the block to ``x`` of shape (B, T, C).

        ``attn_mask`` is indexed as (B, T) and converted to bool; positions
        that are False are not attended to.
        """
        bsz, seq_len, width = x.shape

        # --- self-attention branch ---
        fused = self.qkv(self.ln1(x)).view(bsz, seq_len, 3, self.num_heads, self.head_dim)
        heads = fused.permute(2, 0, 3, 1, 4)  # (3, B, H, T, head_dim)
        q, k, v = heads[0], heads[1], heads[2]
        key_mask = attn_mask[:, None, None, :].bool()
        drop_p = self.attn_drop_p if self.training else 0.0
        attn = F.scaled_dot_product_attention(
            q, k, v, attn_mask=key_mask, dropout_p=drop_p, is_causal=False)
        attn = attn.transpose(1, 2).contiguous().view(bsz, seq_len, width)
        attn = self.resid_drop(self.proj(attn))
        if self.use_ls:
            attn = attn * self.gamma1
        x = x + attn

        # --- feed-forward branch ---
        ff = self.mlp(self.ln2(x))
        if self.use_ls:
            ff = ff * self.gamma2
        return x + ff
+class TextEncoder(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        self.cfg = cfg
+        self.tok_emb = nn.Embedding(cfg.vocab_size, cfg.hidden_size, padding_idx=0)
+        self.pos_emb = nn.Embedding(cfg.max_position_embeddings, cfg.hidden_size)
+        self.emb_ln = nn.LayerNorm(cfg.hidden_size, eps=cfg.layer_norm_eps)
+        self.emb_drop = nn.Dropout(cfg.hidden_dropout_prob)
+        self.blocks = nn.ModuleList([TransformerEncoderBlock(cfg)
+                                     for _ in range(cfg.num_hidden_layers)])
+        self.ln_f = nn.LayerNorm(cfg.hidden_size, eps=cfg.layer_norm_eps)
+        self.proj_head = nn.Sequential(
+            nn.Linear(cfg.hidden_size, cfg.hidden_size),
+            nn.Tanh(),
+            nn.Linear(cfg.hidden_size, cfg.embedding_dim),
+        )
+        self.mlm_head = nn.Linear(cfg.hidden_size, cfg.vocab_size, bias=False)
+        self.mlm_head.weight = self.tok_emb.weight
+        self.use_gc = cfg.use_grad_checkpointing
+        self.apply(self._init_weights)
+    @staticmethod
+    def _init_weights(m):
+        if isinstance(m, nn.Linear):
+            nn.init.normal_(m.weight, std=0.02)
+            if m.bias is not None: nn.init.zeros_(m.bias)
+        elif isinstance(m, nn.Embedding):
+            nn.init.normal_(m.weight, std=0.02)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.ones_(m.weight); nn.init.zeros_(m.bias)
+    def encode_backbone(self, ids, mask):
+        B, T = ids.shape
+        pos = torch.arange(T, device=ids.device).unsqueeze(0).expand(B, T)
+        x = self.tok_emb(ids) + self.pos_emb(pos)
+        x = self.emb_drop(self.emb_ln(x))
+        for blk in self.blocks:
+            if self.use_gc and self.training:
+                x = gc.checkpoint(blk, x, mask, use_reentrant=False)
+            else:
+                x = blk(x, mask)
+        return self.ln_f(x)
+    def forward(self, ids, mask):
+        x = self.encode_backbone(ids, mask)
+        m = mask.unsqueeze(-1).float()
+        pooled = (x * m).sum(dim=1) / m.sum(dim=1).clamp(min=1e-6)
+        emb = self.proj_head(pooled)
+        return F.normalize(emb, p=2, dim=-1)
+    def forward_mlm(self, ids, mask):
+        return self.mlm_head(self.encode_backbone(ids, mask))
+def count_parameters(model):
+    return sum(p.numel() for n, p in model.named_parameters()
+               if p.requires_grad and "mlm_head" not in n)
+# =============================================================================
+# 3. EMA
+# =============================================================================
+class EMA:
+    def __init__(self, model, decay=0.999):
+        self.decay = decay
+        self.shadow = {n: p.detach().clone()
+                       for n, p in model.named_parameters() if p.requires_grad}
+    @torch.no_grad()
+    def update(self, model):
+        for n, p in model.named_parameters():
+            if p.requires_grad and n in self.shadow:
+                self.shadow[n].mul_(self.decay).add_(p.detach(), alpha=1.0 - self.decay)
+    @torch.no_grad()
+    def apply_to(self, model):
+        backup = {}
+        for n, p in model.named_parameters():
+            if n in self.shadow:
+                backup[n] = p.detach().clone(); p.copy_(self.shadow[n])
+        return backup
+    @torch.no_grad()
+    def restore(self, model, backup):
+        for n, p in model.named_parameters():
+            if n in backup: p.copy_(backup[n])
+# =============================================================================
+# 4. EXTRACTION CORPUS INTERNE (PDF / DOCX / TXT / MD)
+# =============================================================================
+def extract_text_from_file(path: Path) -> str:
+    """Extracteur multi-format. Retourne texte brut ou ''."""
+    suffix = path.suffix.lower()
+    try:
+        if suffix in {".txt", ".md"}:
+            return path.read_text(encoding="utf-8", errors="ignore")
+        if suffix == ".pdf":
+            try:
+                from pypdf import PdfReader
+            except ImportError:
+                from PyPDF2 import PdfReader
+            reader = PdfReader(str(path))
+            return "\n".join((p.extract_text() or "") for p in reader.pages)
+        if suffix == ".docx":
+            from docx import Document
+            doc = Document(str(path))
+            return "\n".join(p.text for p in doc.paragraphs)
+        if suffix in {".html", ".htm"}:
+            from bs4 import BeautifulSoup
+            soup = BeautifulSoup(path.read_text(encoding="utf-8", errors="ignore"),
+                                 "html.parser")
+            return soup.get_text(separator="\n")
+    except Exception as e:
+        print(f"  [warn] extract {path.name} : {e}")
+    return ""
+def chunk_document(text: str, chunk_size: int = 1500,
+                   overlap: int = 200) -> List[Tuple[str, str]]:
+    """
+    Découpe un document en (titre/section, contenu) pour générer des paires.
+    Utilise les titres Markdown / numérotation pour détecter les sections.
+    """
+    text = re.sub(r"\n{3,}", "\n\n", text).strip()
+    if not text:
+        return []
+    # Détection sections (Markdown ##, numérotation 1., 1.1, ARTICLE, etc.)
+    section_re = re.compile(
+        r"(?m)^(#{1,4}\s+.+|"                          # markdown
+        r"\d+(?:\.\d+)*\.?\s+[A-ZÀ-Ÿa-zà-ÿ].+|"        # numérotation
+        r"ARTICLE\s+\d+[\s\-:].+|"                     # juridique
+        r"CHAPITRE\s+\d+[\s\-:].+|"                    # juridique
+        r"[A-ZÀ-Ÿ][A-ZÀ-Ÿ\s]{8,}$)"                    # ALL CAPS section
+    )
+    sections = []
+    matches = list(section_re.finditer(text))
+    if matches:
+        for i, m in enumerate(matches):
+            title = m.group(0).strip()
+            start = m.end()
+            end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
+            content = text[start:end].strip()
+            if title and content and len(content) > 80:
+                sections.append((title[:200], content))
+    # Si pas de sections détectées, fallback chunks fixes
+    if not sections:
+        for i in range(0, len(text), chunk_size - overlap):
+            chunk = text[i:i + chunk_size].strip()
+            if len(chunk) > 80:
+                # titre = première phrase
+                first_period = chunk.find(".")
+                title = chunk[:first_period if first_period > 20 else 80].strip()
+                sections.append((title, chunk))
+    return sections
+def load_internal_corpus(cfg: Config) -> Tuple[List[Dict[str, str]], List[str]]:
+    """Lit ./data/corpus_interne/* et génère paires + textes pour MLM."""
+    pairs = []
+    raw_texts = []
+    corpus_dir = Path(cfg.custom_corpus_dir)
+    if not corpus_dir.exists():
+        print(f"  [info] Dossier corpus interne absent : {corpus_dir}")
+        return pairs, raw_texts
+    files = []
+    for ext in ("*.pdf", "*.docx", "*.txt", "*.md", "*.html", "*.htm"):
+        files.extend(corpus_dir.rglob(ext))
+    print(f"  [+] {len(files)} fichiers internes trouvés")
+    for fp in tqdm(files, desc="corpus_interne"):
+        text = extract_text_from_file(fp)
+        if not text or len(text) < 200:
+            continue
+        raw_texts.append(text)
+        sections = chunk_document(text)
+        for title, content in sections:
+            pairs.append({
+                "anchor": title,
+                "positive": content[:2500],
+                "_internal": True,
+            })
+            # Paire bonus : "où trouver X ?" -> contenu
+            pairs.append({
+                "anchor": f"Où trouver des informations sur : {title} ?",
+                "positive": content[:2500],
+                "_internal": True,
+            })
+    return pairs, raw_texts
+# =============================================================================
+# 5. CHARGEMENT DATASETS PUBLICS (DOC GÉNÉRIQUE FR)
+# =============================================================================
+DOC_KEYWORDS = re.compile(
+    r"\b(article|chapitre|procédure|politique|règlement|directive|note de service|"
+    r"manuel|guide|formation|RH|ressources humaines|congé|absence|salaire|paie|"
+    r"contrat|CDI|CDD|convention|accord|qualité|conformité|audit|ISO|RGPD|"
+    r"comité|conseil|assemblée|direction|département|service|budget|"
+    r"facture|comptabilité|comptable|TVA|achat|vente|client|fournisseur|"
+    r"juridique|légal|loi|décret|arrêté|jurisprudence|tribunal|"
+    r"sécurité|incident|risque|santé|hygiène|formation)\b",
+    re.IGNORECASE,
+)
+def is_doc_text(t: str) -> bool:
+    return bool(DOC_KEYWORDS.search(t)) if t else False
+def load_doc_pairs(cfg: Config) -> List[Dict[str, str]]:
+    print("\n[DATA] Chargement des datasets DOC INTERNE...")
+    pairs: List[Dict[str, str]] = []
+    # 5.1 Corpus interne (priorité absolue, oversample)
+    internal_pairs, internal_texts = load_internal_corpus(cfg)
+    print(f"  [+] Corpus interne : {len(internal_pairs):,} paires brutes")
+    pairs.extend(internal_pairs * cfg.internal_oversample)
+    # 5.2 PIAF + FQuAD (paires question / contexte FR génériques)
+    try:
+        ds = load_dataset("etalab-ia/piaf", split="train")
+        for ex in tqdm(ds, desc="PIAF"):
+            q = (ex.get("question") or "").strip()
+            ctx = (ex.get("context") or "").strip()
+            if q and ctx:
+                pairs.append({"anchor": q, "positive": ctx})
+    except Exception as e:
+        print(f"  [warn] PIAF : {e}")
+    try:
+        ds = load_dataset("manu/fquad2_test", split="train")
+        for ex in tqdm(ds, desc="FQuAD2"):
+            q = (ex.get("question") or "").strip()
+            ctx = (ex.get("context") or "").strip()
+            if q and ctx:
+                pairs.append({"anchor": q, "positive": ctx})
+    except Exception as e:
+        print(f"  [warn] FQuAD2 : {e}")
+    # 5.3 mMARCO FR filtré "documentaire"
+    try:
+        ds = load_dataset("unicamp-dl/mmarco", "french", split="train")
+        ds = ds.select(range(min(500_000, len(ds))))
+        kept = 0
+        for ex in tqdm(ds, desc="mMARCO-FR (DOC-filter)"):
+            q = (ex.get("query") or "").strip()
+            p = (ex.get("positive") or ex.get("passage") or "").strip()
+            if q and p and (is_doc_text(q) or is_doc_text(p)):
+                pairs.append({"anchor": q, "positive": p})
+                kept += 1
+                if kept >= cfg.max_samples_per_dataset: break
+    except Exception as e:
+        print(f"  [warn] mMARCO : {e}")
+    # 5.4 Wikipedia FR — paires (résumé/lead -> section)
+    try:
+        ds = load_dataset("wikipedia", "20220301.fr", split="train",
+                          trust_remote_code=True)
+        ds = ds.select(range(min(100_000, len(ds))))
+        for ex in tqdm(ds, desc="Wikipedia-FR"):
+            title = (ex.get("title") or "").strip()
+            text = (ex.get("text") or "").strip()
+            if not title or not text or len(text) < 300:
+                continue
+            # Première section comme positif du titre
+            first_chunk = text[:2000]
+            pairs.append({"anchor": title, "positive": first_chunk})
+            # Sections suivantes si présentes
+            paragraphs = text.split("\n\n")
+            for para in paragraphs[1:6]:
+                if len(para) > 200:
+                    pairs.append({
+                        "anchor": f"Que dit l'article '{title}' à propos de cela ?",
+                        "positive": para[:2000],
+                    })
+    except Exception as e:
+        print(f"  [warn] Wikipedia FR : {e}")
+    # 5.5 MultiLegalPile FR (juridique)
+    try:
+        ds = load_dataset("joelniklaus/Multi_Legal_Pile", "fr_caselaw",
+                          split="train", streaming=True)
+        count = 0
+        for ex in tqdm(ds, desc="MultiLegalPile-FR", total=50_000):
+            text = (ex.get("text") or "").strip()
+            if len(text) < 500: continue
+            # Première phrase = anchor, reste = positif
+            first_period = text.find(".")
+            if 30 < first_period < 250:
+                anchor = text[:first_period + 1]
+                positive = text[first_period + 1:first_period + 2001]
+                if len(positive) > 100:
+                    pairs.append({"anchor": anchor, "positive": positive})
+            count += 1
+            if count >= 50_000: break
+    except Exception as e:
+        print(f"  [warn] MultiLegalPile : {e}")
+    # 5.6 XNLI FR (entailment)
+    try:
+        ds = load_dataset("xnli", "fr", split="train")
+        ds = ds.filter(lambda x: x["label"] == 0)
+        ds = ds.select(range(min(80_000, len(ds))))
+        for ex in tqdm(ds, desc="XNLI-FR"):
+            a = (ex.get("premise") or "").strip()
+            b = (ex.get("hypothesis") or "").strip()
+            if a and b:
+                pairs.append({"anchor": a, "positive": b})
+    except Exception as e:
+        print(f"  [warn] XNLI : {e}")
+    # 5.7 Custom JSONL
+    if Path(cfg.custom_jsonl_path).exists():
+        with open(cfg.custom_jsonl_path, "r", encoding="utf-8") as f:
+            for line in tqdm(f, desc="custom_doc.jsonl"):
+                try:
+                    ex = json.loads(line)
+                    a = (ex.get("anchor") or ex.get("query") or "").strip()
+                    p = (ex.get("positive") or ex.get("passage") or "").strip()
+                    if a and p:
+                        pairs.append({"anchor": a, "positive": p, "_internal": True})
+                except Exception:
+                    continue
+    # Dédup
+    seen = set(); uniq = []
+    for p in pairs:
+        k = (p["anchor"][:200], p["positive"][:200])
+        if k not in seen:
+            seen.add(k); uniq.append(p)
+    random.shuffle(uniq)
+    n_internal = sum(1 for p in uniq if p.get("_internal"))
+    print(f"[DATA] Total paires uniques : {len(uniq):,}  (dont interne : {n_internal:,})")
+    return uniq
+# =============================================================================
+# 6. HARD NEGATIVE MINING (2 negs par paire)
+# =============================================================================
+def mine_hard_negatives_multi(pairs, cfg: Config):
+    print(f"\n[HN] Mining {cfg.n_hard_neg} hard negatives par paire...")
+    try:
+        from sklearn.feature_extraction.text import TfidfVectorizer
+        from sklearn.metrics.pairwise import linear_kernel
+    except ImportError:
+        print("  [warn] sklearn manquant"); return pairs
+    n = len(pairs)
+    pool_size = min(cfg.hard_neg_pool_size, n)
+    pool_idx = np.random.choice(n, size=pool_size, replace=False)
+    pool_pass = [pairs[i]["positive"] for i in pool_idx]
+    vec = TfidfVectorizer(max_features=80_000, ngram_range=(1, 2),
+                          lowercase=True, strip_accents="unicode")
+    X_pool = vec.fit_transform(pool_pass)
+    enriched = []
+    batch = 2000
+    anchors = [p["anchor"] for p in pairs]
+    for start in tqdm(range(0, n, batch), desc="HN-mine"):
+        end = min(start + batch, n)
+        Xq = vec.transform(anchors[start:end])
+        sims = linear_kernel(Xq, X_pool)
+        for i_loc, i_glob in enumerate(range(start, end)):
+            true_pos = pairs[i_glob]["positive"]
+            order = np.argsort(-sims[i_loc])
+            picked = []
+            for j in order[:50]:
+                cand = pool_pass[j]
+                if cand != true_pos and cand not in picked:
+                    picked.append(cand)
+                    if len(picked) >= cfg.n_hard_neg: break
+            while len(picked) < cfg.n_hard_neg:
+                picked.append(pool_pass[random.randint(0, pool_size - 1)])
+            enriched.append({
+                "anchor": pairs[i_glob]["anchor"],
+                "positive": pairs[i_glob]["positive"],
+                "hard_negs": picked,
+                "_internal": pairs[i_glob].get("_internal", False),
+            })
+    return enriched
+# =============================================================================
+# 7. DATASET / COLLATE (multi-hn)
+# =============================================================================
+class PairDataset(Dataset):
+    def __init__(self, items, n_hn): self.items, self.n_hn = items, n_hn
+    def __len__(self): return len(self.items)
+    def __getitem__(self, i):
+        ex = self.items[i]
+        if self.n_hn > 0:
+            negs = ex.get("hard_negs", [ex["positive"]] * self.n_hn)
+            return ex["anchor"], ex["positive"], negs[:self.n_hn]
+        return ex["anchor"], ex["positive"]
+def make_collate_fn(tokenizer, max_len, n_hn):
+    def collate(batch):
+        a_l = [b[0] for b in batch]; p_l = [b[1] for b in batch]
+        a = tokenizer(a_l, padding=True, truncation=True,
+                      max_length=max_len, return_tensors="pt")
+        p = tokenizer(p_l, padding=True, truncation=True,
+                      max_length=max_len, return_tensors="pt")
+        if n_hn > 0:
+            # Flatten : [n0_p1, n0_p2, n1_p1, n1_p2, ...] -> on tokenize tout
+            all_negs = []
+            for b in batch:
+                all_negs.extend(b[2])  # n_hn négatifs par exemple
+            n = tokenizer(all_negs, padding=True, truncation=True,
+                          max_length=max_len, return_tensors="pt")
+            return a, p, n
+        return a, p
+    return collate
+# =============================================================================
+# 8. LOSS — Symmetric MNRL avec multi-hard-negatives
+# =============================================================================
+def symmetric_mnrl_multi_hn(emb_a, emb_p, emb_neg=None, n_hn=0, temperature=0.02):
+    """
+    emb_neg : (N * n_hn, d) si fourni, sinon None.
+    Cibles a -> [P; N1; N2; ...] : N positifs + N*n_hn négatifs durs
+    """
+    N = emb_a.size(0)
+    labels = torch.arange(N, device=emb_a.device)
+    if emb_neg is not None and n_hn > 0:
+        targets = torch.cat([emb_p, emb_neg], dim=0)
+        sim_a = emb_a @ targets.t() / temperature
+        loss_a2p = F.cross_entropy(sim_a, labels)
+    else:
+        sim_a = emb_a @ emb_p.t() / temperature
+        loss_a2p = F.cross_entropy(sim_a, labels)
+    sim_p = emb_p @ emb_a.t() / temperature
+    loss_p2a = F.cross_entropy(sim_p, labels)
+    loss = 0.5 * (loss_a2p + loss_p2a)
+    with torch.no_grad():
+        acc = (sim_a[:, :N].argmax(dim=1) == labels).float().mean().item()
+    return loss, acc
+# =============================================================================
+# 9. MLM PRÉ-ENTRAÎNEMENT (priorité corpus interne)
+# =============================================================================
+def mlm_pretrain(model, tokenizer, internal_texts, public_texts, cfg: Config):
+    # 50% interne (oversampled) + 50% public pour spécialiser sans oublier
+    if internal_texts:
+        # On répète le corpus interne pour qu'il occupe ~50% du MLM
+        target_size = max(len(public_texts), 1)
+        repeats = max(1, target_size // max(len(internal_texts), 1))
+        internal_repeated = internal_texts * repeats
+        random.shuffle(internal_repeated)
+        public_texts = public_texts[:target_size]
+        all_texts = internal_repeated[:target_size] + public_texts
+    else:
+        all_texts = public_texts
+    random.shuffle(all_texts)
+    print(f"\n[MLM] Pré-entraînement sur {len(all_texts):,} textes "
+          f"(interne : {len(internal_texts):,})")
+    class MLMDataset(Dataset):
+        def __init__(self, t): self.t = t
+        def __len__(self): return len(self.t)
+        def __getitem__(self, i): return self.t[i]
+    def mlm_collate(batch):
+        enc = tokenizer(batch, padding=True, truncation=True,
+                        max_length=cfg.max_seq_len, return_tensors="pt")
+        ids = enc["input_ids"].clone(); labels = ids.clone()
+        special = torch.zeros_like(ids, dtype=torch.bool)
+        for sid in tokenizer.all_special_ids: special |= (ids == sid)
+        prob = torch.full(ids.shape, cfg.mlm_prob)
+        prob.masked_fill_(special, 0.0)
+        masked = torch.bernoulli(prob).bool()
+        labels[~masked] = -100
+        rand = torch.rand(ids.shape)
+        ids[masked & (rand < 0.8)] = tokenizer.mask_token_id
+        rr = masked & (rand >= 0.8) & (rand < 0.9)
+        rt = torch.randint(0, tokenizer.vocab_size, ids.shape)
+        ids[rr] = rt[rr]
+        return ids, enc["attention_mask"], labels
+    loader = DataLoader(MLMDataset(all_texts), batch_size=cfg.batch_size,
+                        shuffle=True, num_workers=cfg.num_workers,
+                        collate_fn=mlm_collate, pin_memory=True,
+                        drop_last=True, persistent_workers=True)
+    optim = AdamW(model.parameters(), lr=cfg.mlm_lr, weight_decay=0.01,
+                  betas=(0.9, 0.98), eps=1e-6)
+    total_steps = len(loader) * cfg.mlm_epochs
+    sched = get_cosine_schedule_with_warmup(optim, int(total_steps * 0.04), total_steps)
+    model.train()
+    autocast_dtype = torch.bfloat16 if cfg.use_bf16 else torch.float16
+    for ep in range(cfg.mlm_epochs):
+        running = 0.0
+        pbar = tqdm(loader, desc=f"MLM ep{ep+1}/{cfg.mlm_epochs}")
+        for step, (ids, mask, labels) in enumerate(pbar, 1):
+            ids = ids.to(device, non_blocking=True)
+            mask = mask.to(device, non_blocking=True)
+            labels = labels.to(device, non_blocking=True)
+            optim.zero_grad(set_to_none=True)
+            with torch.autocast(device_type="cuda", dtype=autocast_dtype):
+                logits = model.forward_mlm(ids, mask)
+                loss = F.cross_entropy(logits.view(-1, logits.size(-1)),
+                                       labels.view(-1), ignore_index=-100)
+            loss.backward()
+            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+            optim.step(); sched.step()
+            running += loss.item()
+            if step % 50 == 0:
+                pbar.set_postfix(loss=f"{running/step:.4f}",
+                                 ppl=f"{math.exp(min(20, running/step)):.1f}")
+    print("[MLM] Terminé.\n")
+# =============================================================================
+# 10. EVAL
+# =============================================================================
+@torch.no_grad()
+def evaluate_retrieval(model, tokenizer, eval_pairs, cfg: Config):
+    model.eval()
+    autocast_dtype = torch.bfloat16 if cfg.use_bf16 else torch.float16
+    queries = [e["anchor"] for e in eval_pairs]
+    passages = [e["positive"] for e in eval_pairs]
+    def encode(texts):
+        embs = []
+        for i in range(0, len(texts), 32):
+            chunk = texts[i:i+32]
+            enc = tokenizer(chunk, padding=True, truncation=True,
+                            max_length=cfg.max_seq_len, return_tensors="pt").to(device)
+            with torch.autocast(device_type="cuda", dtype=autocast_dtype):
+                e = model(enc["input_ids"], enc["attention_mask"])
+            embs.append(e.float())
+        return torch.cat(embs, dim=0)
+    Q = encode(queries); P = encode(passages)
+    sims = Q @ P.t()
+    N = sims.size(0)
+    targets = torch.arange(N, device=sims.device)
+    ranks = sims.argsort(dim=1, descending=True)
+    pos_in_rank = (ranks == targets.unsqueeze(1)).nonzero()[:, 1]
+    return {
+        "R@1": (pos_in_rank == 0).float().mean().item(),
+        "R@5": (pos_in_rank < 5).float().mean().item(),
+        "R@10": (pos_in_rank < 10).float().mean().item(),
+        "MRR": (1.0 / (pos_in_rank.float() + 1)).mean().item(),
+    }
+# =============================================================================
+# 11. TRAIN
+# =============================================================================
+def train():
+    tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_name)
+    CFG.vocab_size = tokenizer.vocab_size
+    print(f"[TOK ] vocab_size = {CFG.vocab_size}")
+    items_all = load_doc_pairs(CFG)
+    n_eval = min(CFG.eval_max_size, max(2000, int(len(items_all) * 0.005)))
+    eval_items = items_all[:n_eval]
+    train_items = items_all[n_eval:]
+    print(f"[DATA] train={len(train_items):,}  eval={len(eval_items):,}")
+    if CFG.use_hard_negatives:
+        train_items = mine_hard_negatives_multi(train_items, CFG)
+    n_hn = CFG.n_hard_neg if CFG.use_hard_negatives else 0
+    collate = make_collate_fn(tokenizer, CFG.max_seq_len, n_hn)
+    train_loader = DataLoader(
+        PairDataset(train_items, n_hn),
+        batch_size=CFG.batch_size, shuffle=True,
+        num_workers=CFG.num_workers, collate_fn=collate,
+        pin_memory=True, drop_last=True, persistent_workers=True,
+    )
+    model = TextEncoder(CFG).to(device)
+    n_params = count_parameters(model)
+    print(f"[MODEL] Paramètres entraînables : {n_params/1e6:.2f} M")
+    if CFG.do_mlm_pretrain:
+        # Sépare textes internes vs publics
+        internal_texts = []; public_texts = []
+        for it in train_items[:500_000]:
+            if it.get("_internal"):
+                internal_texts.append(it["anchor"])
+                internal_texts.append(it["positive"])
+            else:
+                public_texts.append(it["anchor"])
+                public_texts.append(it["positive"])
+        mlm_pretrain(model, tokenizer, internal_texts, public_texts, CFG)
+    if CFG.use_compile and hasattr(torch, "compile"):
+        model = torch.compile(model, mode=CFG.compile_mode)
+    raw_model = model._orig_mod if hasattr(model, "_orig_mod") else model
+    ema = EMA(raw_model, decay=CFG.ema_decay) if CFG.use_ema else None
+    no_decay = ["bias", "LayerNorm.weight", "ln1", "ln2", "ln_f", "emb_ln",
+                "gamma1", "gamma2"]
+    grouped = [
+        {"params": [p for n, p in model.named_parameters()
+                    if "mlm_head" not in n and not any(nd in n for nd in no_decay)],
+         "weight_decay": CFG.weight_decay},
+        {"params": [p for n, p in model.named_parameters()
+                    if "mlm_head" not in n and any(nd in n for nd in no_decay)],
+         "weight_decay": 0.0},
+    ]
+    optimizer = AdamW(grouped, lr=CFG.lr, betas=(0.9, 0.98), eps=1e-6)
+    steps_per_epoch = len(train_loader) // CFG.grad_accum_steps
+    total_steps = steps_per_epoch * CFG.epochs
+    warmup_steps = int(total_steps * CFG.warmup_ratio)
+    scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
+    print(f"[OPTIM] total_steps={total_steps}  warmup={warmup_steps}")
+    autocast_dtype = torch.bfloat16 if CFG.use_bf16 else torch.float16
+    best_mrr = 0.0
+    history = []
+    for epoch in range(1, CFG.epochs + 1):
+        model.train()
+        running_loss = running_acc = 0.0
+        n_seen = 0
+        pbar = tqdm(train_loader, desc=f"Epoch {epoch}/{CFG.epochs}")
+        optimizer.zero_grad(set_to_none=True)
+        for step, batch in enumerate(pbar, start=1):
+            if n_hn > 0:
+                a, p, neg = batch
+                neg = {k: v.to(device, non_blocking=True) for k, v in neg.items()}
+            else:
+                a, p = batch; neg = None
+            a = {k: v.to(device, non_blocking=True) for k, v in a.items()}
+            p = {k: v.to(device, non_blocking=True) for k, v in p.items()}
+            with torch.autocast(device_type="cuda", dtype=autocast_dtype):
+                emb_a = model(a["input_ids"], a["attention_mask"])
+                emb_p = model(p["input_ids"], p["attention_mask"])
+                emb_n = (model(neg["input_ids"], neg["attention_mask"])
+                         if neg is not None else None)
+                loss, acc = symmetric_mnrl_multi_hn(
+                    emb_a, emb_p, emb_n, n_hn=n_hn, temperature=CFG.temperature)
+                loss = loss / CFG.grad_accum_steps
+            loss.backward()
+            if step % CFG.grad_accum_steps == 0:
+                torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.grad_clip)
+                optimizer.step(); scheduler.step()
+                optimizer.zero_grad(set_to_none=True)
+                if ema is not None: ema.update(raw_model)
+            running_loss += loss.item() * CFG.grad_accum_steps
+            running_acc += acc; n_seen += 1
+            if step % CFG.log_every == 0:
+                pbar.set_postfix(loss=f"{running_loss/n_seen:.4f}",
+                                 acc=f"{running_acc/n_seen:.3f}",
+                                 lr=f"{scheduler.get_last_lr()[0]:.2e}")
+        backup = ema.apply_to(raw_model) if ema is not None else None
+        metrics = evaluate_retrieval(model, tokenizer, eval_items, CFG)
+        if backup is not None: ema.restore(raw_model, backup)
+        print(f"\n[EVAL] epoch {epoch} : R@1={metrics['R@1']:.3f}  "
+              f"R@5={metrics['R@5']:.3f}  R@10={metrics['R@10']:.3f}  "
+              f"MRR={metrics['MRR']:.3f}")
+        history.append({"epoch": epoch, **metrics,
+                        "train_loss": running_loss / max(1, n_seen)})
+        is_best = metrics["MRR"] > best_mrr
+        if is_best: best_mrr = metrics["MRR"]
+        if ema is not None: backup = ema.apply_to(raw_model)
+        state = {k: v for k, v in raw_model.state_dict().items() if "mlm_head" not in k}
+        if epoch % CFG.save_every_epochs == 0 or is_best or epoch == CFG.epochs:
+            torch.save({"epoch": epoch, "model_state": state,
+                        "config": asdict(CFG), "metrics": metrics},
+                       Path(CFG.save_dir) / f"model_epoch{epoch}.pt")
+        if is_best:
+            torch.save({"epoch": epoch, "model_state": state,
+                        "config": asdict(CFG), "metrics": metrics},
+                       Path(CFG.save_dir) / "model_best.pt")
+        if ema is not None: ema.restore(raw_model, backup)
+        print(f"[SAVE] epoch {epoch}  best={'oui' if is_best else 'non'}")
+    with open(Path(CFG.save_dir) / "history.json", "w", encoding="utf-8") as f:
+        json.dump(history, f, ensure_ascii=False, indent=2)
+    tokenizer.save_pretrained(CFG.save_dir)
+    print(f"\n[OK] Best MRR = {best_mrr:.3f} -> {CFG.save_dir}/model_best.pt")
+# =============================================================================
+# 12. DÉMO
+# =============================================================================
+@torch.no_grad()
+def demo():
+    tokenizer = AutoTokenizer.from_pretrained(CFG.save_dir)
+    ckpt = torch.load(Path(CFG.save_dir) / "model_best.pt", map_location=device)
+    saved_cfg = ckpt["config"]
+    cfg2 = Config(**{k: v for k, v in saved_cfg.items() if hasattr(Config, k)})
+    cfg2.vocab_size = tokenizer.vocab_size
+    model = TextEncoder(cfg2).to(device).eval()
+    model.load_state_dict(ckpt["model_state"], strict=False)
+    corpus = [
+        "ARTICLE 12 - Les congés payés sont acquis à raison de 2,5 jours par mois travaillé.",
+        "Procédure de validation des notes de frais : transmettre via le portail RH avant le 5 du mois.",
+        "La politique RGPD impose un délai de 72h pour notifier une violation de données.",
+        "Le télétravail est autorisé jusqu'à 3 jours par semaine sur accord du manager.",
+        "Toute facture fournisseur doit être validée par le responsable budget avant paiement.",
+        "Formation obligatoire sécurité incendie : 1 fois par an, traçabilité dans le SIRH.",
+        "L'accord d'entreprise du 15/03/2024 fixe le taux de prime annuelle à 8% du salaire brut.",
+    ]
+    queries = [
+        "Combien de jours de congés je gagne par mois ?",
+        "Comment déclarer mes notes de frais ?",
+        "Quel est le quota de télétravail ?",
+        "Quel taux de prime annuelle ?",
+    ]
+    enc = tokenizer(corpus, padding=True, truncation=True,
+                    max_length=cfg2.max_seq_len, return_tensors="pt").to(device)
+    with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
+        c_emb = model(enc["input_ids"], enc["attention_mask"])
+    print("\n[DEMO DOC-INTERNE-100M]")
+    for q in queries:
+        eq = tokenizer([q], padding=True, truncation=True,
+                       max_length=cfg2.max_seq_len, return_tensors="pt").to(device)
+        with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
+            q_emb = model(eq["input_ids"], eq["attention_mask"])
+        sims = (q_emb @ c_emb.t()).squeeze(0)
+        top = sims.topk(3)
+        print(f"\nQ : {q}")
+        for s, i in zip(top.values, top.indices):
+            print(f"  ({s.item():.3f}) -> {corpus[i.item()]}")
+if __name__ == "__main__":
+    # Train first; the demo then reads the checkpoint from CFG.save_dir.
+    train()
+    try:
+        demo()
+    except Exception as e:
+        # The demo is best-effort: a failure is reported on stdout and does
+        # not turn a finished training run into a failing script.
+        print(f"[demo] {e}")

rag_boolq_400m/checkpoints/training_info.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "latest_checkpoint": "rag_boolq_400m/checkpoints/clm_epoch_28.pt",
+  "latest_mtime": 1777473272.7815104,
+  "latest_mtime_iso": "2026-04-29T14:34:32.781510+00:00",
+  "size_bytes": 1643359381,
+  "epoch": 28
+}

rag_boolq_400m/local_finetuned/README.md ADDED Viewed

	@@ -0,0 +1,5 @@

+# RAG Custom v6.2 POWER
+Profil: power_400m
+Paramètres: 190.66M
+Sauvegarde locale complète.

rag_boolq_400m/local_finetuned/config.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "version": "6.2",
+  "profile": "power_400m",
+  "total_params_M": 190.66,
+  "encoder_config": {
+    "vocab_size": 36000,
+    "max_len": 640,
+    "d_model": 640,
+    "n_heads": 10,
+    "n_layers": 8,
+    "dim_ff": 2560,
+    "dropout": 0.1
+  },
+  "decoder_config": {
+    "vocab_size": 36000,
+    "max_len": 640,
+    "d_model": 768,
+    "n_heads": 12,
+    "n_layers": 14,
+    "dim_ff": 3072,
+    "dropout": 0.1
+  },
+  "project_dir": "/workspace/rag_boolq_400m",
+  "local_finetuned_dir": "/workspace/rag_boolq_400m/local_finetuned",
+  "generation": {
+    "max_new_tokens": 160,
+    "temperature": 0.72,
+    "top_k": 60,
+    "top_p": 0.92,
+    "beam_size": 3
+  },
+  "retrieval": {
+    "use_hybrid": true,
+    "rag_top_k": 12,
+    "sim_threshold": 0.045,
+    "min_support": 0.28
+  },
+  "metrics": {
+    "retrieval": {
+      "recall@12": 0.933,
+      "n": 120
+    },
+    "demo": {
+      "demo_pass": 4,
+      "demo_total": 5,
+      "demo_pct": 80.0
+    }
+  },
+  "saved_at": "2026-04-29 14:38:20"
+}

rag_boolq_400m/local_finetuned/tokenizer/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

rag_boolq_400m/local_finetuned/tokenizer/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "[BOS]",
+  "cls_token": "[CLS]",
+  "eos_token": "[EOS]",
+  "mask_token": "[MASK]",
+  "max_length": 640,
+  "model_max_length": 640,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "stride": 0,
+  "tokenizer_class": "TokenizersBackend",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
+  "unk_token": "[UNK]"
+}

rag_boolq_400m/local_finetuned/tokenizer/training_info.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "latest_checkpoint": null,
+  "latest_mtime": null,
+  "latest_mtime_iso": null,
+  "size_bytes": null,
+  "epoch": null
+}

rag_boolq_400m/local_finetuned/training_info.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "latest_checkpoint": "rag_boolq_400m/local_finetuned/decoder_finetuned.pt",
+  "latest_mtime": 1777473500.6976762,
+  "latest_mtime_iso": "2026-04-29T14:38:20.697676+00:00",
+  "size_bytes": 620134204,
+  "epoch": null
+}

rag_boolq_400m/local_finetuned/training_summary.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "version": "6.2",
+  "profile": "power_400m",
+  "total_params_M": 190.66,
+  "encoder_config": {
+    "vocab_size": 36000,
+    "max_len": 640,
+    "d_model": 640,
+    "n_heads": 10,
+    "n_layers": 8,
+    "dim_ff": 2560,
+    "dropout": 0.1
+  },
+  "decoder_config": {
+    "vocab_size": 36000,
+    "max_len": 640,
+    "d_model": 768,
+    "n_heads": 12,
+    "n_layers": 14,
+    "dim_ff": 3072,
+    "dropout": 0.1
+  },
+  "project_dir": "/workspace/rag_boolq_400m",
+  "local_finetuned_dir": "/workspace/rag_boolq_400m/local_finetuned",
+  "generation": {
+    "max_new_tokens": 160,
+    "temperature": 0.72,
+    "top_k": 60,
+    "top_p": 0.92,
+    "beam_size": 3
+  },
+  "retrieval": {
+    "use_hybrid": true,
+    "rag_top_k": 12,
+    "sim_threshold": 0.045,
+    "min_support": 0.28
+  },
+  "metrics": {
+    "retrieval": {
+      "recall@12": 0.933,
+      "n": 120
+    },
+    "demo": {
+      "demo_pass": 4,
+      "demo_total": 5,
+      "demo_pct": 80.0
+    }
+  },
+  "saved_at": "2026-04-29 14:38:20"
+}

rag_boolq_400m/models/custom_bpe_v6_2.json ADDED Viewed

The diff for this file is too large to render. See raw diff

rag_boolq_400m/models/tokenizer_fast/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

rag_boolq_400m/models/tokenizer_fast/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "[BOS]",
+  "cls_token": "[CLS]",
+  "eos_token": "[EOS]",
+  "mask_token": "[MASK]",
+  "max_length": 640,
+  "model_max_length": 640,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "stride": 0,
+  "tokenizer_class": "TokenizersBackend",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
+  "unk_token": "[UNK]"
+}

rag_boolq_400m/models/tokenizer_fast/training_info.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "latest_checkpoint": null,
+  "latest_mtime": null,
+  "latest_mtime_iso": null,
+  "size_bytes": null,
+  "epoch": null
+}

rag_boolq_400m/models/training_info.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "latest_checkpoint": "rag_boolq_400m/models/decoder_v6_2.pt",
+  "latest_mtime": 1777473498.0736282,
+  "latest_mtime_iso": "2026-04-29T14:38:18.073628+00:00",
+  "size_bytes": 620133245,
+  "epoch": null
+}

rag_boolq_400m/summary_v6_2.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "version": "6.2",
+  "profile": "power_400m",
+  "vocab": 36000,
+  "max_len": 640,
+  "chunks": 108751,
+  "datasets": 23,
+  "total_params_M": 190.66,
+  "encoder_params_M": 63.29,
+  "decoder_params_M": 127.37,
+  "epochs": {
+    "mlm": 18,
+    "retriever": 16,
+    "clm": 28
+  },
+  "grad_accum": 12,
+  "retrieval": {
+    "recall@12": 0.933,
+    "n": 120
+  },
+  "demo": {
+    "demo_pass": 4,
+    "demo_total": 5,
+    "demo_pct": 80.0
+  },
+  "local_finetuned_dir": "/workspace/rag_boolq_400m/local_finetuned",
+  "project_dir": "/workspace/rag_boolq_400m"
+}

rag_v6_2_400m_domains/summary_v6_2.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "version": "6.2",
+  "profile": "power_400m",
+  "vocab": 48000,
+  "max_len": 1024,
+  "chunks": 108849,
+  "datasets": 37,
+  "dataset_groups": [
+    "single"
+  ],
+  "max_texts_per_dataset": 0,
+  "max_total_docs": 0,
+  "total_params_M": 450.67,
+  "encoder_params_M": 123.35,
+  "decoder_params_M": 327.32,
+  "epochs": {
+    "mlm": 18,
+    "retriever": 16,
+    "clm": 28
+  },
+  "grad_accum": 12,
+  "retrieval": {
+    "recall@12": 0.867,
+    "n": 120
+  },
+  "demo": {
+    "demo_pass": 4,
+    "demo_total": 5,
+    "demo_pct": 80.0
+  },
+  "local_finetuned_dir": "/workspace/rag_v6_2_400m_domains/local_finetuned",
+  "project_dir": "/workspace/rag_v6_2_400m_domains"
+}

security/cyber_unified.py ADDED Viewed

	@@ -0,0 +1,1370 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+train_all_models_10datasets.py
+Script unique pour entraîner localement :
+1. SecurityLLM                 -> LoRA SFT sur 10 datasets cyber
+2. Llama-Phishsense-1B          -> LoRA SFT sur 10 datasets cyber/phishing
+3. CySecBERT                    -> classifier phishing
+4. SecBERT                      -> classifier phishing
+Par défaut :
+- 10 datasets SFT pour les LLM
+- 3 epochs pour les LLM
+- 3 epochs pour BERT/SecBERT
+- entraînement séquentiel pour éviter de saturer RAM/GPU
+Structure attendue :
+security/
+├── train_all_models_10datasets.py
+├── models/
+│   ├── SecurityLLM/
+│   ├── Llama-Phishsense-1B/
+│   ├── CySecBERT/
+│   └── SecBERT/
+├── datasets/
+│   └── cybersecurity-rules/
+└── outputs/
+"""
+import os
+import gc
+import json
+import argparse
+import inspect
+from pathlib import Path
+from typing import Dict, Any, List, Tuple, Optional
+import numpy as np
+import torch
+from datasets import load_dataset, Dataset, concatenate_datasets
+from transformers import (
+    AutoTokenizer,
+    AutoModelForCausalLM,
+    AutoModelForSequenceClassification,
+    TrainingArguments,
+    Trainer,
+    DataCollatorForLanguageModeling,
+)
+from peft import (
+    LoraConfig,
+    get_peft_model,
+    TaskType,
+    PeftModel,
+)
+from sklearn.metrics import accuracy_score, precision_recall_fscore_support
+# ============================================================
+# Chemins locaux
+# ============================================================
+# Directory that contains this script; the default paths below are built from it.
+BASE_DIR = Path(__file__).resolve().parent
+# Local directory of each base model, keyed by a short model identifier.
+DEFAULT_MODELS = {
+    "securityllm": BASE_DIR / "models" / "SecurityLLM",
+    "phishsense": BASE_DIR / "models" / "Llama-Phishsense-1B",
+    "cysecbert": BASE_DIR / "models" / "CySecBERT",
+    "secbert": BASE_DIR / "models" / "SecBERT",
+}
+# Default output directory (presumably where adapters/classifiers are written;
+# its consumers are outside this part of the file).
+DEFAULT_OUTPUT_DIR = BASE_DIR / "outputs"
+# ============================================================
+# 10 datasets pour les LLM
+# ============================================================
+# SFT dataset configurations consumed by load_multi_sft_dataset(). Each entry has:
+#   name        - label printed in the loading logs
+#   dataset     - local path or Hugging Face dataset id
+#   max_samples - per-dataset row cap; 0 keeps every row (see reduce_dataset)
+MULTI_CYBER_DATASETS = [
+    {
+        "name": "local_cybersecurity_rules",
+        "dataset": str(BASE_DIR / "datasets" / "cybersecurity-rules"),
+        "max_samples": 0,
+    },
+    {
+        "name": "phishing_email_dataset",
+        "dataset": "zefang-liu/phishing-email-dataset",
+        "max_samples": 0,
+    },
+    {
+        "name": "trendyol_cybersecurity_instruction",
+        "dataset": "Trendyol/Trendyol-Cybersecurity-Instruction-Tuning-Dataset",
+        "max_samples": 20000,
+    },
+    {
+        "name": "cybersecurity_32k_instruction",
+        "dataset": "Vanessasml/cybersecurity_32k_instruction_input_output",
+        "max_samples": 12000,
+    },
+    {
+        "name": "cybersecurity_sharegpt",
+        "dataset": "ChaoticNeutrals/Cybersecurity-ShareGPT",
+        "max_samples": 12000,
+    },
+    {
+        "name": "cybersecurity_eval",
+        "dataset": "CyberNative/CyberSecurityEval",
+        "max_samples": 1000,
+    },
+    {
+        "name": "cybersecurity_corpus",
+        "dataset": "zeroshot/cybersecurity-corpus",
+        "max_samples": 1000,
+    },
+    {
+        "name": "practical_ai_for_cybersecurity",
+        "dataset": "Falah/Practical_AI_for_Cybersecurity",
+        "max_samples": 1000,
+    },
+    {
+        "name": "cybersecurity_llm_cve",
+        "dataset": "Bouquets/Cybersecurity-LLM-CVE",
+        "max_samples": 12000,
+    },
+    {
+        "name": "cve_llm_training",
+        "dataset": "morpheuslord/cve-llm-training",
+        "max_samples": 12000,
+    },
+]
+# Default dataset reference for the phishing classifiers (same id as the
+# "phishing_email_dataset" entry above).
+DEFAULT_PHISHING_DATASET = "zefang-liu/phishing-email-dataset"
+# ============================================================
+# Utilitaires généraux
+# ============================================================
+def log(title: str):
+    print("\n" + "=" * 100)
+    print(title)
+    print("=" * 100)
+def set_seed(seed: int = 42):
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+def cleanup_memory():
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+def check_path(path: Path, name: str):
+    if not path.exists():
+        raise FileNotFoundError(f"{name} introuvable : {path}")
+def make_training_args(**kwargs):
+    """
+    Compatibilité avec plusieurs versions transformers.
+    Certaines versions utilisent evaluation_strategy, d'autres eval_strategy.
+    """
+    sig = inspect.signature(TrainingArguments.__init__)
+    allowed = set(sig.parameters.keys())
+    clean = {}
+    for k, v in kwargs.items():
+        if k in allowed:
+            clean[k] = v
+    if "evaluation_strategy" in kwargs and "eval_strategy" in allowed:
+        clean["eval_strategy"] = kwargs["evaluation_strategy"]
+    return TrainingArguments(**clean)
+def reduce_dataset(ds: Dataset, max_samples: int = 0) -> Dataset:
+    if max_samples and max_samples > 0 and len(ds) > max_samples:
+        return ds.select(range(max_samples))
+    return ds
+# ============================================================
+# Chargement dataset local ou HF
+# ============================================================
+def load_local_or_hf_dataset(dataset_ref: str, split: str = "train") -> Dataset:
+    """
+    Charge :
+    - dossier local contenant .jsonl/.json/.csv/.parquet
+    - fichier local
+    - dataset Hugging Face
+    """
+    path = Path(dataset_ref)
+    if path.exists():
+        if path.is_file():
+            suffix = path.suffix.lower()
+            files = [str(path)]
+            if suffix in [".json", ".jsonl"]:
+                return load_dataset("json", data_files=files, split=split)
+            if suffix == ".csv":
+                return load_dataset("csv", data_files=files, split=split)
+            if suffix == ".parquet":
+                return load_dataset("parquet", data_files=files, split=split)
+            raise RuntimeError(f"Format fichier non supporté : {path}")
+        jsonl_files = list(path.rglob("*.jsonl"))
+        json_files = list(path.rglob("*.json"))
+        csv_files = list(path.rglob("*.csv"))
+        parquet_files = list(path.rglob("*.parquet"))
+        if jsonl_files:
+            return load_dataset(
+                "json",
+                data_files=[str(f) for f in jsonl_files],
+                split=split,
+            )
+        if json_files:
+            return load_dataset(
+                "json",
+                data_files=[str(f) for f in json_files],
+                split=split,
+            )
+        if csv_files:
+            return load_dataset(
+                "csv",
+                data_files=[str(f) for f in csv_files],
+                split=split,
+            )
+        if parquet_files:
+            return load_dataset(
+                "parquet",
+                data_files=[str(f) for f in parquet_files],
+                split=split,
+            )
+        raise RuntimeError(f"Aucun fichier dataset lisible trouvé dans : {path}")
+    return load_dataset(dataset_ref, split=split)
+# ============================================================
+# Conversion multi-formats vers SFT text
+# ============================================================
+def safe_str(x) -> str:
+    if x is None:
+        return ""
+    return str(x).strip()
+def row_to_unified_sft_text(row: Dict[str, Any]) -> str:
+    """
+    Convertit plusieurs formats HF en format SFT.
+    Formats supportés :
+    - messages
+    - instruction/input/output
+    - system/user/assistant
+    - question/answer
+    - prompt/response
+    - text/label
+    - CVE-like
+    - fallback toutes colonnes
+    """
+    # 1. Format messages
+    if "messages" in row and row["messages"]:
+        try:
+            messages = row["messages"]
+            parts = []
+            for msg in messages:
+                if isinstance(msg, dict):
+                    role = safe_str(msg.get("role", "user")).upper()
+                    content = safe_str(msg.get("content", ""))
+                    if content:
+                        parts.append(f"{role}:\n{content}")
+            if parts:
+                return "\n\n".join(parts)
+        except Exception:
+            pass
+    # 2. Format system/user/assistant
+    system = safe_str(row.get("system", ""))
+    user = safe_str(row.get("user", ""))
+    assistant = safe_str(row.get("assistant", ""))
+    if user and assistant:
+        if not system:
+            system = "Tu es un assistant cybersécurité défensif."
+        return f"""### System:
+{system}
+### User:
+{user}
+### Assistant:
+{assistant}"""
+    # 3. Format instruction/input/output
+    instruction = safe_str(row.get("instruction", ""))
+    input_text = safe_str(row.get("input", ""))
+    output = safe_str(row.get("output", ""))
+    if instruction and output:
+        user_content = instruction
+        if input_text:
+            user_content += "\n\nContexte :\n" + input_text
+        return f"""### System:
+Tu es un assistant cybersécurité défensif.
+Tu privilégies l'analyse, la détection, la remédiation et la prévention.
+### User:
+{user_content}
+### Assistant:
+{output}"""
+    # 4. Format prompt / response / completion
+    prompt_keys = ["prompt", "Prompt", "query", "Query", "question", "Question", "problem"]
+    answer_keys = ["response", "Response", "completion", "Completion", "answer", "Answer", "solution"]
+    prompt = ""
+    answer = ""
+    for k in prompt_keys:
+        if k in row and safe_str(row.get(k)):
+            prompt = safe_str(row.get(k))
+            break
+    for k in answer_keys:
+        if k in row and safe_str(row.get(k)):
+            answer = safe_str(row.get(k))
+            break
+    if prompt and answer:
+        return f"""### System:
+Tu es un assistant cybersécurité défensif.
+### User:
+{prompt}
+### Assistant:
+{answer}"""
+    # 5. Format CVE-like
+    cve_keys = ["cve", "CVE", "cve_id", "CVE_ID", "id"]
+    desc_keys = ["description", "Description", "details", "Details", "summary"]
+    cve_id = ""
+    desc = ""
+    for k in cve_keys:
+        if k in row and safe_str(row.get(k)):
+            cve_id = safe_str(row.get(k))
+            break
+    for k in desc_keys:
+        if k in row and safe_str(row.get(k)):
+            desc = safe_str(row.get(k))
+            break
+    if cve_id or desc:
+        raw = "\n".join([f"{k}: {v}" for k, v in row.items() if v is not None])
+        return f"""### System:
+Tu es un assistant cybersécurité défensif spécialisé en vulnérabilités.
+### User:
+Analyse cette vulnérabilité et donne un résumé défensif, impact, priorité et remédiations.
+{raw}
+### Assistant:
+"""
+    # 6. Format phishing / classification
+    text_keys = [
+        "text",
+        "Text",
+        "email",
+        "Email",
+        "Email Text",
+        "body",
+        "Body",
+        "message",
+        "Message",
+        "content",
+        "Content",
+        "url",
+        "URL",
+        "text_combined",
+        "sentence",
+    ]
+    label_keys = [
+        "label",
+        "Label",
+        "class",
+        "Class",
+        "category",
+        "Category",
+        "is_phishing",
+        "phishing",
+        "status",
+        "type",
+    ]
+    text = ""
+    label = ""
+    for k in text_keys:
+        if k in row and safe_str(row.get(k)):
+            text = safe_str(row.get(k))
+            break
+    for k in label_keys:
+        if k in row and row.get(k) is not None:
+            label = safe_str(row.get(k))
+            break
+    if text:
+        return f"""### System:
+Tu es un assistant défensif spécialisé en cybersécurité.
+### User:
+Analyse ce contenu dans un contexte cybersécurité.
+Donne un verdict, les indices, le risque et les actions recommandées.
+{text}
+### Assistant:
+Label brut du dataset : {label}
+Analyse défensive :
+- Verdict :
+- Risque :
+- Indices :
+- Actions recommandées :
+"""
+    # 7. Fallback général
+    raw = "\n".join([f"{k}: {v}" for k, v in row.items() if v is not None])
+    return f"""### System:
+Tu es un assistant cybersécurité défensif.
+### User:
+Analyse ce contenu cyber :
+{raw}
+### Assistant:
+"""
+def load_one_sft_dataset(
+    dataset_ref: str,
+    name: str,
+    split: str = "train",
+    max_samples: int = 0,
+) -> Optional[Dataset]:
+    print(f"\n[+] Chargement dataset SFT : {name}")
+    print(f"    Source : {dataset_ref}")
+    try:
+        ds = load_local_or_hf_dataset(str(dataset_ref), split=split)
+    except Exception as e:
+        print(f"[ERREUR] Dataset ignoré : {name}")
+        print(f"Raison : {repr(e)}")
+        return None
+    try:
+        ds = reduce_dataset(ds, max_samples=max_samples)
+        print("[OK] Lignes :", len(ds))
+        print("[OK] Colonnes :", ds.column_names)
+        print("[OK] Exemple brut :", ds[0])
+    except Exception as e:
+        print(f"[ERREUR] Lecture impossible : {name}")
+        print(f"Raison : {repr(e)}")
+        return None
+    def mapper(row):
+        return {"text": row_to_unified_sft_text(row)}
+    try:
+        ds = ds.map(mapper, remove_columns=ds.column_names)
+        return ds
+    except Exception as e:
+        print(f"[ERREUR] Conversion SFT impossible : {name}")
+        print(f"Raison : {repr(e)}")
+        return None
+def load_multi_sft_dataset(
+    dataset_configs: List[Dict[str, Any]],
+    split: str = "train",
+    global_max_samples: int = 0,
+) -> Dataset:
+    datasets_list = []
+    for cfg in dataset_configs:
+        ds = load_one_sft_dataset(
+            dataset_ref=cfg["dataset"],
+            name=cfg["name"],
+            split=split,
+            max_samples=cfg.get("max_samples", 0),
+        )
+        if ds is not None and len(ds) > 0:
+            datasets_list.append(ds)
+    if not datasets_list:
+        raise RuntimeError("Aucun dataset SFT n'a pu être chargé.")
+    merged = concatenate_datasets(datasets_list)
+    merged = merged.shuffle(seed=42)
+    if global_max_samples and global_max_samples > 0 and len(merged) > global_max_samples:
+        merged = merged.select(range(global_max_samples))
+    print("\n[OK] Dataset SFT fusionné.")
+    print("[OK] Total lignes :", len(merged))
+    print("[OK] Exemple final :", merged[0])
+    return merged
+def tokenize_text_sft_dataset(
+    ds: Dataset,
+    tokenizer,
+    max_length: int,
+) -> Dataset:
+    def mapper(row):
+        encoded = tokenizer(
+            row["text"],
+            truncation=True,
+            max_length=max_length,
+            padding=False,
+        )
+        encoded["labels"] = encoded["input_ids"].copy()
+        return encoded
+    return ds.map(mapper, remove_columns=ds.column_names)
+# ============================================================
+# LoRA pour LLM
+# ============================================================
+def infer_lora_targets(model) -> List[str]:
+    """
+    Détection automatique des modules LoRA.
+    Compatible Llama/Mistral/Zephyr-like et plusieurs architectures.
+    """
+    common = [
+        "q_proj",
+        "k_proj",
+        "v_proj",
+        "o_proj",
+        "gate_proj",
+        "up_proj",
+        "down_proj",
+        "query",
+        "key",
+        "value",
+        "dense",
+        "fc1",
+        "fc2",
+    ]
+    found = set()
+    for name, module in model.named_modules():
+        last = name.split(".")[-1]
+        if last in common:
+            found.add(last)
+    found = sorted(found)
+    if not found:
+        raise RuntimeError(
+            "Impossible de détecter automatiquement les target_modules LoRA."
+        )
+    print("[+] Modules LoRA détectés :", found)
+    return found
+def train_llm_lora_multi_dataset(
+    model_path: Path,
+    dataset_configs: List[Dict[str, Any]],
+    output_dir: Path,
+    split: str,
+    global_max_samples: int,
+    epochs: float,
+    batch_size: int,
+    grad_accum: int,
+    lr: float,
+    max_length: int,
+    save_steps: int,
+    logging_steps: int,
+    lora_r: int,
+    lora_alpha: int,
+    lora_dropout: float,
+    skip_existing: bool,
+):
+    log(f"ENTRAÎNEMENT LLM LoRA MULTI-DATASETS : {model_path.name}")
+    check_path(model_path, f"Modèle {model_path.name}")
+    if skip_existing and output_dir.exists() and (output_dir / "adapter_config.json").exists():
+        print(f"[SKIP] Adapter LoRA déjà présent : {output_dir}")
+        return
+    output_dir.mkdir(parents=True, exist_ok=True)
+    print("[+] Chargement tokenizer...")
+    tokenizer = AutoTokenizer.from_pretrained(
+        str(model_path),
+        local_files_only=True,
+        trust_remote_code=True,
+    )
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    print("[+] Chargement modèle...")
+    dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+    model = AutoModelForCausalLM.from_pretrained(
+        str(model_path),
+        local_files_only=True,
+        trust_remote_code=True,
+        torch_dtype=dtype,
+        device_map="auto" if torch.cuda.is_available() else None,
+    )
+    if not torch.cuda.is_available():
+        model.to("cpu")
+    model.config.use_cache = False
+    if hasattr(model, "gradient_checkpointing_enable"):
+        model.gradient_checkpointing_enable()
+    target_modules = infer_lora_targets(model)
+    lora_config = LoraConfig(
+        r=lora_r,
+        lora_alpha=lora_alpha,
+        lora_dropout=lora_dropout,
+        bias="none",
+        task_type=TaskType.CAUSAL_LM,
+        target_modules=target_modules,
+    )
+    print("[+] Application LoRA...")
+    model = get_peft_model(model, lora_config)
+    model.print_trainable_parameters()
+    print("[+] Chargement + fusion des 10 datasets...")
+    ds = load_multi_sft_dataset(
+        dataset_configs=dataset_configs,
+        split=split,
+        global_max_samples=global_max_samples,
+    )
+    print("[+] Tokenisation...")
+    tokenized = tokenize_text_sft_dataset(
+        ds,
+        tokenizer=tokenizer,
+        max_length=max_length,
+    )
+    data_collator = DataCollatorForLanguageModeling(
+        tokenizer=tokenizer,
+        mlm=False,
+    )
+    use_fp16 = torch.cuda.is_available()
+    use_bf16 = False
+    if torch.cuda.is_available():
+        try:
+            use_bf16 = torch.cuda.is_bf16_supported()
+            use_fp16 = not use_bf16
+        except Exception:
+            use_bf16 = False
+            use_fp16 = True
+    training_args = make_training_args(
+        output_dir=str(output_dir),
+        num_train_epochs=epochs,
+        per_device_train_batch_size=batch_size,
+        gradient_accumulation_steps=grad_accum,
+        learning_rate=lr,
+        fp16=use_fp16,
+        bf16=use_bf16,
+        logging_steps=logging_steps,
+        save_steps=save_steps,
+        save_total_limit=2,
+        report_to="none",
+        optim="adamw_torch",
+        warmup_ratio=0.03,
+        lr_scheduler_type="cosine",
+        remove_unused_columns=False,
+    )
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=tokenized,
+        data_collator=data_collator,
+    )
+    print("[+] Début entraînement LoRA...")
+    trainer.train()
+    print("[+] Sauvegarde adapter LoRA :", output_dir)
+    model.save_pretrained(str(output_dir))
+    tokenizer.save_pretrained(str(output_dir))
+    del trainer
+    del model
+    del tokenizer
+    cleanup_memory()
+    print("[OK] Entraînement LoRA terminé :", output_dir)
+# ============================================================
+# BERT classification
+# ============================================================
+def detect_text_label_columns(ds: Dataset) -> Tuple[Optional[str], Optional[str]]:
+    """
+    Détection robuste des colonnes texte/label.
+    Compatible avec zefang-liu/phishing-email-dataset :
+    - Email Text
+    - Email Type
+    """
+    cols = ds.column_names
+    lower_map = {c.lower().strip(): c for c in cols}
+    text_candidates = [
+        "text",
+        "Text",
+        "email",
+        "Email",
+        "Email Text",
+        "email text",
+        "body",
+        "Body",
+        "message",
+        "Message",
+        "content",
+        "Content",
+        "url",
+        "URL",
+        "text_combined",
+        "sentence",
+    ]
+    label_candidates = [
+        "label",
+        "Label",
+        "class",
+        "Class",
+        "category",
+        "Category",
+        "Email Type",
+        "email type",
+        "type",
+        "Type",
+        "is_phishing",
+        "phishing",
+        "status",
+        "target",
+    ]
+    def find_column(candidates):
+        # Match exact insensible à la casse
+        for cand in candidates:
+            key = cand.lower().strip()
+            if key in lower_map:
+                return lower_map[key]
+        # Match partiel
+        for col in cols:
+            col_l = col.lower().strip()
+            for cand in candidates:
+                cand_l = cand.lower().strip()
+                if cand_l in col_l or col_l in cand_l:
+                    return col
+        return None
+    text_col = find_column(text_candidates)
+    label_col = find_column(label_candidates)
+    return text_col, label_col
+def normalize_labels(
+    ds: Dataset,
+    label_col: str,
+) -> Tuple[Dataset, Dict[str, int], Dict[int, str]]:
+    labels_raw = [str(x) for x in ds[label_col]]
+    unique = sorted(list(set(labels_raw)))
+    label2id = {label: i for i, label in enumerate(unique)}
+    id2label = {i: label for label, i in label2id.items()}
+    def mapper(row):
+        row["labels"] = label2id[str(row[label_col])]
+        return row
+    ds = ds.map(mapper)
+    return ds, label2id, id2label
+def compute_metrics(eval_pred):
+    logits, labels = eval_pred
+    preds = np.argmax(logits, axis=-1)
+    precision, recall, f1, _ = precision_recall_fscore_support(
+        labels,
+        preds,
+        average="weighted",
+        zero_division=0,
+    )
+    acc = accuracy_score(labels, preds)
+    return {
+        "accuracy": acc,
+        "precision": precision,
+        "recall": recall,
+        "f1": f1,
+    }
+def train_bert_classifier(
+    model_path: Path,
+    dataset_ref: str,
+    output_dir: Path,
+    split: str,
+    max_samples: int,
+    epochs: float,
+    batch_size: int,
+    lr: float,
+    max_length: int,
+    logging_steps: int,
+    skip_existing: bool,
+):
+    log(f"ENTRAÎNEMENT BERT CLASSIFIER : {model_path.name}")
+    check_path(model_path, f"Modèle {model_path.name}")
+    if skip_existing and output_dir.exists() and (output_dir / "config.json").exists():
+        print(f"[SKIP] Classifier déjà présent : {output_dir}")
+        return
+    output_dir.mkdir(parents=True, exist_ok=True)
+    print("[+] Chargement dataset classification :", dataset_ref)
+    ds = load_local_or_hf_dataset(str(dataset_ref), split=split)
+    ds = reduce_dataset(ds, max_samples=max_samples)
+    print("[+] Nombre d'exemples :", len(ds))
+    print("[+] Colonnes :", ds.column_names)
+    print("[+] Exemple brut :", ds[0])
+    text_col, label_col = detect_text_label_columns(ds)
+    if not text_col or not label_col:
+        raise ValueError(
+            "Impossible de détecter les colonnes texte/label.\n"
+            f"Colonnes disponibles : {ds.column_names}"
+        )
+    print("[+] Colonne texte :", text_col)
+    print("[+] Colonne label :", label_col)
+    ds, label2id, id2label = normalize_labels(ds, label_col)
+    split_ds = ds.train_test_split(test_size=0.15, seed=42)
+    train_ds = split_ds["train"]
+    eval_ds = split_ds["test"]
+    print("[+] Train size :", len(train_ds))
+    print("[+] Eval size :", len(eval_ds))
+    print("[+] Labels :", label2id)
+    tokenizer = AutoTokenizer.from_pretrained(
+        str(model_path),
+        local_files_only=True,
+        trust_remote_code=True,
+    )
+    def tok(batch):
+        return tokenizer(
+            batch[text_col],
+            truncation=True,
+            padding="max_length",
+            max_length=max_length,
+        )
+    train_ds = train_ds.map(tok, batched=True)
+    eval_ds = eval_ds.map(tok, batched=True)
+    keep = ["input_ids", "attention_mask", "labels"]
+    train_ds = train_ds.remove_columns(
+        [c for c in train_ds.column_names if c not in keep]
+    )
+    eval_ds = eval_ds.remove_columns(
+        [c for c in eval_ds.column_names if c not in keep]
+    )
+    model = AutoModelForSequenceClassification.from_pretrained(
+        str(model_path),
+        local_files_only=True,
+        trust_remote_code=True,
+        num_labels=len(label2id),
+        label2id=label2id,
+        id2label=id2label,
+        ignore_mismatched_sizes=True,
+    )
+    use_fp16 = torch.cuda.is_available()
+    training_args = make_training_args(
+        output_dir=str(output_dir),
+        num_train_epochs=epochs,
+        per_device_train_batch_size=batch_size,
+        per_device_eval_batch_size=batch_size,
+        learning_rate=lr,
+        fp16=use_fp16,
+        logging_steps=logging_steps,
+        evaluation_strategy="epoch",
+        save_strategy="epoch",
+        save_total_limit=2,
+        report_to="none",
+        load_best_model_at_end=True,
+        metric_for_best_model="f1",
+    )
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_ds,
+        eval_dataset=eval_ds,
+        compute_metrics=compute_metrics,
+    )
+    print("[+] Début entraînement classifier...")
+    trainer.train()
+    print("[+] Évaluation finale...")
+    metrics = trainer.evaluate()
+    print(metrics)
+    print("[+] Sauvegarde classifier :", output_dir)
+    trainer.save_model(str(output_dir))
+    tokenizer.save_pretrained(str(output_dir))
+    with open(output_dir / "label_mapping.json", "w", encoding="utf-8") as f:
+        json.dump(
+            {
+                "label2id": label2id,
+                "id2label": id2label,
+                "text_col": text_col,
+                "label_col": label_col,
+                "metrics": metrics,
+            },
+            f,
+            ensure_ascii=False,
+            indent=2,
+        )
+    del trainer
+    del model
+    del tokenizer
+    cleanup_memory()
+    print("[OK] Entraînement BERT terminé :", output_dir)
+# ============================================================
+# Tests après entraînement
+# ============================================================
+def test_lora_adapter(
+    base_model: Path,
+    adapter_dir: Path,
+    prompt: str,
+    max_new_tokens: int = 250,
+):
+    log(f"TEST LoRA : {adapter_dir.name}")
+    if not adapter_dir.exists():
+        print("[SKIP] Adapter introuvable :", adapter_dir)
+        return
+    tokenizer = AutoTokenizer.from_pretrained(
+        str(base_model),
+        local_files_only=True,
+        trust_remote_code=True,
+    )
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+    base = AutoModelForCausalLM.from_pretrained(
+        str(base_model),
+        local_files_only=True,
+        trust_remote_code=True,
+        torch_dtype=dtype,
+        device_map="auto" if torch.cuda.is_available() else None,
+    )
+    model = PeftModel.from_pretrained(base, str(adapter_dir))
+    model.eval()
+    full_prompt = f"""### System:
+Tu es un assistant cybersécurité défensif.
+### User:
+{prompt}
+### Assistant:
+"""
+    inputs = tokenizer(full_prompt, return_tensors="pt")
+    device = next(model.parameters()).device
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+    with torch.no_grad():
+        out = model.generate(
+            **inputs,
+            max_new_tokens=max_new_tokens,
+            temperature=0.2,
+            do_sample=True,
+            pad_token_id=tokenizer.eos_token_id,
+        )
+    print(tokenizer.decode(out[0], skip_special_tokens=True))
+    del model
+    del base
+    del tokenizer
+    cleanup_memory()
+def test_bert_classifier(model_dir: Path, text: str):
+    log(f"TEST BERT CLASSIFIER : {model_dir.name}")
+    if not model_dir.exists():
+        print("[SKIP] Classifier introuvable :", model_dir)
+        return
+    tokenizer = AutoTokenizer.from_pretrained(str(model_dir))
+    model = AutoModelForSequenceClassification.from_pretrained(str(model_dir))
+    model.eval()
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model.to(device)
+    inputs = tokenizer(
+        text,
+        return_tensors="pt",
+        truncation=True,
+        padding=True,
+        max_length=256,
+    )
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+    with torch.no_grad():
+        out = model(**inputs)
+        probs = torch.softmax(out.logits, dim=-1)[0].detach().cpu().numpy()
+    id2label = model.config.id2label
+    for idx, prob in enumerate(probs):
+        print(f"{id2label[idx]}: {prob:.4f}")
+    del model
+    del tokenizer
+    cleanup_memory()
+# ============================================================
+# Orchestration
+# ============================================================
+def train_selected(args):
+    set_seed(args.seed)
+    models = {
+        "securityllm": Path(args.security_model),
+        "phishsense": Path(args.phish_model),
+        "cysecbert": Path(args.cysecbert_model),
+        "secbert": Path(args.secbert_model),
+    }
+    outputs = {
+        "securityllm": Path(args.output_dir) / "securityllm-10datasets-lora",
+        "phishsense": Path(args.output_dir) / "phishsense-10datasets-lora",
+        "cysecbert": Path(args.output_dir) / "cysecbert-phishing-classifier",
+        "secbert": Path(args.output_dir) / "secbert-phishing-classifier",
+    }
+    if args.train == "all":
+        selected = ["securityllm", "phishsense", "cysecbert", "secbert"]
+    else:
+        selected = [args.train]
+    print("[+] Modèles sélectionnés :", selected)
+    if "securityllm" in selected:
+        train_llm_lora_multi_dataset(
+            model_path=models["securityllm"],
+            dataset_configs=MULTI_CYBER_DATASETS,
+            output_dir=outputs["securityllm"],
+            split=args.split,
+            global_max_samples=args.max_samples,
+            epochs=args.llm_epochs,
+            batch_size=args.llm_batch_size,
+            grad_accum=args.grad_accum,
+            lr=args.llm_lr,
+            max_length=args.llm_max_length,
+            save_steps=args.save_steps,
+            logging_steps=args.logging_steps,
+            lora_r=args.lora_r,
+            lora_alpha=args.lora_alpha,
+            lora_dropout=args.lora_dropout,
+            skip_existing=args.skip_existing,
+        )
+    if "phishsense" in selected:
+        train_llm_lora_multi_dataset(
+            model_path=models["phishsense"],
+            dataset_configs=MULTI_CYBER_DATASETS,
+            output_dir=outputs["phishsense"],
+            split=args.split,
+            global_max_samples=args.max_samples,
+            epochs=args.llm_epochs,
+            batch_size=args.llm_batch_size,
+            grad_accum=args.grad_accum,
+            lr=args.llm_lr,
+            max_length=args.llm_max_length,
+            save_steps=args.save_steps,
+            logging_steps=args.logging_steps,
+            lora_r=args.lora_r,
+            lora_alpha=args.lora_alpha,
+            lora_dropout=args.lora_dropout,
+            skip_existing=args.skip_existing,
+        )
+    if "cysecbert" in selected:
+        train_bert_classifier(
+            model_path=models["cysecbert"],
+            dataset_ref=args.phishing_dataset,
+            output_dir=outputs["cysecbert"],
+            split=args.split,
+            max_samples=args.bert_max_samples,
+            epochs=args.bert_epochs,
+            batch_size=args.bert_batch_size,
+            lr=args.bert_lr,
+            max_length=args.bert_max_length,
+            logging_steps=args.logging_steps,
+            skip_existing=args.skip_existing,
+        )
+    if "secbert" in selected:
+        train_bert_classifier(
+            model_path=models["secbert"],
+            dataset_ref=args.phishing_dataset,
+            output_dir=outputs["secbert"],
+            split=args.split,
+            max_samples=args.bert_max_samples,
+            epochs=args.bert_epochs,
+            batch_size=args.bert_batch_size,
+            lr=args.bert_lr,
+            max_length=args.bert_max_length,
+            logging_steps=args.logging_steps,
+            skip_existing=args.skip_existing,
+        )
+    print("\n[OK] Pipeline terminé.")
+def run_tests(args):
+    outputs = {
+        "securityllm": Path(args.output_dir) / "securityllm-10datasets-lora",
+        "phishsense": Path(args.output_dir) / "phishsense-10datasets-lora",
+        "cysecbert": Path(args.output_dir) / "cysecbert-phishing-classifier",
+        "secbert": Path(args.output_dir) / "secbert-phishing-classifier",
+    }
+    test_lora_adapter(
+        base_model=Path(args.security_model),
+        adapter_dir=outputs["securityllm"],
+        prompt="Explique une règle Sigma permettant de détecter PowerShell EncodedCommand de manière défensive.",
+    )
+    test_lora_adapter(
+        base_model=Path(args.phish_model),
+        adapter_dir=outputs["phishsense"],
+        prompt="Analyse cet email : Votre compte sera suspendu. Cliquez ici pour confirmer votre mot de passe.",
+    )
+    test_bert_classifier(
+        model_dir=outputs["cysecbert"],
+        text="Your account will be suspended. Click here to verify your password.",
+    )
+    test_bert_classifier(
+        model_dir=outputs["secbert"],
+        text="Your account will be suspended. Click here to verify your password.",
+    )
+# ============================================================
+# Main CLI
+# ============================================================
+def main():
+    parser = argparse.ArgumentParser(
+        description="Entraîner tous les modèles cyber locaux avec 10 datasets et 3 epochs."
+    )
+    parser.add_argument(
+        "--train",
+        default="all",
+        choices=["all", "securityllm", "phishsense", "cysecbert", "secbert"],
+        help="Quel modèle entraîner.",
+    )
+    parser.add_argument(
+        "--test-after",
+        action="store_true",
+        help="Tester les modèles/adapters après entraînement.",
+    )
+    parser.add_argument(
+        "--skip-existing",
+        action="store_true",
+        help="Ignorer un entraînement si la sortie existe déjà.",
+    )
+    parser.add_argument("--seed", type=int, default=42)
+    # Modèles locaux
+    parser.add_argument(
+        "--security-model",
+        default=str(DEFAULT_MODELS["securityllm"]),
+    )
+    parser.add_argument(
+        "--phish-model",
+        default=str(DEFAULT_MODELS["phishsense"]),
+    )
+    parser.add_argument(
+        "--cysecbert-model",
+        default=str(DEFAULT_MODELS["cysecbert"]),
+    )
+    parser.add_argument(
+        "--secbert-model",
+        default=str(DEFAULT_MODELS["secbert"]),
+    )
+    # Dataset classification BERT
+    parser.add_argument(
+        "--phishing-dataset",
+        default=DEFAULT_PHISHING_DATASET,
+    )
+    parser.add_argument("--split", default="train")
+    # Sorties
+    parser.add_argument(
+        "--output-dir",
+        default=str(DEFAULT_OUTPUT_DIR),
+    )
+    # Limitation globale LLM
+    parser.add_argument(
+        "--max-samples",
+        type=int,
+        default=0,
+        help="Limiter le nombre total d'exemples SFT fusionnés. 0 = pas de limite globale.",
+    )
+    # Limitation BERT
+    parser.add_argument(
+        "--bert-max-samples",
+        type=int,
+        default=0,
+        help="Limiter le nombre d'exemples pour BERT. 0 = pas de limite.",
+    )
+    # Paramètres LLM LoRA
+    parser.add_argument("--llm-epochs", type=float, default=3.0)
+    parser.add_argument("--llm-batch-size", type=int, default=1)
+    parser.add_argument("--grad-accum", type=int, default=8)
+    parser.add_argument("--llm-lr", type=float, default=2e-4)
+    parser.add_argument("--llm-max-length", type=int, default=1024)
+    parser.add_argument("--lora-r", type=int, default=16)
+    parser.add_argument("--lora-alpha", type=int, default=32)
+    parser.add_argument("--lora-dropout", type=float, default=0.05)
+    # Paramètres BERT
+    parser.add_argument("--bert-epochs", type=float, default=3.0)
+    parser.add_argument("--bert-batch-size", type=int, default=8)
+    parser.add_argument("--bert-lr", type=float, default=2e-5)
+    parser.add_argument("--bert-max-length", type=int, default=256)
+    # Logs / sauvegarde
+    parser.add_argument("--logging-steps", type=int, default=10)
+    parser.add_argument("--save-steps", type=int, default=200)
+    args = parser.parse_args()
+    Path(args.output_dir).mkdir(parents=True, exist_ok=True)
+    train_selected(args)
+    if args.test_after:
+        run_tests(args)
+if __name__ == "__main__":
+    main()

security/sec.py ADDED Viewed

	@@ -0,0 +1,338 @@

+import os
+import argparse
+from pathlib import Path
+import torch
+from huggingface_hub import snapshot_download
+from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
+from datasets import load_dataset
+# Directory containing this script; downloads are stored beneath it.
+BASE_DIR = Path(__file__).resolve().parent
+MODELS_DIR = BASE_DIR / "models"
+DATASETS_DIR = BASE_DIR / "datasets"
+# Registry of the Hub repositories handled by this script, keyed by a short
+# name. Each entry holds:
+#   repo_id   - repository identifier passed to snapshot_download
+#   repo_type - "model" or "dataset", passed to snapshot_download
+#   local_dir - directory the snapshot is downloaded to and loaded from
+#   kind      - "causal_lm", "bert" or "dataset"; test_one uses it to pick the
+#               test routine and check_files uses it to decide whether a
+#               config.json is expected
+REPOS = {
+    "SecurityLLM": {
+        "repo_id": "ZySec-AI/SecurityLLM",
+        "repo_type": "model",
+        "local_dir": MODELS_DIR / "SecurityLLM",
+        "kind": "causal_lm",
+    },
+    "Llama-Phishsense-1B": {
+        "repo_id": "AcuteShrewdSecurity/Llama-Phishsense-1B",
+        "repo_type": "model",
+        "local_dir": MODELS_DIR / "Llama-Phishsense-1B",
+        "kind": "causal_lm",
+    },
+    "CySecBERT": {
+        "repo_id": "markusbayer/CySecBERT",
+        "repo_type": "model",
+        "local_dir": MODELS_DIR / "CySecBERT",
+        "kind": "bert",
+    },
+    "SecBERT": {
+        "repo_id": "jackaduma/SecBERT",
+        "repo_type": "model",
+        "local_dir": MODELS_DIR / "SecBERT",
+        "kind": "bert",
+    },
+    "cybersecurity-rules": {
+        "repo_id": "jcordon5/cybersecurity-rules",
+        "repo_type": "dataset",
+        "local_dir": DATASETS_DIR / "cybersecurity-rules",
+        "kind": "dataset",
+    },
+}
+def download_all():
+    MODELS_DIR.mkdir(exist_ok=True)
+    DATASETS_DIR.mkdir(exist_ok=True)
+    for name, item in REPOS.items():
+        print(f"\n[+] Téléchargement : {name}")
+        print(f"    Repo   : {item['repo_id']}")
+        print(f"    Dossier: {item['local_dir']}")
+        snapshot_download(
+            repo_id=item["repo_id"],
+            repo_type=item["repo_type"],
+            local_dir=str(item["local_dir"]),
+            resume_download=True,
+        )
+        print(f"[OK] {name} téléchargé.")
+def check_files():
+    print("\n[+] Vérification des fichiers locaux")
+    for name, item in REPOS.items():
+        path = item["local_dir"]
+        print(f"\n--- {name} ---")
+        print(f"Dossier : {path}")
+        if not path.exists():
+            print("[ERREUR] Dossier introuvable.")
+            continue
+        files = list(path.glob("*"))
+        if not files:
+            print("[ERREUR] Dossier vide.")
+            continue
+        for file in files[:20]:
+            print(" ", file.name)
+        if item["kind"] in ["causal_lm", "bert"]:
+            config = path / "config.json"
+            if config.exists():
+                print("[OK] config.json trouvé.")
+            else:
+                print("[ATTENTION] config.json absent.")
+        print("[OK] Vérification terminée.")
+def set_offline_mode():
+    os.environ["HF_HUB_OFFLINE"] = "1"
+    os.environ["TRANSFORMERS_OFFLINE"] = "1"
+    os.environ["HF_DATASETS_OFFLINE"] = "1"
+def test_causal_lm(name, path, prompt):
+    print(f"\n[+] Test modèle génératif : {name}")
+    tokenizer = AutoTokenizer.from_pretrained(
+        str(path),
+        local_files_only=True,
+        trust_remote_code=True,
+    )
+    dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+    model = AutoModelForCausalLM.from_pretrained(
+        str(path),
+        local_files_only=True,
+        trust_remote_code=True,
+        torch_dtype=dtype,
+        device_map="auto" if torch.cuda.is_available() else None,
+    )
+    if not torch.cuda.is_available():
+        model.to("cpu")
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
+    device = next(model.parameters()).device
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+    with torch.no_grad():
+        output = model.generate(
+            **inputs,
+            max_new_tokens=120,
+            temperature=0.2,
+            do_sample=True,
+            pad_token_id=tokenizer.eos_token_id,
+        )
+    text = tokenizer.decode(output[0], skip_special_tokens=True)
+    print("\n===== SORTIE MODÈLE =====")
+    print(text)
+    print("=========================")
+def test_bert(name, path):
+    print(f"\n[+] Test BERT : {name}")
+    tokenizer = AutoTokenizer.from_pretrained(
+        str(path),
+        local_files_only=True,
+        trust_remote_code=True,
+    )
+    model = AutoModel.from_pretrained(
+        str(path),
+        local_files_only=True,
+        trust_remote_code=True,
+    )
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model.to(device)
+    model.eval()
+    text = "Suspicious PowerShell encoded command execution detected."
+    inputs = tokenizer(
+        text,
+        return_tensors="pt",
+        truncation=True,
+        padding=True,
+        max_length=512,
+    )
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+    with torch.no_grad():
+        outputs = model(**inputs)
+    embedding = outputs.last_hidden_state[:, 0, :]
+    print("[OK] Modèle chargé.")
+    print("Texte :", text)
+    print("Shape embedding :", tuple(embedding.shape))
+def find_dataset_files(path):
+    parquet_files = list(path.rglob("*.parquet"))
+    json_files = list(path.rglob("*.json")) + list(path.rglob("*.jsonl"))
+    csv_files = list(path.rglob("*.csv"))
+    if parquet_files:
+        return "parquet", [str(f) for f in parquet_files]
+    if json_files:
+        return "json", [str(f) for f in json_files]
+    if csv_files:
+        return "csv", [str(f) for f in csv_files]
+    return None, []
+def test_dataset(path):
+    print("\n[+] Test dataset cybersecurity-rules")
+    dataset_type, files = find_dataset_files(path)
+    if dataset_type is None:
+        print("[ERREUR] Aucun fichier parquet/json/jsonl/csv trouvé.")
+        print("Fichiers présents :")
+        for f in list(path.rglob("*"))[:30]:
+            print(" ", f)
+        return
+    print(f"[OK] Type détecté : {dataset_type}")
+    print(f"[OK] Nombre de fichiers : {len(files)}")
+    ds = load_dataset(dataset_type, data_files=files, split="train")
+    print("[OK] Dataset chargé.")
+    print("Nombre de lignes :", len(ds))
+    print("\n===== PREMIÈRE LIGNE =====")
+    print(ds[0])
+    print("==========================")
+def test_all():
+    set_offline_mode()
+    check_files()
+    # Test SecurityLLM
+    test_causal_lm(
+        "SecurityLLM",
+        REPOS["SecurityLLM"]["local_dir"],
+        "Tu es un analyste SOC. Donne une procédure défensive pour analyser une alerte SSH brute force.",
+    )
+    # Test Llama-Phishsense-1B
+    test_causal_lm(
+        "Llama-Phishsense-1B",
+        REPOS["Llama-Phishsense-1B"]["local_dir"],
+        "Analyse ce message pour phishing : Votre compte sera suspendu. Cliquez ici pour confirmer votre mot de passe.",
+    )
+    # Test CySecBERT
+    test_bert(
+        "CySecBERT",
+        REPOS["CySecBERT"]["local_dir"],
+    )
+    # Test SecBERT
+    test_bert(
+        "SecBERT",
+        REPOS["SecBERT"]["local_dir"],
+    )
+    # Test dataset
+    test_dataset(
+        REPOS["cybersecurity-rules"]["local_dir"],
+    )
+def test_one(name):
+    set_offline_mode()
+    if name not in REPOS:
+        print("[ERREUR] Nom inconnu.")
+        print("Noms possibles :", ", ".join(REPOS.keys()))
+        return
+    item = REPOS[name]
+    if item["kind"] == "causal_lm":
+        test_causal_lm(
+            name,
+            item["local_dir"],
+            "Donne une analyse cybersécurité défensive courte.",
+        )
+    elif item["kind"] == "bert":
+        test_bert(name, item["local_dir"])
+    elif item["kind"] == "dataset":
+        test_dataset(item["local_dir"])
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--download",
+        action="store_true",
+        help="Télécharger tous les modèles et datasets en local.",
+    )
+    parser.add_argument(
+        "--check",
+        action="store_true",
+        help="Vérifier les fichiers téléchargés.",
+    )
+    parser.add_argument(
+        "--test-all",
+        action="store_true",
+        help="Tester tous les modèles localement.",
+    )
+    parser.add_argument(
+        "--test-one",
+        type=str,
+        help="Tester un seul modèle : SecurityLLM, Llama-Phishsense-1B, CySecBERT, SecBERT, cybersecurity-rules",
+    )
+    args = parser.parse_args()
+    if args.download:
+        download_all()
+    if args.check:
+        check_files()
+    if args.test_all:
+        test_all()
+    if args.test_one:
+        test_one(args.test_one)
+    if not any([args.download, args.check, args.test_all, args.test_one]):
+        parser.print_help()
+if __name__ == "__main__":
+    main()