"""Fine-tune a causal LM with LoRA on finewiki + awesome-chatgpt-prompts,
push the prepared dataset and the resulting adapter model to the HF Hub.

Pipeline: load raw datasets -> normalize to instruction/input/output ->
save + push prepared dataset -> tokenize for causal-LM training (prompt
masked with -100) -> LoRA fine-tune -> save + push model.
"""

import json
import os
from datetime import datetime, timezone
from pathlib import Path

import torch
from datasets import DatasetDict, load_dataset
from huggingface_hub import create_repo, login, upload_folder
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)

# ------------- User config -------------
HF_TOKEN = os.environ["HF_TOKEN"]  # read from env — never hard-code a token
BASE_MODEL_ID = "Gopu-poss/gopu-agent-2k-fdf"
ALIGNED_MODEL_ID = "Gopu-poss/gopu-agent-2k-fdf-aligned"
DATASET_REPO_ID = "Gopu-poss/gopu-agent-2k-fdf-dataset-prepared"
OUTPUT_DIR = "./gopu-lora-out"
SEED = 42

# ------------- Auth -------------
print(">> Logging into Hugging Face Hub...")
login(token=HF_TOKEN)

# ------------- Load raw datasets -------------
print(">> Loading HuggingFaceFW/finewiki (train split)...")
finewiki = load_dataset("HuggingFaceFW/finewiki", split="train")
print(">> Loading fka/awesome-chatgpt-prompts (train split)...")
awesome = load_dataset("fka/awesome-chatgpt-prompts", split="train")

# ------------- Sampling / preparation -------------
FW_SAMPLE_SIZE = 20000
if len(finewiki) > FW_SAMPLE_SIZE:
    finewiki = finewiki.shuffle(seed=SEED).select(range(FW_SAMPLE_SIZE))

print(f">> finewiki sampled: {len(finewiki)} rows; awesome: {len(awesome)} rows")


# ------------- Normalize to instruction / input / output -------------
def map_finewiki(example):
    """Turn a finewiki row into an instruction/input/output record.

    The article text is truncated to 2000 chars to bound prompt length;
    the output is a fixed style-alignment target.
    """
    title = example.get("title", "")
    text = example.get("text", "")
    instruction = f"Explique en termes clairs et techniques l'article: {title}"
    input_ctx = text[:2000]
    output = (
        "Résumé technique et stylisé (GopuOS): "
        "Points clés, concepts, et relations. Maintiens un ton clair, concis, et agentique."
    )
    return {"instruction": instruction, "input": input_ctx, "output": output}


finewiki_mapped = finewiki.map(map_finewiki)


def map_awesome(example):
    """Turn an awesome-chatgpt-prompts row into an instruction/input/output record."""
    act = example.get("act", "")
    prompt = example.get("prompt", "")
    instruction = f"Rôle/acte: {act}. Réponds au prompt en style GopuOS."
    input_ctx = prompt
    output = (
        "Réponse alignée GopuOS: concise, technique, introspectable, bilingue possible FR/EN."
    )
    return {"instruction": instruction, "input": input_ctx, "output": output}


awesome_mapped = awesome.map(map_awesome)

prepared = DatasetDict({
    "train": finewiki_mapped,
    "eval": awesome_mapped,
})

# ------------- Save prepared dataset locally (one JSONL per split) -------------
prepared_dir = Path("./prepared_dataset")
prepared_dir.mkdir(parents=True, exist_ok=True)
for split in prepared.keys():
    out_path = prepared_dir / f"{split}.jsonl"
    with out_path.open("w", encoding="utf-8") as f:
        for ex in prepared[split]:
            f.write(json.dumps(ex, ensure_ascii=False) + "\n")

# ------------- Push prepared dataset to the Hub -------------
print(f">> Creating/updating dataset repo: {DATASET_REPO_ID}")
create_repo(repo_id=DATASET_REPO_ID, token=HF_TOKEN, repo_type="dataset", private=False, exist_ok=True)
upload_folder(
    repo_id=DATASET_REPO_ID,
    repo_type="dataset",
    folder_path=str(prepared_dir),
    token=HF_TOKEN,
    # timezone-aware timestamp; datetime.utcnow() is deprecated
    commit_message=f"Prepared dataset push {datetime.now(timezone.utc).isoformat()}",
)

# ------------- Load model/tokenizer -------------
print(f">> Loading base model: {BASE_MODEL_ID}")
# `use_auth_token` is deprecated/removed in recent transformers; use `token=`.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, token=HF_TOKEN)

# Many causal LMs ship without a pad token; the collator needs one to pad batches.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# ------------- PEFT LoRA config -------------
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, peft_config)


# ------------- Tokenization -------------
def format_example(ex):
    """Build the prompt (src) and completion target (tgt) for one record."""
    system_prompt = (
        "Tu es Gopu, agent intelligent de GopuOS. "
        "Réponds de manière claire, technique, stylisée, et introspectable."
    )
    user = f"Utilisateur: {ex['instruction']}\nContexte: {ex['input']}\nGopu:"
    target = ex["output"]
    src = system_prompt + "\n\n" + user
    return {"src": src, "tgt": target}


formatted = prepared.map(format_example)


def tokenize(example):
    """Tokenize one example for causal-LM fine-tuning.

    For a causal LM, `labels` must align token-for-token with `input_ids`
    (unlike seq2seq, where they are separate sequences). We concatenate
    prompt + target and mask the prompt tokens with -100 so the loss is
    computed only on the target completion.
    """
    prompt_ids = tokenizer(
        example["src"],
        truncation=True,
        max_length=1024,
    )["input_ids"]
    target_ids = tokenizer(
        example["tgt"],
        truncation=True,
        max_length=256,
        add_special_tokens=False,  # prompt already carries any BOS token
    )["input_ids"]
    if tokenizer.eos_token_id is not None:
        target_ids = target_ids + [tokenizer.eos_token_id]  # teach the model to stop
    input_ids = prompt_ids + target_ids
    labels = [-100] * len(prompt_ids) + target_ids  # -100 = ignored by the loss
    return {
        "input_ids": input_ids,
        "attention_mask": [1] * len(input_ids),
        "labels": labels,
    }


tokenized_train = formatted["train"].map(tokenize, batched=False, remove_columns=formatted["train"].column_names)
tokenized_eval = formatted["eval"].map(tokenize, batched=False, remove_columns=formatted["eval"].column_names)

# DataCollatorForSeq2Seq pads `labels` with -100, which is what we need here too.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding="longest")

# ------------- Training -------------
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    eval_strategy="steps",
    eval_steps=200,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    num_train_epochs=1,
    learning_rate=2e-4,
    warmup_steps=200,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),  # fp16=True crashes on CPU-only hosts
    bf16=False,
    report_to=[],
    seed=SEED,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
)

print(">> Starting training...")
trainer.train()
print(">> Training complete")

# ------------- Save and push the model -------------
print(f">> Creating/updating model repo: {ALIGNED_MODEL_ID}")
create_repo(repo_id=ALIGNED_MODEL_ID, token=HF_TOKEN, repo_type="model", private=False, exist_ok=True)
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
upload_folder(
    repo_id=ALIGNED_MODEL_ID,
    repo_type="model",
    folder_path=OUTPUT_DIR,
    token=HF_TOKEN,
    commit_message=f"Push aligned LoRA model {datetime.now(timezone.utc).isoformat()}",
)
print(f">> Model pushed: https://huggingface.co/{ALIGNED_MODEL_ID}")