# Spaces: Runtime error
# Runtime error
import json
import os
from datetime import datetime, timezone
from pathlib import Path

import torch
from datasets import load_dataset, DatasetDict
from huggingface_hub import login, create_repo, upload_folder
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer,
)
# ------------- User configuration -------------
# Hub credentials and repository identifiers for this fine-tuning run.
HF_TOKEN = os.environ["HF_TOKEN"]  # raises KeyError if HF_TOKEN is unset — fail fast before any Hub call
BASE_MODEL_ID = "Gopu-poss/gopu-agent-2k-fdf"                      # base model to fine-tune
ALIGNED_MODEL_ID = "Gopu-poss/gopu-agent-2k-fdf-aligned"           # destination repo for the LoRA-adapted model
DATASET_REPO_ID = "Gopu-poss/gopu-agent-2k-fdf-dataset-prepared"   # destination repo for the prepared dataset
OUTPUT_DIR = "./gopu-lora-out"  # local Trainer output directory (checkpoints + final save)
SEED = 42                       # shared seed for sampling and training reproducibility
# ------------- Auth -------------
print(">> Logging into Hugging Face Hub...")
login(token=HF_TOKEN)
# ------------- Load raw datasets -------------
print(">> Loading HuggingFaceFW/finewiki (train split)...")
finewiki = load_dataset("HuggingFaceFW/finewiki", split="train")
print(">> Loading fka/awesome-chatgpt-prompts (train split)...")
awesome = load_dataset("fka/awesome-chatgpt-prompts", split="train")
# ------------- Sampling / preparation -------------
FW_SAMPLE_SIZE = 20000  # cap the finewiki split so the run stays tractable
if len(finewiki) > FW_SAMPLE_SIZE:
    # Fixed-seed shuffle before selection so the sample is reproducible
    finewiki = finewiki.shuffle(seed=SEED).select(range(FW_SAMPLE_SIZE))
print(f">> finewiki sampled: {len(finewiki)} rows; awesome: {len(awesome)} rows")
# ------------- Normalize into instruction / input / output -------------
def map_finewiki(example, max_ctx=2000):
    """Convert one finewiki row into an instruction/input/output record.

    Args:
        example: A finewiki row; the ``title`` and ``text`` fields are used.
        max_ctx: Maximum number of characters of article text kept as context
            (defaults to the previously hard-coded 2000).

    Returns:
        dict with string fields ``instruction``, ``input`` and ``output``.
    """
    # .get() only guards a *missing* key; ``or ""`` also guards a key
    # present with an explicit None value, which would crash the slice below.
    title = example.get("title", "") or ""
    text = example.get("text", "") or ""
    instruction = f"Explique en termes clairs et techniques l'article: {title}"
    input_ctx = text[:max_ctx]  # truncate long articles to bound sequence length
    # Fixed target string: this fine-tune teaches style/format, not content recall.
    output = (
        "Résumé technique et stylisé (GopuOS): "
        "Points clés, concepts, et relations. Maintiens un ton clair, concis, et agentique."
    )
    return {"instruction": instruction, "input": input_ctx, "output": output}
| finewiki_mapped = finewiki.map(map_finewiki) | |
def map_awesome(example):
    """Convert one awesome-chatgpt-prompts row into an instruction/input/output record.

    Args:
        example: A dataset row; the ``act`` and ``prompt`` fields are used.

    Returns:
        dict with string fields ``instruction``, ``input`` and ``output``.
    """
    # ``or ""`` guards a key present with an explicit None value,
    # which .get()'s default alone would not catch.
    act = example.get("act", "") or ""
    prompt = example.get("prompt", "") or ""
    instruction = f"Rôle/acte: {act}. Réponds au prompt en style GopuOS."
    input_ctx = prompt
    # Fixed target string: teaches answer style, not content.
    output = (
        "Réponse alignée GopuOS: concise, technique, introspectable, bilingue possible FR/EN."
    )
    return {"instruction": instruction, "input": input_ctx, "output": output}
awesome_mapped = awesome.map(map_awesome)
# finewiki serves as the train split; the prompt collection as the eval split.
prepared = DatasetDict({
    "train": finewiki_mapped,
    "eval": awesome_mapped
})
# ------------- Save the prepared dataset locally -------------
prepared_dir = Path("./prepared_dataset")
prepared_dir.mkdir(parents=True, exist_ok=True)
# One JSONL file per split. Iterate .items() instead of .keys() + re-lookup.
for split, split_ds in prepared.items():
    out_path = prepared_dir / f"{split}.jsonl"
    with out_path.open("w", encoding="utf-8") as f:
        for ex in split_ds:
            # ensure_ascii=False keeps accented French text readable in the files
            f.write(json.dumps(ex, ensure_ascii=False) + "\n")
# ------------- Push the prepared dataset to the Hub -------------
print(f">> Creating/updating dataset repo: {DATASET_REPO_ID}")
# exist_ok=True makes the call idempotent across reruns of the script
create_repo(repo_id=DATASET_REPO_ID, token=HF_TOKEN, repo_type="dataset", private=False, exist_ok=True)
upload_folder(
    repo_id=DATASET_REPO_ID,
    repo_type="dataset",
    folder_path=str(prepared_dir),
    token=HF_TOKEN,
    # datetime.utcnow() is deprecated (Python 3.12+); use an aware UTC timestamp
    commit_message=f"Prepared dataset push {datetime.now(timezone.utc).isoformat()}",
)
# ------------- Load model/tokenizer -------------
print(f">> Loading base model: {BASE_MODEL_ID}")
# `use_auth_token=` is deprecated (removed in recent transformers); pass `token=`.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, token=HF_TOKEN)
# Many causal-LM tokenizers ship without a pad token, but the padding data
# collator used below requires one — reuse EOS as the pad token in that case.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# ------------- PEFT LoRA config -------------
peft_config = LoraConfig(
    r=8,                                  # LoRA rank
    lora_alpha=16,                        # scaling factor (alpha / r = 2.0)
    target_modules=["q_proj", "v_proj"],  # attention query/value projections only
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, peft_config)
# ------------- Tokenization -------------
def format_example(ex):
    """Render one instruction record into a source prompt (``src``) and a target (``tgt``)."""
    preamble = (
        "Tu es Gopu, agent intelligent de GopuOS. Réponds de manière claire, technique, stylisée, et introspectable."
    )
    turn = f"Utilisateur: {ex['instruction']}\nContexte: {ex['input']}\nGopu:"
    # System preamble and user turn are separated by a blank line.
    return {"src": f"{preamble}\n\n{turn}", "tgt": ex["output"]}
| formatted = prepared.map(format_example) | |
def tokenize(batch):
    """Tokenize one src/tgt pair for causal-LM fine-tuning.

    The previous version tokenized src and tgt independently (via the
    deprecated ``tokenizer.as_target_tokenizer()``), producing ``labels`` of a
    different length than ``input_ids`` — invalid for a decoder-only model,
    whose loss requires labels aligned 1:1 with the input sequence. For causal
    LM the prompt and target are concatenated, and prompt positions are
    excluded from the loss by setting their labels to -100.

    Args:
        batch: One example dict with string fields ``src`` and ``tgt``
            (called with ``batched=False``).

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``,
        all of equal length.
    """
    prompt_ids = tokenizer(
        batch["src"],
        truncation=True,
        max_length=1024,
    )["input_ids"]
    # No special tokens on the target: it simply continues the prompt sequence.
    target_ids = tokenizer(
        batch["tgt"],
        truncation=True,
        max_length=256,
        add_special_tokens=False,
    )["input_ids"]
    eos = [tokenizer.eos_token_id] if tokenizer.eos_token_id is not None else []
    input_ids = prompt_ids + target_ids + eos
    # -100 is ignored by the cross-entropy loss: only target (+ EOS) tokens are learned.
    labels = [-100] * len(prompt_ids) + target_ids + eos
    return {
        "input_ids": input_ids,
        "attention_mask": [1] * len(input_ids),
        "labels": labels,
    }
# Tokenize each split row-by-row; drop the raw text columns so only tensor fields remain.
tokenized_train = formatted["train"].map(tokenize, batched=False, remove_columns=formatted["train"].column_names)
tokenized_eval = formatted["eval"].map(tokenize, batched=False, remove_columns=formatted["eval"].column_names)
# Collator pads each batch to its longest row (labels are padded with the loss-ignore index).
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding="longest")
# ------------- Training -------------
# fp16 requires CUDA; enable it only when a GPU is present so the script also
# runs on a CPU-only host instead of crashing inside the Trainer.
use_fp16 = torch.cuda.is_available()
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,  # effective batch size of 8 per device
    eval_strategy="steps",
    eval_steps=200,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,             # keep only the two most recent checkpoints
    num_train_epochs=1,
    learning_rate=2e-4,             # typical LoRA fine-tuning learning rate
    warmup_steps=200,
    weight_decay=0.01,
    fp16=use_fp16,
    bf16=False,
    report_to=[],                   # disable external experiment tracking
    seed=SEED,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
)
print(">> Starting training...")
trainer.train()
print(">> Training complete")
# ------------- Save and push the model -------------
print(f">> Creating/updating model repo: {ALIGNED_MODEL_ID}")
# exist_ok=True makes the call idempotent across reruns
create_repo(repo_id=ALIGNED_MODEL_ID, token=HF_TOKEN, repo_type="model", private=False, exist_ok=True)
# save_model writes the trained (LoRA adapter) weights; save the tokenizer alongside
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
upload_folder(
    repo_id=ALIGNED_MODEL_ID,
    repo_type="model",
    folder_path=OUTPUT_DIR,
    token=HF_TOKEN,
    # timezone-aware replacement for the deprecated datetime.utcnow()
    commit_message=f"Push aligned LoRA model {datetime.now(timezone.utc).isoformat()}",
)
print(f">> Model pushed: https://huggingface.co/{ALIGNED_MODEL_ID}")