import os
import json
from pathlib import Path
from datetime import datetime, timezone
from datasets import load_dataset, DatasetDict
from huggingface_hub import login, create_repo, upload_folder
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model
# ------------- User config -------------
HF_TOKEN = os.environ["HF_TOKEN"]  # ⚠️ reads your token from the environment
BASE_MODEL_ID = "Gopu-poss/gopu-agent-2k-fdf"
ALIGNED_MODEL_ID = "Gopu-poss/gopu-agent-2k-fdf-aligned"
DATASET_REPO_ID = "Gopu-poss/gopu-agent-2k-fdf-dataset-prepared"
OUTPUT_DIR = "./gopu-lora-out"
SEED = 42
# ------------- Auth -------------
print(">> Logging into Hugging Face Hub...")
login(token=HF_TOKEN)
# ------------- Load raw datasets -------------
print(">> Loading HuggingFaceFW/finewiki (train split)...")
finewiki = load_dataset("HuggingFaceFW/finewiki", split="train")
print(">> Loading fka/awesome-chatgpt-prompts (train split)...")
awesome = load_dataset("fka/awesome-chatgpt-prompts", split="train")
# ------------- Sampling / preparation -------------
FW_SAMPLE_SIZE = 20000
if len(finewiki) > FW_SAMPLE_SIZE:
    finewiki = finewiki.shuffle(seed=SEED).select(range(FW_SAMPLE_SIZE))
print(f">> finewiki sampled: {len(finewiki)} rows; awesome: {len(awesome)} rows")
# ------------- Normalize to instruction / input / output -------------
def map_finewiki(example):
    title = example.get("title", "")
    text = example.get("text", "")
    instruction = f"Explique en termes clairs et techniques l'article: {title}"
    input_ctx = text[:2000]
    output = (
        "Résumé technique et stylisé (GopuOS): "
        "Points clés, concepts, et relations. Maintiens un ton clair, concis, et agentique."
    )
    return {"instruction": instruction, "input": input_ctx, "output": output}
finewiki_mapped = finewiki.map(map_finewiki)
def map_awesome(example):
    act = example.get("act", "")
    prompt = example.get("prompt", "")
    instruction = f"Rôle/acte: {act}. Réponds au prompt en style GopuOS."
    input_ctx = prompt
    output = (
        "Réponse alignée GopuOS: concise, technique, introspectable, bilingue possible FR/EN."
    )
    return {"instruction": instruction, "input": input_ctx, "output": output}
awesome_mapped = awesome.map(map_awesome)
prepared = DatasetDict({
    "train": finewiki_mapped,
    "eval": awesome_mapped,
})
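print(">> Prepared splits: " + ", ".join(f"{k}={len(v)}" for k, v in prepared.items()))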
# ------------- Save the prepared dataset locally -------------
prepared_dir = Path("./prepared_dataset")
prepared_dir.mkdir(parents=True, exist_ok=True)
for split in prepared.keys():
    out_path = prepared_dir / f"{split}.jsonl"
    with out_path.open("w", encoding="utf-8") as f:
        for ex in prepared[split]:
            f.write(json.dumps(ex, ensure_ascii=False) + "\n")
# ------------- Push the prepared dataset to the Hub -------------
print(f">> Creating/updating dataset repo: {DATASET_REPO_ID}")
create_repo(repo_id=DATASET_REPO_ID, token=HF_TOKEN, repo_type="dataset", private=False, exist_ok=True)
upload_folder(
    repo_id=DATASET_REPO_ID,
    repo_type="dataset",
    folder_path=str(prepared_dir),
    token=HF_TOKEN,
    commit_message=f"Prepared dataset push {datetime.now(timezone.utc).isoformat()}",
)
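print(f">> Dataset pushed: https://huggingface.co/datasets/{DATASET_REPO_ID}")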
# ------------- Load model / tokenizer -------------
print(f">> Loading base model: {BASE_MODEL_ID}")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, token=HF_TOKEN)
if tokenizer.pad_token is None:
    # Causal LM tokenizers often ship without a pad token; the collator
    # below needs one, so fall back to EOS.
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, token=HF_TOKEN)
# ------------- PEFT LoRA config -------------
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, peft_config)
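# Report how many parameters LoRA actually trains vs. the frozen total.
model.print_trainable_parameters()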
# ------------- Tokenization -------------
def format_example(ex):
    system_prompt = (
        "Tu es Gopu, agent intelligent de GopuOS. Réponds de manière claire, technique, stylisée, et introspectable."
    )
    user = f"Utilisateur: {ex['instruction']}\nContexte: {ex['input']}\nGopu:"
    target = ex["output"]
    src = system_prompt + "\n\n" + user
    return {"src": src, "tgt": target}
formatted = prepared.map(format_example)
def tokenize(ex):
    # For a causal LM, labels must align token-for-token with input_ids
    # (the model shifts them internally), so tokenize prompt and target
    # concatenated rather than as two separate sequences.
    full_text = ex["src"] + " " + ex["tgt"] + (tokenizer.eos_token or "")
    model_inputs = tokenizer(full_text, truncation=True, max_length=1024)
    labels = model_inputs["input_ids"].copy()
    # Mask the prompt tokens with -100 so loss is computed on the target only.
    prompt_len = len(tokenizer(ex["src"], truncation=True, max_length=1024)["input_ids"])
    prompt_len = min(prompt_len, len(labels))
    labels[:prompt_len] = [-100] * prompt_len
    model_inputs["labels"] = labels
    return model_inputs
tokenized_train = formatted["train"].map(tokenize, batched=False, remove_columns=formatted["train"].column_names)
tokenized_eval = formatted["eval"].map(tokenize, batched=False, remove_columns=formatted["eval"].column_names)
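# Sanity check: decode one tokenized example to confirm prompt and target
# were concatenated as expected (first 50 tokens shown).
print(">> Sample input:", tokenizer.decode(tokenized_train[0]["input_ids"][:50]))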
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding="longest")
# ------------- Training -------------
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    eval_strategy="steps",
    eval_steps=200,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    num_train_epochs=1,
    learning_rate=2e-4,
    warmup_steps=200,
    weight_decay=0.01,
    fp16=True,  # assumes a CUDA GPU; set to False for CPU-only runs
    bf16=False,
    report_to=[],
    seed=SEED,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
)
print(">> Starting training...")
trainer.train()
print(">> Training complete")
# ------------- Save and push the model -------------
print(f">> Creating/updating model repo: {ALIGNED_MODEL_ID}")
create_repo(repo_id=ALIGNED_MODEL_ID, token=HF_TOKEN, repo_type="model", private=False, exist_ok=True)
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
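# Optional: merge the LoRA weights into the base model for standalone
# serving (PEFT's merge_and_unload); left commented out because the
# adapter-only checkpoint saved above is much smaller to host.
# merged = model.merge_and_unload()
# merged.save_pretrained(OUTPUT_DIR + "-merged")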
upload_folder(
    repo_id=ALIGNED_MODEL_ID,
    repo_type="model",
    folder_path=OUTPUT_DIR,
    token=HF_TOKEN,
    commit_message=f"Push aligned LoRA model {datetime.now(timezone.utc).isoformat()}",
)
print(f">> Model pushed: https://huggingface.co/{ALIGNED_MODEL_ID}")