# Train-post / app.py
# (Hugging Face file-viewer metadata: author Mauricio-100, commit f9a4057
# "Create app.py", verified, 6.29 kB — kept here as a comment header.)
import json
import os
from datetime import datetime, timezone
from pathlib import Path

from datasets import load_dataset, DatasetDict
from huggingface_hub import login, create_repo, upload_folder
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer,
)
# ------------- User config -------------
HF_TOKEN = os.environ["HF_TOKEN"]  # ⚠️ read from the environment; raises KeyError if unset
BASE_MODEL_ID = "Gopu-poss/gopu-agent-2k-fdf"                    # model to fine-tune
ALIGNED_MODEL_ID = "Gopu-poss/gopu-agent-2k-fdf-aligned"        # destination repo for the LoRA output
DATASET_REPO_ID = "Gopu-poss/gopu-agent-2k-fdf-dataset-prepared" # destination repo for the prepared dataset
OUTPUT_DIR = "./gopu-lora-out"
SEED = 42  # shared seed for sampling and training reproducibility
# ------------- Auth -------------
print(">> Logging into Hugging Face Hub...")
login(token=HF_TOKEN)
# ------------- Load raw datasets -------------
print(">> Loading HuggingFaceFW/finewiki (train split)...")
finewiki = load_dataset("HuggingFaceFW/finewiki", split="train")
print(">> Loading fka/awesome-chatgpt-prompts (train split)...")
awesome = load_dataset("fka/awesome-chatgpt-prompts", split="train")
# ------------- Sampling / preparation -------------
FW_SAMPLE_SIZE = 20000
# Cap finewiki at a fixed random sample so one run stays tractable;
# `awesome` is small and kept whole.
if len(finewiki) > FW_SAMPLE_SIZE:
    finewiki = finewiki.shuffle(seed=SEED).select(range(FW_SAMPLE_SIZE))
print(f">> finewiki sampled: {len(finewiki)} rows; awesome: {len(awesome)} rows")
# ------------- Normalize into instruction / input / output -------------
def map_finewiki(example):
    """Normalize one finewiki row into an instruction/input/output record.

    The article text is truncated to 2000 characters for the context; the
    output is a fixed style-target string shared by every row.
    """
    return {
        "instruction": (
            "Explique en termes clairs et techniques l'article: "
            f"{example.get('title', '')}"
        ),
        "input": example.get("text", "")[:2000],
        "output": (
            "Résumé technique et stylisé (GopuOS): "
            "Points clés, concepts, et relations. Maintiens un ton clair, concis, et agentique."
        ),
    }
# Normalize every sampled finewiki row into instruction/input/output records.
finewiki_mapped = finewiki.map(map_finewiki)
def map_awesome(example):
    """Normalize one awesome-chatgpt-prompts row into instruction/input/output.

    The row's `act` becomes the role in the instruction, the raw prompt is the
    context, and the output is a fixed style-target string.
    """
    role = example.get("act", "")
    record = {
        "instruction": f"Rôle/acte: {role}. Réponds au prompt en style GopuOS.",
        "input": example.get("prompt", ""),
        "output": "Réponse alignée GopuOS: concise, technique, introspectable, bilingue possible FR/EN.",
    }
    return record
awesome_mapped = awesome.map(map_awesome)
# Bundle the two normalized datasets: finewiki drives training, the prompt
# collection serves as the eval split.
prepared = DatasetDict({
    "train": finewiki_mapped,
    "eval": awesome_mapped
})
# ------------- Local save of the preprocessed dataset -------------
prepared_dir = Path("./prepared_dataset")
prepared_dir.mkdir(parents=True, exist_ok=True)
# One JSONL file per split, UTF-8 with accents preserved (ensure_ascii=False).
for split in prepared.keys():
    out_path = prepared_dir / f"{split}.jsonl"
    with out_path.open("w", encoding="utf-8") as f:
        for ex in prepared[split]:
            f.write(json.dumps(ex, ensure_ascii=False) + "\n")
# ------------- Push the preprocessed dataset to the Hub -------------
print(f">> Creating/updating dataset repo: {DATASET_REPO_ID}")
create_repo(repo_id=DATASET_REPO_ID, token=HF_TOKEN, repo_type="dataset", private=False, exist_ok=True)
upload_folder(
    repo_id=DATASET_REPO_ID,
    repo_type="dataset",
    folder_path=str(prepared_dir),
    token=HF_TOKEN,
    # datetime.utcnow() is deprecated (Python 3.12+); use an aware UTC timestamp.
    commit_message=f"Prepared dataset push {datetime.now(timezone.utc).isoformat()}",
)
# ------------- Load model/tokenizer -------------
print(f">> Loading base model: {BASE_MODEL_ID}")
# `use_auth_token` is deprecated in transformers; `token` is the supported kwarg.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, token=HF_TOKEN)
# Many causal-LM tokenizers ship without a pad token; the padding collator
# used later requires one, so fall back to EOS.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# ------------- PEFT LoRA config -------------
peft_config = LoraConfig(
    r=8,                                   # low-rank adapter dimension
    lora_alpha=16,                         # scaling factor (alpha / r = 2.0)
    target_modules=["q_proj", "v_proj"],   # attention query/value projections
    lora_dropout=0.05,
    bias="none",                           # do not train bias terms
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, peft_config)
# ------------- Tokenization -------------
def format_example(ex):
    """Render one instruction/input/output record into src/tgt text.

    `src` is the full prompt (system persona + user turn ending in "Gopu:");
    `tgt` is the expected completion.
    """
    system_prompt = (
        "Tu es Gopu, agent intelligent de GopuOS. Réponds de manière claire, technique, stylisée, et introspectable."
    )
    user_turn = (
        f"Utilisateur: {ex['instruction']}\n"
        f"Contexte: {ex['input']}\n"
        "Gopu:"
    )
    return {"src": f"{system_prompt}\n\n{user_turn}", "tgt": ex["output"]}
# Render every split into src (prompt) / tgt (completion) text pairs.
formatted = prepared.map(format_example)
def tokenize(batch):
    """Tokenize one formatted example for causal-LM fine-tuning.

    Builds `input_ids` as prompt + target (+ EOS when available) and `labels`
    as a copy with the prompt positions masked to -100 so only the target
    contributes to the loss.

    Fixes two defects in the original: `tokenizer.as_target_tokenizer()` is
    deprecated/removed in recent transformers, and seq2seq-style labels have a
    different length than `input_ids`, which is a shape mismatch for a causal
    LM (its loss shifts labels against same-length logits).
    """
    prompt_ids = tokenizer(
        batch["src"],
        truncation=True,
        max_length=1024,
    )["input_ids"]
    target_ids = tokenizer(
        batch["tgt"],
        truncation=True,
        max_length=256,
        add_special_tokens=False,  # avoid a second BOS in the middle of the sequence
    )["input_ids"]
    eos = [tokenizer.eos_token_id] if tokenizer.eos_token_id is not None else []
    input_ids = prompt_ids + target_ids + eos
    return {
        "input_ids": input_ids,
        "attention_mask": [1] * len(input_ids),
        # -100 masks the prompt so the loss is computed on the target only.
        "labels": [-100] * len(prompt_ids) + target_ids + eos,
    }
# Tokenize both splits, dropping the raw text columns so only model inputs remain.
tokenized_train = formatted["train"].map(tokenize, batched=False, remove_columns=formatted["train"].column_names)
tokenized_eval = formatted["eval"].map(tokenize, batched=False, remove_columns=formatted["eval"].column_names)
# Pads each batch to its longest sequence; DataCollatorForSeq2Seq also pads
# labels (with -100 by default, per its documentation) so padding is ignored by the loss.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding="longest")
# ------------- Training -------------
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,  # effective train batch of 8 per device
    eval_strategy="steps",          # evaluate every `eval_steps` optimizer steps
    eval_steps=200,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,             # keep only the two most recent checkpoints
    num_train_epochs=1,
    learning_rate=2e-4,
    warmup_steps=200,
    weight_decay=0.01,
    fp16=True,   # NOTE(review): fp16 requires a CUDA GPU and errors on CPU-only runs — confirm hardware
    bf16=False,
    report_to=[],  # disable external reporting (W&B, TensorBoard, ...)
    seed=SEED,
)
# Wire model, data, collator, and the arguments above into a Trainer and run.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
)
print(">> Starting training...")
trainer.train()
print(">> Training complete")
# ------------- Save and push the model -------------
print(f">> Creating/updating model repo: {ALIGNED_MODEL_ID}")
create_repo(repo_id=ALIGNED_MODEL_ID, token=HF_TOKEN, repo_type="model", private=False, exist_ok=True)
# Writes the LoRA adapter weights plus the tokenizer files into OUTPUT_DIR.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
upload_folder(
    repo_id=ALIGNED_MODEL_ID,
    repo_type="model",
    folder_path=OUTPUT_DIR,
    token=HF_TOKEN,
    # datetime.utcnow() is deprecated (Python 3.12+); use an aware UTC timestamp.
    commit_message=f"Push aligned LoRA model {datetime.now(timezone.utc).isoformat()}",
)
print(f">> Model pushed: https://huggingface.co/{ALIGNED_MODEL_ID}")