Mauricio-100 committed on
Commit
f9a4057
·
verified ·
1 Parent(s): 12e7de3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +190 -0
app.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import os
from datetime import datetime, timezone
from pathlib import Path

from datasets import load_dataset, DatasetDict
from huggingface_hub import login, create_repo, upload_folder
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model
16
+
17
# ------------- User configuration -------------
# The Hub token is read from the environment; an unset HF_TOKEN raises KeyError here.
HF_TOKEN = os.environ["HF_TOKEN"]
BASE_MODEL_ID = "Gopu-poss/gopu-agent-2k-fdf"
ALIGNED_MODEL_ID = "Gopu-poss/gopu-agent-2k-fdf-aligned"
DATASET_REPO_ID = "Gopu-poss/gopu-agent-2k-fdf-dataset-prepared"
OUTPUT_DIR = "./gopu-lora-out"
SEED = 42
# Upper bound on the number of FineWiki rows kept for training.
FW_SAMPLE_SIZE = 20000

# ------------- Authentication -------------
print(">> Logging into Hugging Face Hub...")
login(token=HF_TOKEN)

# ------------- Raw dataset loading -------------
print(">> Loading HuggingFaceFW/finewiki (train split)...")
finewiki = load_dataset("HuggingFaceFW/finewiki", split="train")
print(">> Loading fka/awesome-chatgpt-prompts (train split)...")
awesome = load_dataset("fka/awesome-chatgpt-prompts", split="train")

# ------------- Sampling -------------
# Only down-sample when the split is larger than the budget; the shuffle is
# seeded so the selected subset is reproducible.
if len(finewiki) > FW_SAMPLE_SIZE:
    sample_indices = range(FW_SAMPLE_SIZE)
    finewiki = finewiki.shuffle(seed=SEED).select(sample_indices)

print(f">> finewiki sampled: {len(finewiki)} rows; awesome: {len(awesome)} rows")
41
+
42
+ # ------------- Normalisation en instruction / input / output -------------
43
def map_finewiki(example, max_context_chars=2000):
    """Convert one FineWiki row into an instruction / input / output record.

    Args:
        example: Mapping for a single row. The ``title`` and ``text`` keys are
            read; a missing key or a ``None`` value is treated as an empty
            string so a sparse row cannot crash the slice below or leak the
            literal "None" into the instruction.
        max_context_chars: Maximum number of characters of ``text`` kept as
            context (defaults to the original hard-coded 2000).

    Returns:
        dict with the keys ``instruction``, ``input`` and ``output``. The
        ``output`` value is the same fixed style description for every row.
    """
    title = example.get("title") or ""
    text = example.get("text") or ""
    output = (
        "Résumé technique et stylisé (GopuOS): "
        "Points clés, concepts, et relations. Maintiens un ton clair, concis, et agentique."
    )
    return {
        "instruction": f"Explique en termes clairs et techniques l'article: {title}",
        "input": text[:max_context_chars],
        "output": output,
    }
53
+
54
# Add the instruction / input / output fields to every sampled FineWiki row.
finewiki_mapped = finewiki.map(function=map_finewiki)
55
+
56
def map_awesome(example):
    """Convert one awesome-chatgpt-prompts row into an instruction record.

    Args:
        example: Mapping for a single row. The ``act`` and ``prompt`` keys are
            read; a missing key or a ``None`` value is treated as an empty
            string so the literal "None" never ends up in the prompt text.

    Returns:
        dict with the keys ``instruction``, ``input`` and ``output``. The
        ``output`` value is the same fixed style description for every row.
    """
    act = example.get("act") or ""
    prompt = example.get("prompt") or ""
    return {
        "instruction": f"Rôle/acte: {act}. Réponds au prompt en style GopuOS.",
        "input": prompt,
        "output": (
            "Réponse alignée GopuOS: concise, technique, introspectable, bilingue possible FR/EN."
        ),
    }
65
+
66
awesome_mapped = awesome.map(map_awesome)

# `map` keeps the raw source columns next to the new ones. The two sources
# have different raw columns, so keep only the normalized fields: both splits
# then share one schema and the export does not repeat the full article text.
PREPARED_COLUMNS = ["instruction", "input", "output"]
prepared = DatasetDict({
    "train": finewiki_mapped.select_columns(PREPARED_COLUMNS),
    "eval": awesome_mapped.select_columns(PREPARED_COLUMNS),
})

# ------------- Local export of the prepared dataset -------------
# One JSON Lines file per split: <split>.jsonl, UTF-8, non-ASCII kept verbatim.
prepared_dir = Path("./prepared_dataset")
prepared_dir.mkdir(parents=True, exist_ok=True)
for split, rows in prepared.items():
    out_path = prepared_dir / f"{split}.jsonl"
    with out_path.open("w", encoding="utf-8") as f:
        f.writelines(json.dumps(ex, ensure_ascii=False) + "\n" for ex in rows)

# ------------- Push the prepared dataset to the Hub -------------
print(f">> Creating/updating dataset repo: {DATASET_REPO_ID}")
create_repo(repo_id=DATASET_REPO_ID, token=HF_TOKEN, repo_type="dataset", private=False, exist_ok=True)

upload_folder(
    repo_id=DATASET_REPO_ID,
    repo_type="dataset",
    folder_path=str(prepared_dir),
    token=HF_TOKEN,
    # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated since Python 3.12.
    commit_message=f"Prepared dataset push {datetime.now(timezone.utc).isoformat()}",
)
93
+
94
# ------------- Model / tokenizer loading -------------
print(f">> Loading base model: {BASE_MODEL_ID}")
# `token=` replaces the deprecated `use_auth_token=` keyword.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, token=HF_TOKEN)

# The padding collator used later needs a pad token. Many causal-LM tokenizers
# ship without one, so fall back to EOS (stays None if EOS is undefined too).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# ------------- PEFT LoRA config -------------
# NOTE(review): target_modules assumes the base architecture names its
# attention projections "q_proj" / "v_proj" — confirm against the base model.
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, peft_config)
109
+
110
+ # ------------- Tokenization -------------
111
def format_example(ex):
    """Render one prepared record as a prompt string and a target string.

    Args:
        ex: Mapping with the keys ``instruction``, ``input`` and ``output``.

    Returns:
        dict with ``src`` (system preamble, blank line, then the user turn
        ending in "Gopu:") and ``tgt`` (the record's ``output`` unchanged).
    """
    preamble = (
        "Tu es Gopu, agent intelligent de GopuOS. Réponds de manière claire, technique, stylisée, et introspectable."
    )
    turn_lines = [
        f"Utilisateur: {ex['instruction']}",
        f"Contexte: {ex['input']}",
        "Gopu:",
    ]
    prompt = f"{preamble}\n\n" + "\n".join(turn_lines)
    return {"src": prompt, "tgt": ex["output"]}
119
+
120
# Add the "src" / "tgt" prompt fields to every row of every split.
formatted = prepared.map(function=format_example)
121
+
122
def tokenize(batch, max_source_length=1024, max_target_length=256, tok=None):
    """Tokenize prompt/target pairs into causal-LM training features.

    The prompt and target are concatenated into one sequence so that
    ``input_ids`` and ``labels`` always have the same length. Prompt positions
    are labelled -100 so only the target tokens contribute to the loss.
    (Tokenizing the target separately, as before, produced ``labels`` whose
    length differed from ``input_ids`` and whose tokens were never part of the
    model input.)

    Args:
        batch: Mapping with ``src`` and ``tgt``. Each is either a single
            string (``map(..., batched=False)``) or a list of strings
            (``batched=True``).
        max_source_length: Truncation limit, in tokens, for the prompt.
        max_target_length: Truncation limit, in tokens, for the target
            (an EOS token may be appended on top of this).
        tok: Tokenizer callable to use; defaults to the module-level
            ``tokenizer``.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``; lists of
        ints for a single example, lists of lists for a batch.
    """
    tok = tokenizer if tok is None else tok
    eos_id = getattr(tok, "eos_token_id", None)

    def _encode(src, tgt):
        # Encode one prompt/target pair into aligned ids and labels.
        prompt_ids = list(tok(src, truncation=True, max_length=max_source_length)["input_ids"])
        # No special tokens here: the target continues the prompt sequence.
        target_ids = list(
            tok(
                tgt,
                truncation=True,
                max_length=max_target_length,
                add_special_tokens=False,
            )["input_ids"]
        )
        # Close the answer with EOS so the model learns where to stop.
        if eos_id is not None and (not target_ids or target_ids[-1] != eos_id):
            target_ids.append(eos_id)
        input_ids = prompt_ids + target_ids
        return {
            "input_ids": input_ids,
            "attention_mask": [1] * len(input_ids),
            "labels": [-100] * len(prompt_ids) + target_ids,
        }

    if isinstance(batch["src"], str):
        return _encode(batch["src"], batch["tgt"])

    rows = [_encode(src, tgt) for src, tgt in zip(batch["src"], batch["tgt"])]
    return {
        key: [row[key] for row in rows]
        for key in ("input_ids", "attention_mask", "labels")
    }
136
+
137
# Tokenize each split one example at a time and drop every pre-existing
# column, so only the fields returned by `tokenize` remain.
tokenized_train, tokenized_eval = (
    formatted[split_name].map(
        tokenize,
        batched=False,
        remove_columns=formatted[split_name].column_names,
    )
    for split_name in ("train", "eval")
)

# Pads each batch to the length of its longest example.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding="longest",
)
141
+
142
# ------------- Training -------------
training_config = {
    "output_dir": OUTPUT_DIR,
    # Batching
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "gradient_accumulation_steps": 2,
    # Evaluation / logging / checkpoint cadence (in optimizer steps)
    "eval_strategy": "steps",
    "eval_steps": 200,
    "logging_steps": 50,
    "save_steps": 500,
    "save_total_limit": 2,
    # Optimization schedule
    "num_train_epochs": 1,
    "learning_rate": 2e-4,
    "warmup_steps": 200,
    "weight_decay": 0.01,
    # Mixed precision: fp16 on, bf16 off
    "fp16": True,
    "bf16": False,
    # No external experiment trackers
    "report_to": [],
    "seed": SEED,
}
training_args = TrainingArguments(**training_config)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
)

print(">> Starting training...")
trainer.train()
print(">> Training complete")
174
+
175
# ------------- Save and push the model -------------
print(f">> Creating/updating model repo: {ALIGNED_MODEL_ID}")
create_repo(repo_id=ALIGNED_MODEL_ID, token=HF_TOKEN, repo_type="model", private=False, exist_ok=True)

trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

upload_folder(
    repo_id=ALIGNED_MODEL_ID,
    repo_type="model",
    folder_path=OUTPUT_DIR,
    token=HF_TOKEN,
    # OUTPUT_DIR is also the Trainer's output_dir, so it holds intermediate
    # checkpoint-<step>/ directories; keep them out of the published repo.
    ignore_patterns=["checkpoint-*/*"],
    # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated since Python 3.12.
    commit_message=f"Push aligned LoRA model {datetime.now(timezone.utc).isoformat()}",
)

print(f">> Model pushed: https://huggingface.co/{ALIGNED_MODEL_ID}")