Andro0s committed on
Commit 15a36a2 · verified · 1 Parent(s): 88f9913

Update app.py

Files changed (1)
  1. app.py +240 -5
app.py CHANGED
@@ -1,5 +1,240 @@
- runtime error
- Exit code: 1. Reason: File "/home/user/app/app.py", line 5
- 0%|          | 0/9 [00:00<?, ?it/s]/usr/local/lib/python3.10/site-packages/torch/utils/data/dataloader.py:668: UserWarning: 'pin_memory' argument is set as true but no accelerator is found, then device pinned memory won't be used.
- ^
- SyntaxError: unterminated string literal (detected at line 5)
+ import os
+ import gradio as gr
+ from huggingface_hub import login
+ from datasets import load_dataset
+ from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, pipeline
+ from peft import get_peft_model, LoraConfig, TaskType, PeftModel
+ import json
+
+ # ============================================================
+ # ⚙️ GLOBAL CONFIGURATION
+ # ============================================================
+ BASE_MODEL = "bigcode/santacoder"
+ LORA_PATH = "./lora_output"
+ DATASET_FILE = "codesearchnet_lora_dataset.json"
+ MAX_TOKEN_LENGTH = 256
+ NUM_SAMPLES_TO_PROCESS = 5000
+
+ # Global state, populated by setup_resources() / generate_text()
+ tokenizer = None
+ lora_model = None
+ tokenized_dataset = None
+ lora_generator = None
+
+ # ============================================================
+ # 🚨 DATA PRE-PROCESSING LOGIC (INTEGRATED) 🚨
+ # ============================================================
+ def prepare_codesearchnet():
+     """Downloads, processes, and saves the CodeSearchNet dataset if it does not already exist."""
+     if os.path.exists(DATASET_FILE):
+         print(f"✅ Dataset '{DATASET_FILE}' already exists. Loading it directly.")
+         return
+
+     print(f"🔄 Dataset not found. Starting download and pre-processing of CodeSearchNet ({NUM_SAMPLES_TO_PROCESS} samples)...")
+
+     try:
+         raw_csn = load_dataset('Nan-Do/code-search-net-python', split=f'train[:{NUM_SAMPLES_TO_PROCESS}]')
+
+         def format_for_lora(example):
+             prompt_text = (
+                 f"# Description: {example['docstring_summary']}\n"
+                 f"# Complete the following function:\n"
+                 f"def {example['func_name']}("
+             )
+             completion_text = example['code']
+
+             return {
+                 "prompt": prompt_text,
+                 "completion": completion_text
+             }
+
+         lora_dataset = raw_csn.map(
+             format_for_lora,
+             batched=False,  # Important: batched=False to avoid memory problems on mobile
+             remove_columns=raw_csn.column_names,  # raw_csn is already the 'train' split, so use its columns directly
+         )
+
+         lora_dataset.to_json(DATASET_FILE)
+         print(f"✅ Pre-processing finished. {NUM_SAMPLES_TO_PROCESS} examples saved to '{DATASET_FILE}'.")
+
+     except Exception as e:
+         print(f"❌ CRITICAL error while downloading/processing CodeSearchNet. Error: {e}")
+         # Write a minimal JSON file so the next step does not fail on a missing path
+         minimal_dataset = [{"prompt": "# Load error. Please try again.", "completion": "pass\n"}] * 10
+         with open(DATASET_FILE, 'w') as f:
+             json.dump(minimal_dataset, f)
+
+ # ============================================================
+ # 🔐 AUTHENTICATION AND RESOURCE PRE-LOADING (SINGLETON)
+ # ============================================================
+
+ def setup_resources():
+     """Loads and configures all resources (model, tokenizer, dataset) exactly once."""
+     global tokenizer, lora_model, tokenized_dataset
+
+     # 🛑 1. PREPARE THE CODESEARCHNET DATASET BEFORE TRYING TO LOAD IT
+     prepare_codesearchnet()
+
+     # 2. Hugging Face authentication
+     hf_token = os.environ.get("HF_TOKEN")
+     if hf_token:
+         login(token=hf_token)
+
+     # 3. Load the tokenizer and base model
+     print("\n🔄 Loading model and tokenizer once...")
+     tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
+     base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map="auto")
+
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     # 4. LoRA (PEFT) configuration and wrapping
+     peft_config = LoraConfig(
+         task_type=TaskType.CAUSAL_LM,
+         r=8,
+         lora_alpha=32,
+         lora_dropout=0.1,
+         target_modules=["c_proj", "c_attn"],
+     )
+     lora_model = get_peft_model(base_model, peft_config)
+     print("✅ LoRA model ready. Trainable parameters:")
+     lora_model.print_trainable_parameters()  # prints the summary itself (returns None)
+
+     # 5. Load and tokenize the dataset
+     print(f"📚 Loading and tokenizing dataset from: {DATASET_FILE}...")
+     try:
+         raw_dataset = load_dataset("json", data_files=DATASET_FILE)
+
+         def tokenize_function(examples):
+             # With batched=True, 'prompt' and 'completion' are lists, so join them per example
+             texts = [p + c for p, c in zip(examples["prompt"], examples["completion"])]
+             return tokenizer(
+                 texts,
+                 truncation=True,
+                 padding="max_length",
+                 max_length=MAX_TOKEN_LENGTH
+             )
+
+         tokenized_dataset = raw_dataset.map(
+             tokenize_function,
+             batched=True,
+             remove_columns=raw_dataset["train"].column_names if "train" in raw_dataset else [],
+         )
+         print("✅ Dataset tokenized successfully.")
+     except Exception as e:
+         tokenized_dataset = None
+         print(f"❌ Error loading or tokenizing the dataset. {e}")
+
+
+ # ============================================================
+ # 🧩 TRAINING FUNCTION
+ # ============================================================
+ def train_lora(epochs, batch_size, learning_rate):
+     """Runs LoRA fine-tuning on the tokenized dataset."""
+     global lora_model, tokenized_dataset, lora_generator
+
+     if tokenized_dataset is None or "train" not in tokenized_dataset:
+         return "❌ Error: the dataset could not be loaded or is empty. Training is not possible."
+
+     try:
+         lora_generator = None  # invalidate any cached inference pipeline
+         data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+         training_args = TrainingArguments(
+             output_dir=LORA_PATH,
+             per_device_train_batch_size=int(batch_size),
+             num_train_epochs=float(epochs),
+             learning_rate=float(learning_rate),
+             save_total_limit=1,
+             logging_steps=10,
+             push_to_hub=False,
+         )
+
+         trainer = Trainer(
+             model=lora_model,
+             args=training_args,
+             train_dataset=tokenized_dataset["train"],
+             data_collator=data_collator,
+         )
+
+         trainer.train()
+
+         lora_model.save_pretrained(LORA_PATH)
+         tokenizer.save_pretrained(LORA_PATH)
+
+         return f"✅ Training finished. LoRA adapters saved to **{LORA_PATH}**"
+     except Exception as e:
+         return f"❌ Error during training: {e}"
+
+ # ============================================================
+ # 🤖 GENERATION (INFERENCE) FUNCTION
+ # ============================================================
+ def generate_text(prompt_text):
+     """Generates text with the base model plus the LoRA adapters (if available)."""
+     global lora_generator
+
+     try:
+         if lora_generator is None:
+             base_model_gen = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map="auto")
+
+             if os.path.exists(LORA_PATH):
+                 print("Loading LoRA adapters...")
+                 model_with_lora = PeftModel.from_pretrained(base_model_gen, LORA_PATH)
+                 # merge_and_unload() only exists on PeftModel, so merge only when adapters were loaded
+                 final_model = model_with_lora.merge_and_unload()
+             else:
+                 print("No LoRA adapters found. Using the base model.")
+                 final_model = base_model_gen
+
+             final_model.eval()
+
+             lora_generator = pipeline("text-generation", model=final_model, tokenizer=tokenizer)
+             print("Inference model ready.")
+
+         output = lora_generator(prompt_text, max_new_tokens=150, temperature=0.7, top_p=0.9)
+         return output[0]["generated_text"]
+
+     except Exception as e:
+         return f"❌ Error generating text (make sure the base/LoRA model is loaded): {e}"
+
+ # ============================================================
+ # 💻 GRADIO INTERFACE
+ # ============================================================
+ with gr.Blocks(title="AmorCoderAI - LoRA") as demo:
+     gr.Markdown("# 💙 AmorCoderAI - LoRA Training and Testing")
+     gr.Markdown(f"Base model: `{BASE_MODEL}`. Using **{NUM_SAMPLES_TO_PROCESS}** CodeSearchNet examples.")
+
+     with gr.Tab("🧠 Train (Manual)"):
+         gr.Markdown("--- **CAREFUL!** Training is slow and consumes a lot of resources (VRAM/RAM). ---")
+         epochs = gr.Number(value=1, label="Epochs", precision=0)
+         batch_size = gr.Number(value=2, label="Batch size (adjust to your VRAM)", precision=0)
+         learning_rate = gr.Number(value=5e-5, label="Learning rate")
+         train_button = gr.Button("🚀 Start Manual Training")
+         train_output = gr.Textbox(label="Manual Training Result")
+
+         train_button.click(
+             train_lora,
+             inputs=[epochs, batch_size, learning_rate],
+             outputs=train_output
+         )
+
+     with gr.Tab("✨ Test the model"):
+         prompt = gr.Textbox(label="Write code (e.g. 'def fibonacci(n):')", lines=4)
+         generate_button = gr.Button("💬 Generate code")
+         output_box = gr.Textbox(label="Generated output", lines=10)
+         generate_button.click(generate_text, inputs=prompt, outputs=output_box)
+
+ # ============================================================
+ # 🚀 LAUNCH APP AND AUTO-TRAINING
+ # ============================================================
+ if __name__ == "__main__":
+     setup_resources()
+
+     print("\n=============================================")
+     print(f"🤖 STARTING AUTO-TRAINING (1 epoch, batch size 2) on {NUM_SAMPLES_TO_PROCESS} examples")
+     print("=============================================")
+
+     auto_train_result = train_lora(epochs=1, batch_size=2, learning_rate=5e-5)
+
+     print(f"\nEND OF AUTO-TRAINING: {auto_train_result}")
+
+     print("\n=============================================")
+     print("💻 LAUNCHING GRADIO INTERFACE")
+     print("=============================================")
+     demo.launch()