Andro0s committed on
Commit 56a01e3 · verified · 1 Parent(s): 15a36a2

Update app.py

Files changed (1)
  1. app.py +3 -240
app.py CHANGED
@@ -1,240 +1,3 @@
- import os
- import gradio as gr
- from huggingface_hub import login
- from datasets import load_dataset
- from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, pipeline
- from peft import get_peft_model, LoraConfig, TaskType, PeftModel
- import json
-
- # ============================================================
- # ⚙️ GLOBAL CONFIGURATION
- # ============================================================
- BASE_MODEL = "bigcode/santacoder"
- LORA_PATH = "./lora_output"
- DATASET_FILE = "codesearchnet_lora_dataset.json"
- MAX_TOKEN_LENGTH = 256
- NUM_SAMPLES_TO_PROCESS = 5000
-
- # Global variables
- tokenizer = None
- lora_model = None
- tokenized_dataset = None
- lora_generator = None
-
- # ============================================================
- # 🚨 DATA PRE-PROCESSING LOGIC (INTEGRATED) 🚨
- # ============================================================
- def prepare_codesearchnet():
-     """Download, process, and save the CodeSearchNet dataset if it does not already exist."""
-     if os.path.exists(DATASET_FILE):
-         print(f"✅ Dataset '{DATASET_FILE}' already exists. Loading it directly.")
-         return
-
-     print(f"🔄 Dataset not found. Starting download and pre-processing of CodeSearchNet ({NUM_SAMPLES_TO_PROCESS} samples)...")
-
-     try:
-         raw_csn = load_dataset('Nan-Do/code-search-net-python', split=f'train[:{NUM_SAMPLES_TO_PROCESS}]')
-
-         def format_for_lora(example):
-             prompt_text = (
-                 f"# Description: {example['docstring_summary']}\n"
-                 f"# Complete the following function:\n"
-                 f"def {example['func_name']}("
-             )
-             completion_text = example['code']
-
-             return {
-                 "prompt": prompt_text,
-                 "completion": completion_text
-             }
-
-         lora_dataset = raw_csn.map(
-             format_for_lora,
-             batched=False,  # Important: batched=False to avoid memory issues on mobile devices
-             remove_columns=raw_csn.column_names,  # a single split was requested, so raw_csn is a Dataset, not a DatasetDict
-         )
-
-         lora_dataset.to_json(DATASET_FILE)
-         print(f"✅ Pre-processing completed. {NUM_SAMPLES_TO_PROCESS} examples saved to '{DATASET_FILE}'.")
-
-     except Exception as e:
-         print(f"❌ CRITICAL error while downloading/processing CodeSearchNet. Error: {e}")
-         # Create a minimal JSON file to avoid path errors in the next step
-         minimal_dataset = [{"prompt": "# Load error. Try again.", "completion": "pass\n"}] * 10
-         with open(DATASET_FILE, 'w') as f:
-             json.dump(minimal_dataset, f)
-
- # ============================================================
- # 🔐 AUTHENTICATION AND RESOURCE PRE-LOADING (SINGLETON)
- # ============================================================
-
- def setup_resources():
-     """Load and configure all resources (model, tokenizer, dataset) only once."""
-     global tokenizer, lora_model, tokenized_dataset
-
-     # 🛑 1. PREPARE THE CODESEARCHNET DATASET BEFORE TRYING TO LOAD IT
-     prepare_codesearchnet()
-
-     # 2. Hugging Face authentication
-     hf_token = os.environ.get("HF_TOKEN")
-     if hf_token:
-         login(token=hf_token)
-
-     # 3. Load the tokenizer and base model
-     print("\n🔄 Loading model and tokenizer only once...")
-     tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
-     base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map="auto")
-
-     if tokenizer.pad_token is None:
-         tokenizer.pad_token = tokenizer.eos_token
-
-     # 4. LoRA (PEFT) configuration and application
-     peft_config = LoraConfig(
-         task_type=TaskType.CAUSAL_LM,
-         r=8,
-         lora_alpha=32,
-         lora_dropout=0.1,
-         target_modules=["c_proj", "c_attn"],
-     )
-     lora_model = get_peft_model(base_model, peft_config)
-     lora_model.print_trainable_parameters()  # ✅ LoRA model ready; prints the trainable-parameter counts itself
-
-     # 5. Load and tokenize the dataset
-     print(f"📚 Loading and tokenizing dataset from: {DATASET_FILE}...")
-     try:
-         raw_dataset = load_dataset("json", data_files=DATASET_FILE)
-
-         def tokenize_function(examples):
-             return tokenizer(
-                 [p + c for p, c in zip(examples["prompt"], examples["completion"])],  # batched=True passes lists, so join per example
-                 truncation=True,
-                 padding="max_length",
-                 max_length=MAX_TOKEN_LENGTH
-             )
-
-         tokenized_dataset = raw_dataset.map(
-             tokenize_function,
-             batched=True,
-             remove_columns=raw_dataset["train"].column_names if "train" in raw_dataset else [],
-         )
-         print("✅ Dataset tokenized successfully.")
-     except Exception as e:
-         tokenized_dataset = None
-         print(f"❌ Error while loading or tokenizing the dataset. {e}")
-
-
- # ============================================================
- # 🧩 TRAINING FUNCTION
- # ============================================================
- def train_lora(epochs, batch_size, learning_rate):
-     """Run the LoRA training."""
-     global lora_model, tokenized_dataset, lora_generator
-
-     if tokenized_dataset is None or "train" not in tokenized_dataset:
-         return "❌ Error: the dataset could not be loaded or is empty. Training cannot start."
-
-     try:
-         lora_generator = None
-         data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
-
-         training_args = TrainingArguments(
-             output_dir=LORA_PATH,
-             per_device_train_batch_size=int(batch_size),
-             num_train_epochs=float(epochs),
-             learning_rate=float(learning_rate),
-             save_total_limit=1,
-             logging_steps=10,
-             push_to_hub=False,
-         )
-
-         trainer = Trainer(
-             model=lora_model,
-             args=training_args,
-             train_dataset=tokenized_dataset["train"],
-             data_collator=data_collator,
-         )
-
-         trainer.train()
-
-         lora_model.save_pretrained(LORA_PATH)
-         tokenizer.save_pretrained(LORA_PATH)
-
-         return f"✅ Training completed. LoRA adapters saved to **{LORA_PATH}**"
-     except Exception as e:
-         return f"❌ Error during training: {e}"
-
- # ============================================================
- # 🤖 GENERATION (INFERENCE) FUNCTION
- # ============================================================
- def generate_text(prompt_text):
-     """Generate text using the base model plus the LoRA adapters."""
-     global lora_generator
-
-     try:
-         if lora_generator is None:
-             base_model_gen = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map="auto")
-
-             if os.path.exists(LORA_PATH):
-                 print("Loading LoRA adapters...")
-                 model_with_lora = PeftModel.from_pretrained(base_model_gen, LORA_PATH)
-                 final_model = model_with_lora.merge_and_unload()  # merging only applies to the PEFT-wrapped model
-             else:
-                 print("No LoRA adapters found. Using the base model.")
-                 final_model = base_model_gen
-
-             final_model.eval()
-
-             lora_generator = pipeline("text-generation", model=final_model, tokenizer=tokenizer)
-             print("Inference model ready.")
-
-         output = lora_generator(prompt_text, max_new_tokens=150, temperature=0.7, top_p=0.9)
-         return output[0]["generated_text"]
-
-     except Exception as e:
-         return f"❌ Error generating text (make sure the base/LoRA model is loaded): {e}"
-
- # ============================================================
- # 💻 GRADIO INTERFACE
- # ============================================================
- with gr.Blocks(title="AmorCoderAI - LoRA") as demo:
-     gr.Markdown("# 💙 AmorCoderAI - LoRA Training and Testing")
-     gr.Markdown(f"Base model: `{BASE_MODEL}`. Using **{NUM_SAMPLES_TO_PROCESS}** CodeSearchNet examples.")
-
-     with gr.Tab("🧠 Train (Manual)"):
-         gr.Markdown("--- **CAUTION!** Training is slow and consumes a lot of resources (VRAM/RAM). ---")
-         epochs = gr.Number(value=1, label="Epochs", precision=0)
-         batch_size = gr.Number(value=2, label="Batch size (adjust to your VRAM)", precision=0)
-         learning_rate = gr.Number(value=5e-5, label="Learning rate")
-         train_button = gr.Button("🚀 Start Manual Training")
-         train_output = gr.Textbox(label="Manual Training Result")
-
-         train_button.click(
-             train_lora,
-             inputs=[epochs, batch_size, learning_rate],
-             outputs=train_output
-         )
-
-     with gr.Tab("✨ Test the model"):
-         prompt = gr.Textbox(label="Write code (e.g. 'def fibonacci(n):')", lines=4)
-         generate_button = gr.Button("💬 Generate code")
-         output_box = gr.Textbox(label="Generated output", lines=10)
-         generate_button.click(generate_text, inputs=prompt, outputs=output_box)
-
- # ============================================================
- # 🚀 LAUNCH APP AND AUTO-TRAINING
- # ============================================================
- if __name__ == "__main__":
-     setup_resources()
-
-     print("\n=============================================")
-     print(f"🤖 STARTING AUTO-TRAINING (1 epoch, batch size 2) using {NUM_SAMPLES_TO_PROCESS} examples")
-     print("=============================================")
-
-     auto_train_result = train_lora(epochs=1, batch_size=2, learning_rate=5e-5)
-
-     print(f"\nEND OF AUTO-TRAINING: {auto_train_result}")
-
-     print("\n=============================================")
-     print("💻 LAUNCHING GRADIO INTERFACE")
-     print("=============================================")
-     demo.launch()
 
+ # Description: Computes the Fibonacci number recursively.
+ # Complete the following function:
+ def fibonacci(n):