Andro0s committed on
Commit ce6b6c1 · verified · 1 Parent(s): 2176a59

Update app.py

Files changed (1)
  1. app.py +17 -108
app.py CHANGED
@@ -1,108 +1,17 @@
- import os
- import gradio as gr
- from huggingface_hub import login
- from datasets import load_dataset
- from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, pipeline
- from peft import get_peft_model, LoraConfig, TaskType, PeftModel
- import json
-
- # ============================================================
- # ⚙️ GLOBAL CONFIGURATION
- # ============================================================
- # Base model for code generation
- BASE_MODEL = "bigcode/santacoder"
- LORA_PATH = "./lora_output"  # Directory where the LoRA adapters are saved
-
- # File where the processed dataset is stored
- DATASET_FILE = "codesearchnet_lora_dataset.json"
- MAX_TOKEN_LENGTH = 256  # Uniform sequence length
- NUM_SAMPLES_TO_PROCESS = 5000
- DEFAULT_EPOCHS = 5  # <--- DEEP TRAINING!
-
- # Global variables
- tokenizer = None
- lora_model = None
- tokenized_dataset = None
- lora_generator = None
-
- # ============================================================
- # 🚨 DATA PRE-PROCESSING LOGIC (INTEGRATED) 🚨
- # ============================================================
- def prepare_codesearchnet():
-     """Download, process, and save the CodeSearchNet dataset if it does not already exist."""
-     if os.path.exists(DATASET_FILE):
-         print(f"✅ Dataset '{DATASET_FILE}' already exists.")
-         return
-
-     print(f"🔄 Downloading and processing CodeSearchNet ({NUM_SAMPLES_TO_PROCESS} samples)...")
-
-     try:
-         raw_csn = load_dataset('Nan-Do/code-search-net-python', split=f'train[:{NUM_SAMPLES_TO_PROCESS}]')
-
-         def format_for_lora(example):
-             prompt_text = (
-                 f"# Description: {example['docstring_summary']}\n"
-                 f"# Complete the following function:\n"
-                 f"def {example['func_name']}("
-             )
-             completion_text = example['code']
-
-             return {
-                 "prompt": prompt_text,
-                 "completion": completion_text
-             }
-
-         lora_dataset = raw_csn.map(
-             format_for_lora,
-             batched=False,
-             remove_columns=raw_csn.column_names,  # raw_csn is already the selected split, not a DatasetDict
-         )
-
-         lora_dataset.to_json(DATASET_FILE)
-         print(f"✅ Pre-processing finished. {NUM_SAMPLES_TO_PROCESS} examples saved to '{DATASET_FILE}'.")
-
-     except Exception as e:
-         print(f"❌ CRITICAL error while downloading/processing CodeSearchNet. Error: {e}")
-         minimal_dataset = [{"prompt": "# Load error. Please try again.", "completion": "pass\n"}] * 10
-         with open(DATASET_FILE, 'w') as f:
-             json.dump(minimal_dataset, f)
-
- # ============================================================
- # 🔐 AUTHENTICATION AND RESOURCE PRE-LOADING (SINGLETON)
- # ============================================================
-
- def setup_resources():
-     """Load and configure all resources (model, tokenizer, dataset) exactly once."""
-     global tokenizer, lora_model, tokenized_dataset
-
-     prepare_codesearchnet()
-
-     hf_token = os.environ.get("HF_TOKEN")
-     if hf_token:
-         login(token=hf_token)
-
-     # 1. Load the tokenizer and base model
-     print("\n🔄 Loading base model and tokenizer...")
-     tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
-     base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map="auto")
-
-     if tokenizer.pad_token is None:
-         tokenizer.pad_token = tokenizer.eos_token
-
-     # 2. Configure and apply LoRA (PEFT)
-     peft_config = LoraConfig(
-         task_type=TaskType.CAUSAL_LM,
-         r=8,
-         lora_alpha=32,
-         lora_dropout=0.1,
-         target_modules=["c_proj", "c_attn"],
-     )
-     lora_model = get_peft_model(base_model, peft_config)
-
-     # This print was simplified so it cannot break
-     print(f"✅ LoRA model ready. Trainable parameters are set up.")
-
-     # 3. Load and tokenize the dataset
-     print(f"📚 Loading and tokenizing dataset: {DATASET_FILE}...")
-     try:
-         raw_dataset = load_dataset("json", data_files=DATAS
 
+ runtime error
+ Exit code: 1. Reason: File "/home/user/app/app.py", line 108
+     raw_dataset = load_dataset("json", data_files=DATAS
+                               ^
+ SyntaxError: '(' was never closed
+ Container logs:
+
+ ===== Application Startup at 2025-10-21 06:25:42 =====
+
+ File "/home/user/app/app.py", line 108
+     raw_dataset = load_dataset("json", data_files=DATAS
+                               ^
+ SyntaxError: '(' was never closed
+ File "/home/user/app/app.py", line 108
+     raw_dataset = load_dataset("json", data_files=DATAS
+                               ^
+ SyntaxError: '(' was never closed
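
The traceback above only says that line 108 of the previous app.py ended in the middle of the load_dataset( call, so the file could never be parsed. Below is a minimal sketch of how that line and the following tokenization step were presumably meant to continue; the completed call, the tokenize_function helper, and the padding/truncation settings are assumptions inferred from the constants and imports declared earlier in the old file, not the code that was actually committed.

    # Sketch only: assumes DATASET_FILE, MAX_TOKEN_LENGTH, and tokenizer
    # from the earlier part of the old app.py are in scope.
    raw_dataset = load_dataset("json", data_files=DATASET_FILE, split="train")

    def tokenize_function(example):
        # Join prompt and completion into one causal-LM training string.
        text = example["prompt"] + example["completion"]
        return tokenizer(
            text,
            truncation=True,
            max_length=MAX_TOKEN_LENGTH,
            padding="max_length",
        )

    tokenized_dataset = raw_dataset.map(
        tokenize_function,
        remove_columns=raw_dataset.column_names,
    )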