Andro0s commited on
Commit
bafda90
·
verified ·
1 Parent(s): f67197f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +105 -13
app.py CHANGED
@@ -1,13 +1,105 @@
1
- runtime error
2
- Exit code: 1. Reason: File "/home/user/app/app.py", line 3
3
- def fibonacci(n):Ap p.KZ.A pBApp ppvertApKert ppvAADB_ZA. pr prtoolsAppP.ADB view::EmKDBEmABP.A.ValuesEmBase.b pDB_comAppDBKtoolsDBAADB.BView.ScriptsDBAvalertCurrentDBKDBABDBAAImpl viewABaseDBCurrentBImplDBBDB.b::DBBaseDBDBADBBviewb::DB.DBA.ert.A_BaseValuesADB.DBADBDBDBBDBImplDB_DBAerDBDBABaseDB.privateDBImplBaseEmBaseDBBase::DBADBB
4
- ^
5
- SyntaxError: invalid syntax
6
- Container logs:
7
-
8
- ===== Application Startup at 2025-10-21 06:08:54 =====
9
-
10
- File "/home/user/app/app.py", line 3
11
- def fibonacci(n):Ap p.KZ.A pBApp ppvertApKert ppvAADB_ZA. pr prtoolsAppP.ADB view::EmKDBEmABP.A.ValuesEmBase.b pDB_comAppDBKtoolsDBAADB.BView.ScriptsDBAvalertCurrentDBKDBABDBAAImpl viewABaseDBCurrentBImplDBBDB.b::DBBaseDBDBADBBviewb::DB.DBA.ert.A_BaseValuesADB.DBADBDBDBBDBImplDB_DBAerDBDBABaseDB.privateDBImplBaseEmBaseDBBase::DBADBB
12
- ^
13
- SyntaxError: invalid syntax
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ from huggingface_hub import login
4
+ from datasets import load_dataset
5
+ from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, pipeline
6
+ from peft import get_peft_model, LoraConfig, TaskType, PeftModel
7
+ import json
8
+
9
# ============================================================
# ⚙️ GLOBAL CONFIGURATION
# ============================================================
BASE_MODEL = "bigcode/santacoder"  # base causal-LM checkpoint to fine-tune
LORA_PATH = "./lora_output"  # output directory for the trained LoRA adapter
DATASET_FILE = "codesearchnet_lora_dataset.json"  # local cache of the pre-processed dataset
MAX_TOKEN_LENGTH = 256  # presumably the tokenizer truncation length — confirm against usage below
NUM_SAMPLES_TO_PROCESS = 5000  # number of CodeSearchNet examples to download and format
DEFAULT_EPOCHS = 5  # <--- DEEP TRAINING!

# Module-level singletons, populated once by setup_resources().
tokenizer = None
lora_model = None
tokenized_dataset = None
lora_generator = None
24
+
25
+ # ============================================================
26
+ # 🚨 LÓGICA DE PRE-PROCESAMIENTO DE DATOS (INTEGRADA) 🚨
27
+ # ============================================================
28
def prepare_codesearchnet():
    """Download, format, and cache the CodeSearchNet dataset if not already present.

    Produces DATASET_FILE (JSON) containing ``{"prompt", "completion"}`` pairs
    suitable for LoRA fine-tuning. If the file already exists the function
    returns immediately. On any failure it writes a minimal placeholder
    dataset so downstream loading does not crash.
    """
    if os.path.exists(DATASET_FILE):
        print(f"✅ Dataset '{DATASET_FILE}' ya existe. Cargando directamente.")
        return

    print(f"🔄 Dataset no encontrado. Iniciando descarga y pre-procesamiento de CodeSearchNet ({NUM_SAMPLES_TO_PROCESS} muestras)...")

    try:
        # A sliced split string returns a single Dataset, not a DatasetDict.
        raw_csn = load_dataset('Nan-Do/code-search-net-python', split=f'train[:{NUM_SAMPLES_TO_PROCESS}]')

        def format_for_lora(example):
            # Prompt = docstring summary plus the function header stub;
            # completion = the full function source.
            prompt_text = (
                f"# Descripción: {example['docstring_summary']}\n"
                f"# Completa la siguiente función:\n"
                f"def {example['func_name']}("
            )
            completion_text = example['code']

            return {
                "prompt": prompt_text,
                "completion": completion_text
            }

        lora_dataset = raw_csn.map(
            format_for_lora,
            batched=False,
            # BUG FIX: raw_csn is already a Dataset (sliced split), so it has
            # no "train" key — indexing it with "train" raised a KeyError that
            # the except below swallowed, silently replacing the real dataset
            # with the placeholder. Use the Dataset's own column_names.
            remove_columns=raw_csn.column_names,
        )

        lora_dataset.to_json(DATASET_FILE)
        print(f"✅ Pre-procesamiento completado. {NUM_SAMPLES_TO_PROCESS} ejemplos guardados en '{DATASET_FILE}'.")

    except Exception as e:
        # Best-effort fallback: keep the app bootable with a tiny dummy dataset.
        print(f"❌ Error CRÍTICO al descargar/procesar CodeSearchNet. Error: {e}")
        minimal_dataset = [{"prompt": "# Error de carga. Intenta de nuevo.", "completion": "pass\n"}] * 10
        with open(DATASET_FILE, 'w') as f:
            json.dump(minimal_dataset, f)
66
+
67
+ # ============================================================
68
+ # 🔐 AUTENTICACIÓN Y PRE-CARGA DE RECURSOS (SINGLETON)
69
+ # ============================================================
70
+
71
+ def setup_resources():
72
+ """Carga y configura todos los recursos (modelo, tokenizer, dataset) una sola vez."""
73
+ global tokenizer, lora_model, tokenized_dataset
74
+
75
+ # 🛑 1. PREPARA EL DATASET DE CODESEARCHNET ANTES DE INTENTAR CARGARLO
76
+ prepare_codesearchnet()
77
+
78
+ # 2. Autenticación con Hugging Face
79
+ hf_token = os.environ.get("HF_TOKEN")
80
+ if hf_token:
81
+ login(token=hf_token)
82
+
83
+ # 3. Carga del Tokenizer y Modelo Base
84
+ print("\n🔄 Cargando modelo y tokenizer una sola vez...")
85
+ tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
86
+ base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map="auto")
87
+
88
+ if tokenizer.pad_token is None:
89
+ tokenizer.pad_token = tokenizer.eos_token
90
+
91
+ # 4. Configuración y Aplicación LoRA (PEFT)
92
+ peft_config = LoraConfig(
93
+ task_type=TaskType.CAUSAL_LM,
94
+ r=8,
95
+ lora_alpha=32,
96
+ lora_dropout=0.1,
97
+ target_modules=["c_proj", "c_attn"],
98
+ )
99
+ lora_model = get_peft_model(base_model, peft_config)
100
+ print(f"✅ Modelo LoRA preparado. Parámetros entrenables: {lora_model.print_trainable_parameters()}")
101
+
102
+ # 5. Carga y Tokenización del Dataset
103
+ print(f"📚 Cargando y tokenizando dataset de: {DATASET_FILE}...")
104
+ try:
105
+ raw_dataset = load_dataset("json", data_files=DATASET_FILE)