import os import gradio as gr from huggingface_hub import login from datasets import load_dataset from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, pipeline from peft import get_peft_model, LoraConfig, TaskType, PeftModel import json # ============================================================ # ⚙️ CONFIGURACIÓN GLOBAL # ============================================================ BASE_MODEL = "bigcode/santacoder" LORA_PATH = "./lora_output" DATASET_FILE = "codesearchnet_lora_dataset.json" MAX_TOKEN_LENGTH = 256 NUM_SAMPLES_TO_PROCESS = 5000 DEFAULT_EPOCHS = 5 # <--- ¡ENTRENAMIENTO PROFUNDO! # Variables globales tokenizer = None lora_model = None tokenized_dataset = None lora_generator = None # ============================================================ # 🚨 LÓGICA DE PRE-PROCESAMIENTO DE DATOS (INTEGRADA) 🚨 # ============================================================ def prepare_codesearchnet(): """Descarga, procesa y guarda el dataset CodeSearchNet si no existe.""" if os.path.exists(DATASET_FILE): print(f"✅ Dataset '{DATASET_FILE}' ya existe. Cargando directamente.") return print(f"🔄 Dataset no encontrado. Iniciando descarga y pre-procesamiento de CodeSearchNet ({NUM_SAMPLES_TO_PROCESS} muestras)...") try: raw_csn = load_dataset('Nan-Do/code-search-net-python', split=f'train[:{NUM_SAMPLES_TO_PROCESS}]') def format_for_lora(example): prompt_text = ( f"# Descripción: {example['docstring_summary']}\n" f"# Completa la siguiente función:\n" f"def {example['func_name']}(" ) completion_text = example['code'] return { "prompt": prompt_text, "completion": completion_text } lora_dataset = raw_csn.map( format_for_lora, batched=False, remove_columns=raw_csn["train"].column_names, ) lora_dataset.to_json(DATASET_FILE) print(f"✅ Pre-procesamiento completado. {NUM_SAMPLES_TO_PROCESS} ejemplos guardados en '{DATASET_FILE}'.") except Exception as e: print(f"❌ Error CRÍTICO al descargar/procesar CodeSearchNet. Error: {e}") minimal_dataset = [{"prompt": "# Error de carga. Intenta de nuevo.", "completion": "pass\n"}] * 10 with open(DATASET_FILE, 'w') as f: json.dump(minimal_dataset, f) # ============================================================ # 🔐 AUTENTICACIÓN Y PRE-CARGA DE RECURSOS (SINGLETON) # ============================================================ def setup_resources(): """Carga y configura todos los recursos (modelo, tokenizer, dataset) una sola vez.""" global tokenizer, lora_model, tokenized_dataset # 🛑 1. PREPARA EL DATASET DE CODESEARCHNET ANTES DE INTENTAR CARGARLO prepare_codesearchnet() # 2. Autenticación con Hugging Face hf_token = os.environ.get("HF_TOKEN") if hf_token: login(token=hf_token) # 3. Carga del Tokenizer y Modelo Base print("\n🔄 Cargando modelo y tokenizer una sola vez...") tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL) base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map="auto") if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token # 4. Configuración y Aplicación LoRA (PEFT) peft_config = LoraConfig( task_type=TaskType.CAUSAL_LM, r=8, lora_alpha=32, lora_dropout=0.1, target_modules=["c_proj", "c_attn"], ) lora_model = get_peft_model(base_model, peft_config) print(f"✅ Modelo LoRA preparado. Parámetros entrenables: {lora_model.print_trainable_parameters()}") # 5. Carga y Tokenización del Dataset print(f"📚 Cargando y tokenizando dataset de: {DATASET_FILE}...") try: raw_dataset = load_dataset("json", data_files=DATASET_FILE)