File size: 5,068 Bytes
7b02281
0462c81
2636834
0462c81
413d0ff
 
 
 
 
 
 
 
a7bd5fa
0462c81
 
 
2636834
a7bd5fa
2636834
 
 
413d0ff
a7bd5fa
0462c81
 
 
413d0ff
 
 
 
 
 
0462c81
 
413d0ff
 
 
7b02281
0462c81
413d0ff
0462c81
 
 
413d0ff
0462c81
 
413d0ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0462c81
413d0ff
0462c81
 
413d0ff
 
 
0462c81
413d0ff
 
 
0462c81
 
413d0ff
0462c81
 
 
 
 
 
 
413d0ff
0462c81
413d0ff
 
0462c81
 
a7bd5fa
413d0ff
0462c81
413d0ff
0462c81
 
413d0ff
0462c81
2636834
413d0ff
 
 
 
 
 
 
 
 
 
7b02281
0462c81
 
 
413d0ff
0462c81
413d0ff
0462c81
 
 
 
 
 
413d0ff
0462c81
 
 
 
 
413d0ff
0462c81
 
 
 
 
2636834
413d0ff
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import os
import gradio as gr
from huggingface_hub import login
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    pipeline,
)

# ============================================================
# 🔐 Secure authentication with your Hugging Face token
# ============================================================
# Read the token from the environment (set via Settings → Secrets in the Space).
hf_token = os.environ.get("HF_TOKEN")

# Log in only when a token is present; otherwise warn and continue,
# since public models can still be downloaded anonymously.
if hf_token:
    login(token=hf_token)
else:
    print("⚠️ No se encontró el token. Agrega 'HF_TOKEN' en Settings → Secrets → Add new secret")

# ============================================================
# ⚙️ Base model and dataset configuration
# ============================================================
MODEL_NAME = "bigcode/santacoder"   # Open model, compatible with Hugging Face
DATASET_PATH = "dataset.json"        # Dataset file uploaded to the Space
OUTPUT_DIR = "lora_output"           # Folder where the trained model is saved

# Create the output folder if it does not exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load the base model and tokenizer once at startup so the training and
# inference paths share the same instances.
print("🔄 Cargando modelo base...")
# `token=` replaces the deprecated `use_auth_token=` keyword (removed in
# recent transformers releases). Passing None is fine for public models.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, token=hf_token)

# ============================================================
# 🧩 Función de entrenamiento LoRA (simple y funcional)
# ============================================================
def train_lora(epochs, batch_size, learning_rate):
    """Fine-tune the base model on the local JSON dataset.

    Args:
        epochs: Number of training epochs (coerced to int).
        batch_size: Per-device train batch size (coerced to int).
        learning_rate: Optimizer learning rate (coerced to float).

    Returns:
        A human-readable status string: success message, or the error
        message when any step fails (the Gradio UI shows it verbatim).
    """
    try:
        # Load the JSON dataset; a plain JSON file yields a DatasetDict
        # with a single "train" split.
        dataset = load_dataset("json", data_files=DATASET_PATH)

        # SantaCoder's tokenizer has no pad token by default, and
        # padding="max_length" below requires one — fall back to EOS.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Tokenize the dataset. NOTE: with batched=True each column is a
        # *list* of strings, so prompt/completion must be joined
        # element-wise; `batch["prompt"] + batch["completion"]` would
        # concatenate the two lists (2× misaligned texts) instead of the
        # paired strings.
        def tokenize_fn(batch):
            texts = [p + c for p, c in zip(batch["prompt"], batch["completion"])]
            return tokenizer(
                texts,
                truncation=True,
                padding="max_length",
                max_length=256,
            )

        tokenized = dataset.map(tokenize_fn, batched=True)

        # Causal-LM collator (mlm=False) copies input_ids into labels.
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer, mlm=False
        )

        # Training configuration — no Hub push, no external reporting.
        training_args = TrainingArguments(
            output_dir=OUTPUT_DIR,
            per_device_train_batch_size=int(batch_size),
            num_train_epochs=int(epochs),
            learning_rate=float(learning_rate),
            logging_steps=10,
            save_total_limit=1,
            push_to_hub=False,
            report_to="none",
        )

        # Trainer drives the fine-tuning loop over the module-level model.
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized["train"],
            data_collator=data_collator,
        )

        # Run training (updates `model` in place).
        trainer.train()

        # Persist the fine-tuned weights and tokenizer so generate_text
        # can load them from OUTPUT_DIR.
        model.save_pretrained(OUTPUT_DIR)
        tokenizer.save_pretrained(OUTPUT_DIR)

        return "✅ Entrenamiento completado con éxito. Modelo guardado en ./lora_output"
    except Exception as e:
        # Surface the error to the UI instead of crashing the Space.
        return f"❌ Error durante el entrenamiento: {str(e)}"

# ============================================================
# 🤖 Función de prueba del modelo entrenado
# ============================================================
def generate_text(prompt):
    """Generate a completion for *prompt* with the fine-tuned model.

    Builds a fresh text-generation pipeline from the saved output
    directory on every call (so it always picks up the latest training
    run) and returns either the generated text or an error message.
    """
    try:
        text_gen = pipeline(
            "text-generation",
            model=OUTPUT_DIR,
            tokenizer=tokenizer,
        )
        results = text_gen(prompt, max_new_tokens=100, temperature=0.7, top_p=0.9)
        first_result = results[0]
        return first_result["generated_text"]
    except Exception as err:
        # Typically fires when OUTPUT_DIR has no trained model yet.
        return f"⚠️ Error al generar texto: {str(err)}"

# ============================================================
# 💻 Interfaz de usuario (Gradio)
# ============================================================
# ============================================================
# 💻 User interface (Gradio)
# ============================================================
with gr.Blocks(title="💙 AmorCoderAI - Entrenamiento LoRA") as demo:
    # Page header.
    gr.Markdown("# 💙 AmorCoderAI - Entrenamiento y Pruebas")
    gr.Markdown("Entrena y prueba tu modelo basado en `bigcode/santacoder` con LoRA.")

    # Training tab: hyper-parameter inputs wired to train_lora.
    with gr.Tab("🧠 Entrenar"):
        epochs_input = gr.Number(value=1, label="Épocas")
        batch_input = gr.Number(value=2, label="Tamaño de lote")
        lr_input = gr.Number(value=5e-5, label="Tasa de aprendizaje")
        start_button = gr.Button("🚀 Iniciar entrenamiento")
        status_box = gr.Textbox(label="Resultado", lines=3)
        start_button.click(
            train_lora,
            inputs=[epochs_input, batch_input, lr_input],
            outputs=status_box,
        )

    # Inference tab: free-form prompt wired to generate_text.
    with gr.Tab("✨ Probar modelo"):
        prompt_input = gr.Textbox(label="Escribe un prompt")
        run_button = gr.Button("💬 Generar texto")
        result_box = gr.Textbox(label="Salida generada", lines=6)
        run_button.click(generate_text, inputs=prompt_input, outputs=result_box)

# ============================================================
# 🚀 Launch the app
# ============================================================
# Bind to all interfaces on port 7860 — the expected configuration for a
# Hugging Face Space container.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)