Spaces:

TeszenAI
/

MTP-3.3.1

Sleeping

App Files Files Community

teszenofficial committed on Apr 11

Commit

f5aa463

verified ·

1 Parent(s): aab0558

Update app.py

Browse files

Files changed (1) hide show

app.py +657 -172

app.py CHANGED Viewed

@@ -1,316 +1,801 @@
 import os
 import torch
-from fastapi import FastAPI
-from fastapi.responses import HTMLResponse
 from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel
-from transformers import AutoModelForCausalLM, AutoTokenizer
 import uvicorn
-import re
-# ==================== CONFIGURACIÓN ====================
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"📱 Dispositivo: {DEVICE}")
-# Usar un modelo pequeño pero FUNCIONAL de HuggingFace
-# Opciones: "microsoft/DialoGPT-small" (mejor para conversación)
-#          "TinyLlama/TinyLlama-1.1B-Chat-v1.0" (más potente pero más lento)
-MODEL_NAME = "microsoft/DialoGPT-small"  # ~60MB, rápido y funcional
-print(f"📦 Cargando modelo {MODEL_NAME}...")
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)
-model.eval()
-print(f"✅ Modelo cargado: {sum(p.numel() for p in model.parameters()):,} parámetros")
-# ==================== API ====================
-app = FastAPI()
-app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
-class PromptRequest(BaseModel):
-    text: str
-def clean_response(text: str) -> str:
     """Limpia la respuesta del modelo"""
     if not text:
         return ""
-    # Eliminar caracteres especiales
-    text = re.sub(r'<\|.*?\|>', '', text)
-    text = re.sub(r'\[.*?\]', '', text)
     text = re.sub(r'\s+', ' ', text).strip()
-    # Limitar longitud
-    if len(text) > 400:
-        text = text[:400]
-        last_dot = text.rfind('.')
-        if last_dot > 200:
-            text = text[:last_dot + 1]
-    return text if text else "Lo siento, no pude generar una respuesta."
-@app.post("/generate")
-async def generate(req: PromptRequest):
-    user_input = req.text.strip()
-    if not user_input:
-        return {"reply": "Escribe un mensaje"}
-    # Formatear entrada para el modelo
-    formatted_input = f"User: {user_input}\nBot:"
-    # Tokenizar
-    inputs = tokenizer.encode(formatted_input, return_tensors="pt").to(DEVICE)
-    # Generar
-    with torch.no_grad():
-        outputs = model.generate(
-            inputs,
-            max_new_tokens=100,
-            temperature=0.7,
-            top_k=50,
-            top_p=0.9,
-            do_sample=True,
-            pad_token_id=tokenizer.eos_token_id
-        )
-    # Decodificar
-    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    # Extraer solo la respuesta del bot
-    if "Bot:" in response:
-        response = response.split("Bot:")[-1].strip()
-    elif "User:" in response:
-        parts = response.split("User:")
-        response = parts[-1].strip() if len(parts) > 1 else response
-    response = clean_response(response)
-    print(f"📝 Usuario: {user_input[:50]}")
-    print(f"🤖 Respuesta: {response[:100]}")
-    return {"reply": response}
 @app.get("/health")
-def health():
-    return {"status": "ok"}
 @app.get("/", response_class=HTMLResponse)
 def chat_ui():
     return """
 <!DOCTYPE html>
-<html>
 <head>
 <meta charset="UTF-8">
 <meta name="viewport" content="width=device-width, initial-scale=1.0">
-<title>MTP - Asistente IA</title>
 <style>
 * { margin: 0; padding: 0; box-sizing: border-box; }
 body {
-    background: #0a0a0f;
     font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
     height: 100vh;
     display: flex;
     flex-direction: column;
 }
-.header {
     padding: 16px 20px;
-    background: #1a1a2e;
-    border-bottom: 1px solid #2a2a4a;
-    text-align: center;
 }
-.header h1 { color: white; font-size: 1.3rem; }
-.header p { color: #888; font-size: 0.75rem; margin-top: 4px; }
-.chat {
     flex: 1;
     overflow-y: auto;
-    padding: 16px;
     display: flex;
     flex-direction: column;
-    gap: 10px;
 }
 .message {
     display: flex;
-    gap: 8px;
-    max-width: 85%;
-    animation: fadeIn 0.2s ease;
 }
 @keyframes fadeIn {
     from { opacity: 0; transform: translateY(10px); }
     to { opacity: 1; transform: translateY(0); }
 }
-.message.user { align-self: flex-end; flex-direction: row-reverse; }
 .message-content {
-    padding: 8px 14px;
-    border-radius: 16px;
-    font-size: 0.9rem;
     line-height: 1.4;
     word-wrap: break-word;
 }
 .user .message-content {
-    background: #667eea;
     color: white;
-    border-radius: 16px 4px 16px 16px;
 }
 .bot .message-content {
-    background: #1e1e2e;
-    color: #e0e0e0;
-    border-radius: 4px 16px 16px 16px;
-    border: 1px solid #2a2a4a;
 }
-.input-area {
-    padding: 12px 16px;
-    background: #0f0f15;
-    border-top: 1px solid #1a1a2e;
 }
 .input-wrapper {
     display: flex;
-    gap: 10px;
     max-width: 800px;
     margin: 0 auto;
 }
-#input {
     flex: 1;
-    padding: 10px 14px;
-    background: #1a1a2e;
-    border: 1px solid #2a2a4a;
-    border-radius: 22px;
     color: white;
-    font-size: 0.9rem;
     outline: none;
 }
-#input:focus { border-color: #667eea; }
-#send {
-    padding: 10px 20px;
-    background: #667eea;
     border: none;
-    border-radius: 22px;
     color: white;
-    font-weight: 600;
     cursor: pointer;
 }
-#send:hover { opacity: 0.9; }
-#send:disabled { opacity: 0.5; cursor: not-allowed; }
 .typing {
     display: flex;
     gap: 4px;
-    padding: 8px 14px;
 }
 .typing span {
-    width: 6px;
-    height: 6px;
     background: #888;
     border-radius: 50%;
-    animation: bounce 1.4s infinite;
 }
 .typing span:nth-child(2) { animation-delay: -0.16s; }
-.typing span:nth-child(3) { animation-delay: -0.32s; }
 @keyframes bounce {
     0%, 80%, 100% { transform: scale(0); }
     40% { transform: scale(1); }
 }
-.dot {
-    display: inline-block;
-    width: 8px;
-    height: 8px;
-    background: #4ade80;
-    border-radius: 50%;
-    margin-right: 6px;
-    animation: pulse 2s infinite;
 }
-@keyframes pulse {
-    0%, 100% { opacity: 1; }
-    50% { opacity: 0.5; }
 }
 </style>
 </head>
 <body>
-<div class="header">
-    <h1><span class="dot"></span> MTP Assistant</h1>
-    <p>DialoGPT - Modelo conversacional real</p>
 </div>
-<div class="chat" id="chat">
     <div class="message bot">
-        <div class="message-content">¡Hola! Soy MTP, tu asistente. ¿En qué puedo ayudarte hoy?</div>
     </div>
 </div>
-<div class="input-area">
     <div class="input-wrapper">
-        <input type="text" id="input" placeholder="Escribe tu mensaje..." autocomplete="off">
-        <button id="send">Enviar</button>
     </div>
 </div>
 <script>
-const chat = document.getElementById('chat');
-const input = document.getElementById('input');
-const sendBtn = document.getElementById('send');
-let loading = false;
 function addMessage(text, isUser) {
     const div = document.createElement('div');
     div.className = `message ${isUser ? 'user' : 'bot'}`;
     div.innerHTML = `<div class="message-content">${escapeHtml(text)}</div>`;
-    chat.appendChild(div);
-    chat.scrollTop = chat.scrollHeight;
 }
 function escapeHtml(text) {
-    return text.replace(/</g, '&lt;').replace(/>/g, '&gt;');
 }
-function addTyping() {
     const div = document.createElement('div');
     div.className = 'message bot';
-    div.id = 'typing';
     div.innerHTML = `<div class="typing"><span></span><span></span><span></span></div>`;
-    chat.appendChild(div);
-    chat.scrollTop = chat.scrollHeight;
 }
-function removeTyping() {
-    const t = document.getElementById('typing');
-    if (t) t.remove();
 }
-async function send() {
-    const text = input.value.trim();
-    if (!text || loading) return;
-    input.value = '';
-    addMessage(text, true);
-    loading = true;
     sendBtn.disabled = true;
-    addTyping();
     try {
-        const res = await fetch('/generate', {
             method: 'POST',
             headers: { 'Content-Type': 'application/json' },
-            body: JSON.stringify({ text: text })
         });
-        const data = await res.json();
-        removeTyping();
-        addMessage(data.reply || "No pude generar respuesta.", false);
-    } catch (err) {
-        removeTyping();
-        addMessage("Error de conexión. Intenta de nuevo.", false);
     } finally {
-        loading = false;
         sendBtn.disabled = false;
-        input.focus();
     }
 }
-input.addEventListener('keypress', (e) => {
-    if (e.key === 'Enter') send();
 });
-sendBtn.addEventListener('click', send);
-input.focus();
 </script>
 </body>
 </html>
 """
 if __name__ == "__main__":
     port = int(os.environ.get("PORT", 7860))
-    print(f"\n🚀 Servidor: http://0.0.0.0:{port}")
-    uvicorn.run(app, host="0.0.0.0", port=port, log_level="warning")

 import os
+import sys
 import torch
+import json
+import time
+import gc
+import re
+from fastapi import FastAPI, Request
+from fastapi.responses import HTMLResponse, StreamingResponse
 from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, Field
+from huggingface_hub import snapshot_download
 import uvicorn
+import math
+import torch.nn as nn
+import torch.nn.functional as F
+import sentencepiece as spm
+# ======================
+# CONFIGURACIÓN DE DISPOSITIVO
+# ======================
# Select the compute device once at import time; the rest of the module reads DEVICE.
if torch.cuda.is_available():
    DEVICE = "cuda"
    print("✅ GPU NVIDIA detectada. Usando CUDA.")
else:
    DEVICE = "cpu"
    print("⚠️ GPU no detectada. Usando CPU (puede ser más lento).")
    # os.cpu_count() may return None when the core count cannot be determined;
    # fall back to a single core instead of failing on `None // 2`.
    torch.set_num_threads(max(1, (os.cpu_count() or 1) // 2))
# This process only runs inference, so autograd bookkeeping is switched off.
torch.set_grad_enabled(False)
+# CONFIGURACIÓN DEL MODELO - ACTUALIZADO A VERSIÓN 3.3.1
+MODEL_REPO = "TeszenAI/MTP-3.3.1"
+# ======================
+# FUNCIONES DE LIMPIEZA Y CONTROL DE CALIDAD
+# ======================
# Greetings recognised after lowercasing and stripping surrounding punctuation.
_GREETINGS = frozenset({
    "hola", "buenas", "saludos", "hey",
    "buenos días", "buenas tardes", "buenas noches",
    "que tal", "cómo estás", "como estas",
})

# Characters ignored around a greeting ("¡Hola!", "hola?", "¿Cómo estás?").
_GREETING_PUNCTUATION = "¡!¿?.,;: \t\n"


def _is_greeting(user_input: str) -> bool:
    """Return True when *user_input* is, modulo case and punctuation, a known greeting."""
    return (user_input or "").lower().strip(_GREETING_PUNCTUATION) in _GREETINGS


def clean_response(text: str, user_input: str = "") -> str:
    """Post-process raw model output into a reply suitable for display.

    Steps, in order:
      1. Collapse runs of the same word (case-insensitive) to at most three.
      2. Collapse runs of five or more identical characters to two.
      3. If *user_input* is a greeting, keep only the first sentence (or a
         word-boundary cut of at most 80 characters) and ensure it ends with
         sentence punctuation.
      4. Replace replies shorter than five characters with a canned fallback.
      5. Normalise whitespace.

    Args:
        text: Decoded model output.
        user_input: The user's message; only used to detect greetings.

    Returns:
        The cleaned reply, or "" when *text* is empty.
    """
    if not text:
        return ""

    # 1. Drop the 4th and later consecutive occurrences of the same word.
    kept_words = []
    previous = ""
    run_length = 0
    for word in text.split():
        if word.lower() == previous.lower():
            run_length += 1
            if run_length > 2:
                continue
        else:
            previous = word
            run_length = 0
        kept_words.append(word)
    text = " ".join(kept_words)

    # 2. "jaaaaaaa" -> "jaa"
    text = re.sub(r'(.)\1{4,}', r'\1\1', text)

    greeting = _is_greeting(user_input)
    if greeting and text:
        # 3. Greetings get a short answer: first sentence only.
        first_sentence = text.split('.')[0].strip()
        if 5 < len(first_sentence) < 100:
            text = first_sentence
        elif len(text) > 80:
            # Cut on a word boundary so the reply does not end mid-word.
            text = text[:80].rsplit(' ', 1)[0].rstrip(' ,;:')
        if text and text[-1] not in '.!?':
            text += '.'

    # 4. Too short to be useful: answer with a fixed fallback.
    if len(text.strip()) < 5:
        if greeting:
            return "¡Hola! ¿En qué puedo ayudarte?"
        return "Lo siento, no pude generar una respuesta clara. ¿Podrías reformular tu pregunta?"

    # 5. Collapse any remaining whitespace runs.
    return re.sub(r'\s+', ' ', text).strip()
+# ======================
+# DEFINIR ARQUITECTURA DEL MODELO (MTP V3.3.1)
+# ======================
class LayerNorm(nn.Module):
    """Hand-written layer normalisation over the last dimension.

    Unlike ``torch.nn.LayerNorm`` this divides by ``std + eps`` using
    ``Tensor.std`` with its default arguments, so the two are not numerically
    interchangeable. Presumably the checkpoint was trained with this exact
    formula — confirm before swapping implementations.
    """

    def __init__(self, d_model: int, eps: float = 1e-5):
        super().__init__()
        # Learnable per-feature scale and shift.
        self.weight = nn.Parameter(torch.ones(d_model))
        self.bias = nn.Parameter(torch.zeros(d_model))
        self.eps = eps

    def forward(self, x):
        """Normalise *x* along its last axis, then apply scale and shift."""
        centered = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True)
        # Same evaluation order as weight * centered / (std + eps) + bias.
        scaled = self.weight * centered
        return scaled / (spread + self.eps) + self.bias
class MultiHeadAttention(nn.Module):
    """Scaled dot-product self-attention split across ``n_heads`` heads."""

    def __init__(self, d_model: int, n_heads: int, dropout: float = 0.1):
        super().__init__()
        assert d_model % n_heads == 0
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads  # per-head feature width
        # Query / key / value / output projections.
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.scale = math.sqrt(self.d_k)

    def _split_heads(self, projected, batch_size, seq_len):
        """Reshape (batch, seq, d_model) into (batch, heads, seq, d_k)."""
        return projected.view(batch_size, seq_len, self.n_heads, self.d_k).transpose(1, 2)

    def forward(self, x, mask=None):
        """Attend over *x* of shape (batch, seq, d_model).

        Positions where ``mask == 0`` receive ``-inf`` scores before softmax.
        """
        batch_size, seq_len, _ = x.shape
        queries = self._split_heads(self.w_q(x), batch_size, seq_len)
        keys = self._split_heads(self.w_k(x), batch_size, seq_len)
        values = self._split_heads(self.w_v(x), batch_size, seq_len)

        scores = torch.matmul(queries, keys.transpose(-2, -1)) / self.scale
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))

        weights = self.dropout(F.softmax(scores, dim=-1))
        context = torch.matmul(weights, values)
        # Merge the heads back into a single d_model-wide feature axis.
        merged = context.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)
        return self.w_o(merged)
class FeedForward(nn.Module):
    """Position-wise two-layer MLP: Linear -> GELU -> Dropout -> Linear."""

    def __init__(self, d_model: int, d_ff: int, dropout: float = 0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)   # expand to the hidden width
        self.linear2 = nn.Linear(d_ff, d_model)   # project back to d_model
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the MLP independently to every position of *x*."""
        hidden = F.gelu(self.linear1(x))
        hidden = self.dropout(hidden)
        return self.linear2(hidden)
class TransformerBlock(nn.Module):
    """One Transformer layer: self-attention and feed-forward, each followed by
    a residual connection and a layer norm applied after the residual add."""

    def __init__(self, d_model: int, n_heads: int, d_ff: int, dropout: float = 0.1):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, n_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run the attention sub-layer, then the feed-forward sub-layer."""
        # Sub-layer 1: attention -> dropout -> residual -> norm.
        x = self.norm1(x + self.dropout1(self.attention(x, mask)))
        # Sub-layer 2: feed-forward -> dropout -> residual -> norm.
        x = self.norm2(x + self.dropout2(self.feed_forward(x)))
        return x
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to token embeddings.

    Even feature indices hold ``sin(pos * freq)`` and odd indices hold
    ``cos(pos * freq)``. The table is precomputed for ``max_len`` positions and
    stored as the buffer ``pe`` with shape (1, max_len, d_model).
    """

    def __init__(self, d_model: int, max_len: int = 5000):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer odd column than even column,
        # so drop the last frequency instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add positional information to *x* of shape (batch, seq, d_model).

        Raises:
            ValueError: if the sequence is longer than the precomputed table.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            # Without this check the addition below fails with an opaque
            # broadcasting error.
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(1)}"
            )
        return x + self.pe[:, :seq_len, :]
class MTPModel(nn.Module):
    """Decoder-only Transformer language model with a sampling ``generate``.

    The network is: token embedding (scaled by sqrt(d_model)) + sinusoidal
    positions -> ``n_layers`` Transformer blocks -> final layer norm -> linear
    head producing vocabulary logits.
    """

    def __init__(self, vocab_size: int, d_model: int = 256, n_heads: int = 8,
                 n_layers: int = 6, d_ff: int = 1024, dropout: float = 0.1, max_len: int = 512):
        super().__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.max_len = max_len
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        self.blocks = nn.ModuleList([
            TransformerBlock(d_model, n_heads, d_ff, dropout) for _ in range(n_layers)
        ])
        self.norm = LayerNorm(d_model)
        self.lm_head = nn.Linear(d_model, vocab_size)

    def forward(self, x, mask=None):
        """Return logits of shape (batch, seq, vocab_size) for token ids *x*.

        When *mask* is omitted a lower-triangular (causal) mask is used.
        """
        if mask is None:
            seq_len = x.size(1)
            # Build the mask directly on the input's device.
            mask = torch.tril(
                torch.ones(seq_len, seq_len, device=x.device)
            ).unsqueeze(0).unsqueeze(0)
        x = self.token_embedding(x) * math.sqrt(self.d_model)
        x = self.pos_encoding(x)
        for block in self.blocks:
            x = block(x, mask)
        x = self.norm(x)
        return self.lm_head(x)

    @staticmethod
    def _apply_repetition_penalty(logits, token_ids, penalty):
        """Make every id in *token_ids* less likely; modifies *logits* in place.

        Positive logits are divided by *penalty* and negative ones multiplied.
        Dividing a negative logit would move it toward zero and make the token
        more likely, which is the opposite of the intent.
        """
        if penalty == 1.0:
            return logits
        seen = torch.tensor(sorted(set(token_ids)), dtype=torch.long, device=logits.device)
        if seen.numel() == 0:
            return logits
        values = logits[seen]
        logits[seen] = torch.where(values > 0, values / penalty, values * penalty)
        return logits

    @staticmethod
    def _filter_top_k(logits, top_k):
        """Set everything below the k-th largest logit to -inf (in place)."""
        if top_k <= 0:
            return logits
        k = min(top_k, logits.size(-1))
        threshold = torch.topk(logits, k)[0][..., -1, None]
        logits[logits < threshold] = float('-inf')
        return logits

    @staticmethod
    def _filter_top_p(logits, top_p):
        """Nucleus filtering: keep the smallest top set whose mass exceeds *top_p*."""
        if top_p >= 1.0:
            return logits
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        remove = cumulative > top_p
        # Shift right so the token that crosses the threshold is kept, and
        # always keep the most likely token.
        remove[..., 1:] = remove[..., :-1].clone()
        remove[..., 0] = 0
        logits[sorted_indices[remove]] = float('-inf')
        return logits

    def generate(self, input_ids, max_new_tokens=150, temperature=0.7, top_k=50, top_p=0.9,
                 repetition_penalty=1.1, eos_id=3):
        """Sample a continuation of *input_ids* one token at a time.

        Args:
            input_ids: Long tensor of shape (1, prompt_len); only row 0 is sampled.
            max_new_tokens: Upper bound on the number of tokens appended.
            temperature: Logit divisor; must be > 0.
            top_k: Keep only the k most likely tokens (0 disables).
            top_p: Nucleus sampling threshold (1.0 disables).
            repetition_penalty: Penalty for ids already in the sequence
                (1.0 disables). The prompt tokens are included.
            eos_id: Token id that stops generation. The default of 3 is the
                value the original code hard-coded; presumably it matches the
                tokenizer's EOS id — confirm against ``sp.eos_id()``.

        Returns:
            Tensor of shape (1, prompt_len + n_generated); the EOS token itself
            is not appended.

        Raises:
            ValueError: if *temperature* is not positive.
        """
        if temperature <= 0:
            raise ValueError("temperature must be > 0")

        generated = input_ids
        for _ in range(max_new_tokens):
            with torch.no_grad():
                # The positional table only covers max_len positions, so feed
                # at most the last max_len tokens to avoid a shape error.
                context = generated[:, -self.max_len:]
                next_logits = self(context)[0, -1, :] / temperature
                next_logits = self._apply_repetition_penalty(
                    next_logits, generated[0].tolist(), repetition_penalty
                )
                next_logits = self._filter_top_k(next_logits, top_k)
                next_logits = self._filter_top_p(next_logits, top_p)
                probs = F.softmax(next_logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1).item()
            if next_token == eos_id:
                break
            generated = torch.cat(
                [generated, torch.tensor([[next_token]], device=generated.device)], dim=1
            )
        return generated
+# ======================
+# DESCARGA Y CARGA DEL MODELO
+# ======================
# Download the model repository (config, tokenizer and weights) at import time.
print(f"📦 Descargando modelo desde {MODEL_REPO}...")
repo_path = snapshot_download(
    repo_id=MODEL_REPO,
    repo_type="model",
    local_dir="mtp_repo"
)
# Load the architecture configuration, falling back to built-in defaults.
config_path = os.path.join(repo_path, "config.json")
if os.path.exists(config_path):
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(config_path, "r", encoding="utf-8") as f:
        config = json.load(f)
    print(f"✅ Configuración cargada: {config}")
else:
    # Default configuration for MTP V3.3.1.
    config = {
        "vocab_size": 4000,
        "d_model": 256,
        "n_heads": 8,
        "n_layers": 6,
        "d_ff": 1024,
        "dropout": 0.1,
        "max_len": 512
    }
    print(f"⚠️ Usando configuración por defecto: {config}")
# Load the SentencePiece tokenizer; the app cannot run without it.
tokenizer_path = os.path.join(repo_path, "mtp_tokenizer.model")
if not os.path.exists(tokenizer_path):
    print(f"❌ Tokenizador no encontrado en {tokenizer_path}")
    sys.exit(1)
sp = spm.SentencePieceProcessor()
sp.load(tokenizer_path)
VOCAB_SIZE = sp.get_piece_size()
print(f"✅ Tokenizador cargado. Vocabulario: {VOCAB_SIZE}")
# The tokenizer is the source of truth for the vocabulary size.
config["vocab_size"] = VOCAB_SIZE
print(f"\n🧠 Inicializando modelo MTP V3.3.1...")
print(f"   → Vocabulario: {VOCAB_SIZE}")
print(f"   → Dimensión: {config['d_model']}")
print(f"   → Capas: {config['n_layers']}")
print(f"   → Heads: {config['n_heads']}")
print(f"   → FFN dimensión: {config['d_ff']}")
print(f"   → Max length: {config['max_len']}")
model = MTPModel(**config)
model.to(DEVICE)
# Load the trained weights. A failure here is deliberately non-fatal: the
# server still starts, with randomly initialised weights.
model_path = os.path.join(repo_path, "mtp_model.pt")
if os.path.exists(model_path):
    try:
        # NOTE(security): torch.load unpickles the file. The checkpoint comes
        # from a remote repository, so consider weights_only=True if the
        # installed torch version and the checkpoint format allow it.
        state_dict = torch.load(model_path, map_location=DEVICE)
        # strict=False tolerates key mismatches, so inspect the result instead
        # of reporting success unconditionally.
        load_result = model.load_state_dict(state_dict, strict=False)
        if load_result.missing_keys or load_result.unexpected_keys:
            print(
                "⚠️ Pesos cargados parcialmente: "
                f"{len(load_result.missing_keys)} claves faltantes, "
                f"{len(load_result.unexpected_keys)} claves inesperadas"
            )
            if load_result.missing_keys:
                print(f"   Faltantes: {load_result.missing_keys[:10]}")
            if load_result.unexpected_keys:
                print(f"   Inesperadas: {load_result.unexpected_keys[:10]}")
        else:
            print("✅ Pesos del modelo cargados exitosamente")
    except Exception as e:
        print(f"⚠️ Error cargando pesos: {e}")
        print("   Continuando con pesos aleatorios...")
else:
    print(f"⚠️ No se encontró {model_path}, usando pesos aleatorios")
model.eval()
param_count = sum(p.numel() for p in model.parameters())
print(f"✅ Modelo listo: {param_count:,} parámetros ({param_count/1e6:.2f}M)")
+# ======================
+# API CONFIG
+# ======================
# FastAPI application; the metadata below is what the app is constructed with.
_API_METADATA = {
    "title": "MTP API V3.3.1",
    "description": "API para modelo de lenguaje MTP - Asistente IA entrenado desde cero",
    "version": "3.3.1",
}
app = FastAPI(**_API_METADATA)

# CORS is fully open: any origin, method and header is accepted.
_ALLOW_ANY = ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=list(_ALLOW_ANY),
    allow_methods=list(_ALLOW_ANY),
    allow_headers=list(_ALLOW_ANY),
)
class PromptRequest(BaseModel):
    # Request body for POST /generate. Every field except `text` is optional;
    # the bounds below are enforced by pydantic validation.
    text: str = Field(
        ...,
        max_length=2000,
        description="Texto de entrada",
    )
    max_tokens: int = Field(
        default=150,
        ge=10,
        le=300,
        description="Tokens máximos a generar",
    )
    temperature: float = Field(
        default=0.7,
        ge=0.1,
        le=2.0,
        description="Temperatura de muestreo",
    )
    top_k: int = Field(
        default=50,
        ge=1,
        le=100,
        description="Top-k sampling",
    )
    top_p: float = Field(
        default=0.9,
        ge=0.1,
        le=1.0,
        description="Top-p (nucleus) sampling",
    )
    repetition_penalty: float = Field(
        default=1.1,
        ge=1.0,
        le=2.0,
        description="Penalización por repetición",
    )
def build_prompt(user_input: str) -> str:
    """Wrap *user_input* in the instruction/response template fed to the model."""
    header = "### Instrucción:\n"
    footer = "\n\n### Respuesta:\n"
    return header + f"{user_input}" + footer
+# ======================
+# GESTIÓN DE CARGA
+# ======================
+ACTIVE_REQUESTS = 0
class MTPTokenizer:
    """Thin wrapper that forwards calls to a SentencePiece-style processor."""

    def __init__(self, sp_model):
        # The wrapped processor; every method below delegates to it.
        self.sp = sp_model

    def encode(self, text):
        """Return whatever the wrapped processor's ``encode`` yields for *text*."""
        processor = self.sp
        return processor.encode(text)

    def decode(self, tokens):
        """Return whatever the wrapped processor's ``decode`` yields for *tokens*."""
        processor = self.sp
        return processor.decode(tokens)

    def bos_id(self):
        """Beginning-of-sequence id as reported by the processor."""
        processor = self.sp
        return processor.bos_id()

    def eos_id(self):
        """End-of-sequence id as reported by the processor."""
        processor = self.sp
        return processor.eos_id()

    def pad_id(self):
        """Padding id as reported by the processor."""
        processor = self.sp
        return processor.pad_id()
+tokenizer_wrapper = MTPTokenizer(sp)
+# ======================
+# ENDPOINT PRINCIPAL
+# ======================
@app.post("/generate")
async def generate(req: PromptRequest):
    """Main text-generation endpoint.

    Builds the instruction prompt from ``req.text``, samples a continuation
    from the model, cleans it up and returns it as JSON.

    Returns a dict that always contains ``reply``. On success it also carries
    ``tokens_generated``, ``inference_time``, ``model`` and ``input_tokens``;
    on failure it carries ``error`` and ``model``.

    NOTE(review): ``model.generate`` is synchronous work inside an ``async``
    handler, so the event loop is blocked while a reply is being generated.
    NOTE(review): the raw exception text is returned to the client in
    ``error``; consider whether that should stay public.
    """
    global ACTIVE_REQUESTS
    user_input = req.text.strip()
    if not user_input:
        return {"reply": "", "tokens_generated": 0}

    # Greeting detection ignores case and surrounding punctuation, so
    # "¡Hola!" and "¿Cómo estás?" are treated like "hola" and "cómo estás".
    greetings = {
        "hola", "buenas", "saludos", "hey",
        "buenos días", "buenas tardes", "buenas noches",
        "que tal", "cómo estás", "como estas",
    }
    is_greeting = user_input.lower().strip("¡!¿?.,;: \t\n") in greetings

    # Greetings get a short token budget and a slightly higher temperature.
    if is_greeting:
        max_tokens = 30
        temperature = 0.8
    else:
        max_tokens = req.max_tokens
        temperature = req.temperature

    ACTIVE_REQUESTS += 1
    try:
        # Tokenisation happens inside the try so the counter is always
        # decremented, even if encoding fails.
        full_prompt = build_prompt(user_input)
        tokens = tokenizer_wrapper.encode(full_prompt)

        # The model can only attend over model.max_len positions. Refuse
        # prompts that leave no room and cap the budget for the rest.
        room = model.max_len - len(tokens)
        if room < 1:
            return {
                "reply": "Tu mensaje es demasiado largo. Por favor, intenta con un texto más corto.",
                "tokens_generated": 0,
                "model": "MTP-3.3.1",
                "input_tokens": len(tokens)
            }
        max_tokens = min(max_tokens, room)

        input_ids = torch.tensor([tokens], device=DEVICE)
        start_time = time.time()
        with torch.no_grad():
            output_ids = model.generate(
                input_ids,
                max_new_tokens=max_tokens,
                temperature=temperature,
                top_k=req.top_k,
                top_p=req.top_p,
                repetition_penalty=req.repetition_penalty
            )
        inference_time = time.time() - start_time
        # Keep only the newly generated tokens (drop the prompt).
        gen_tokens = output_ids[0, len(tokens):].tolist()
        # Drop ids outside the vocabulary and id 0 (treated as padding here).
        safe_tokens = [t for t in gen_tokens if 0 <= t < VOCAB_SIZE and t != 0]
        if safe_tokens:
            response = tokenizer_wrapper.decode(safe_tokens).strip()
        else:
            response = ""
        response = clean_response(response, user_input)
        # Still empty or too short: use a canned reply.
        if len(response) < 3:
            if is_greeting:
                response = "¡Hola! ¿En qué puedo ayudarte?"
            else:
                response = "Lo siento, no pude generar una respuesta clara. ¿Podrías reformular tu pregunta?"
        return {
            "reply": response,
            "tokens_generated": len(safe_tokens),
            "inference_time": round(inference_time, 3),
            "model": "MTP-3.3.1",
            "input_tokens": len(tokens)
        }
    except Exception as e:
        # Top-level boundary: log the failure and answer with a fallback
        # instead of letting the request fail.
        print(f"❌ Error durante generación: {e}")
        import traceback
        traceback.print_exc()
        if is_greeting:
            fallback = "¡Hola! ¿En qué puedo ayudarte?"
        else:
            fallback = "Lo siento, ocurrió un error al procesar tu solicitud. Intenta de nuevo."
        return {
            "reply": fallback,
            "error": str(e),
            "model": "MTP-3.3.1"
        }
    finally:
        ACTIVE_REQUESTS -= 1
        if DEVICE == "cuda":
            torch.cuda.empty_cache()
        gc.collect()
+# ======================
+# ENDPOINTS DE INFORMACIÓN
+# ======================
 @app.get("/health")
+def health_check():
+    return {
+        "status": "healthy",
+        "model": "MTP-3.3.1",
+        "device": DEVICE,
+        "active_requests": ACTIVE_REQUESTS,
+        "vocab_size": VOCAB_SIZE,
+        "total_params": param_count
+    }
@app.get("/info")
def model_info():
    """Describe the served model: name, version, architecture and size."""
    millions = round(param_count / 1e6, 2)
    info = dict(
        model_name="MTP",
        version="3.3.1",
        architecture=config,
        parameters=param_count,
        parameters_millions=millions,
        device=DEVICE,
        tokenizer_vocab=VOCAB_SIZE,
        repo=MODEL_REPO,
    )
    return info
+# ======================
+# INTERFAZ WEB MEJORADA
+# ======================
 @app.get("/", response_class=HTMLResponse)
 def chat_ui():
     return """
 <!DOCTYPE html>
+<html lang="es">
 <head>
 <meta charset="UTF-8">
 <meta name="viewport" content="width=device-width, initial-scale=1.0">
+<title>MTP V3.3.1 - Asistente IA</title>
 <style>
 * { margin: 0; padding: 0; box-sizing: border-box; }
 body {
+    background: linear-gradient(135deg, #0a0a0a 0%, #1a1a2e 100%);
     font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
     height: 100vh;
     display: flex;
     flex-direction: column;
 }
+.chat-header {
     padding: 16px 20px;
+    background: rgba(0, 0, 0, 0.7);
+    backdrop-filter: blur(10px);
+    border-bottom: 1px solid rgba(255,255,255,0.1);
 }
+.chat-header h1 {
+    color: white;
+    font-size: 1.3rem;
+    font-weight: 600;
+}
+.chat-header p {
+    color: #888;
+    font-size: 0.8rem;
+    margin-top: 4px;
+}
+.chat-messages {
     flex: 1;
     overflow-y: auto;
+    padding: 20px;
     display: flex;
     flex-direction: column;
+    gap: 16px;
 }
 .message {
     display: flex;
+    gap: 12px;
+    max-width: 80%;
+    animation: fadeIn 0.3s ease;
 }
 @keyframes fadeIn {
     from { opacity: 0; transform: translateY(10px); }
     to { opacity: 1; transform: translateY(0); }
 }
+.message.user {
+    align-self: flex-end;
+    flex-direction: row-reverse;
+}
 .message-content {
+    padding: 12px 18px;
+    border-radius: 20px;
+    font-size: 0.95rem;
     line-height: 1.4;
     word-wrap: break-word;
+    max-width: 100%;
 }
 .user .message-content {
+    background: linear-gradient(135deg, #667eea, #764ba2);
     color: white;
+    border-radius: 20px 4px 20px 20px;
 }
 .bot .message-content {
+    background: rgba(30, 30, 40, 0.9);
+    color: #e3e3e3;
+    border-radius: 4px 20px 20px 20px;
+    border: 1px solid rgba(255,255,255,0.05);
 }
+.chat-input-container {
+    padding: 16px 20px;
+    background: rgba(0, 0, 0, 0.7);
+    backdrop-filter: blur(10px);
+    border-top: 1px solid rgba(255,255,255,0.1);
 }
 .input-wrapper {
     display: flex;
+    gap: 12px;
     max-width: 800px;
     margin: 0 auto;
 }
+#messageInput {
     flex: 1;
+    padding: 12px 16px;
+    background: rgba(255,255,255,0.1);
+    border: 1px solid rgba(255,255,255,0.2);
+    border-radius: 24px;
     color: white;
+    font-size: 0.95rem;
     outline: none;
+    transition: all 0.2s;
 }
+#messageInput:focus {
+    border-color: #667eea;
+    background: rgba(255,255,255,0.15);
+}
+#messageInput::placeholder {
+    color: #888;
+}
+#sendBtn {
+    padding: 12px 24px;
+    background: linear-gradient(135deg, #667eea, #764ba2);
     border: none;
+    border-radius: 24px;
     color: white;
+    font-weight: 500;
     cursor: pointer;
+    transition: all 0.2s;
+}
+#sendBtn:hover {
+    transform: scale(1.02);
+    opacity: 0.9;
+}
+#sendBtn:disabled {
+    opacity: 0.5;
+    transform: none;
+    cursor: not-allowed;
 }
 .typing {
     display: flex;
     gap: 4px;
+    padding: 12px 18px;
 }
 .typing span {
+    width: 8px;
+    height: 8px;
     background: #888;
     border-radius: 50%;
+    animation: bounce 1.4s infinite ease-in-out;
 }
+.typing span:nth-child(1) { animation-delay: -0.32s; }
 .typing span:nth-child(2) { animation-delay: -0.16s; }
 @keyframes bounce {
     0%, 80%, 100% { transform: scale(0); }
     40% { transform: scale(1); }
 }
+.suggestions {
+    display: flex;
+    gap: 10px;
+    padding: 12px 20px;
+    overflow-x: auto;
+    background: rgba(0,0,0,0.3);
+}
+.suggestion {
+    padding: 6px 14px;
+    background: rgba(255,255,255,0.1);
+    border-radius: 20px;
+    color: #aaa;
+    font-size: 0.8rem;
+    cursor: pointer;
+    transition: all 0.2s;
+    white-space: nowrap;
+}
+.suggestion:hover {
+    background: linear-gradient(135deg, #667eea, #764ba2);
+    color: white;
 }
+.version-badge {
+    position: fixed;
+    bottom: 10px;
+    right: 10px;
+    background: rgba(0,0,0,0.5);
+    padding: 4px 10px;
+    border-radius: 20px;
+    font-size: 0.7rem;
+    color: #888;
+    font-family: monospace;
+}
+@media (max-width: 600px) {
+    .message { max-width: 95%; }
+    .suggestions { display: none; }
 }
 </style>
 </head>
 <body>
+<div class="chat-header">
+    <h1>🤖 MTP V3.3.1 - Mi Transformer Pequeño</h1>
+    <p>Asistente IA entrenado desde cero con arquitectura Transformer | 15M parámetros</p>
+</div>
+<div class="suggestions">
+    <div class="suggestion">Hola</div>
+    <div class="suggestion">¿Quién eres?</div>
+    <div class="suggestion">¿Qué puedes hacer?</div>
+    <div class="suggestion">Explícame la IA</div>
+    <div class="suggestion">Háblame de BTS</div>
+    <div class="suggestion">¿Qué es un agujero negro?</div>
+    <div class="suggestion">Dime un chiste</div>
+    <div class="suggestion">Adiós</div>
 </div>
+<div class="chat-messages" id="chatMessages">
     <div class="message bot">
+        <div class="message-content">✨ ¡Hola! Soy MTP versión 3.3.1, tu asistente de IA entrenado desde cero. Puedo hablar de ciencia, K-Pop (BTS, BLACKPINK), tecnología, filosofía y mucho más. ¿En qué puedo ayudarte hoy?</div>
     </div>
 </div>
+<div class="chat-input-container">
     <div class="input-wrapper">
+        <input type="text" id="messageInput" placeholder="Escribe tu mensaje aquí..." autocomplete="off">
+        <button id="sendBtn">Enviar</button>
     </div>
 </div>
+<div class="version-badge">MTP-3.3.1 | Transformer</div>
 <script>
+const chatMessages = document.getElementById('chatMessages');
+const messageInput = document.getElementById('messageInput');
+const sendBtn = document.getElementById('sendBtn');
+let isLoading = false;
 function addMessage(text, isUser) {
     const div = document.createElement('div');
     div.className = `message ${isUser ? 'user' : 'bot'}`;
     div.innerHTML = `<div class="message-content">${escapeHtml(text)}</div>`;
+    chatMessages.appendChild(div);
+    chatMessages.scrollTop = chatMessages.scrollHeight;
+    return div;
 }
 function escapeHtml(text) {
+    const div = document.createElement('div');
+    div.textContent = text;
+    return div.innerHTML;
 }
+function addTypingIndicator() {
     const div = document.createElement('div');
     div.className = 'message bot';
+    div.id = 'typingIndicator';
     div.innerHTML = `<div class="typing"><span></span><span></span><span></span></div>`;
+    chatMessages.appendChild(div);
+    chatMessages.scrollTop = chatMessages.scrollHeight;
 }
+function removeTypingIndicator() {
+    const indicator = document.getElementById('typingIndicator');
+    if (indicator) indicator.remove();
 }
+async function sendMessage(text = null) {
+    const messageText = text || messageInput.value.trim();
+    if (!messageText || isLoading) return;
+    if (!text) messageInput.value = '';
+    addMessage(messageText, true);
+    isLoading = true;
     sendBtn.disabled = true;
+    addTypingIndicator();
     try {
+        const response = await fetch('/generate', {
             method: 'POST',
             headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({ text: messageText })
         });
+        const data = await response.json();
+        removeTypingIndicator();
+        addMessage(data.reply, false);
+    } catch (error) {
+        removeTypingIndicator();
+        addMessage('⚠️ Error de conexión. Por favor, intenta de nuevo.', false);
     } finally {
+        isLoading = false;
         sendBtn.disabled = false;
+        messageInput.focus();
     }
 }
+messageInput.addEventListener('keypress', (e) => {
+    if (e.key === 'Enter') sendMessage();
 });
+sendBtn.addEventListener('click', () => sendMessage());
+document.querySelectorAll('.suggestion').forEach(el => {
+    el.addEventListener('click', () => sendMessage(el.textContent));
+});
+messageInput.focus();
 </script>
 </body>
 </html>
 """
+# ======================
+# MAIN
+# ======================
 if __name__ == "__main__":
     port = int(os.environ.get("PORT", 7860))
+    print("\n" + "=" * 60)
+    print(f"🚀 Iniciando servidor MTP V3.3.1 en puerto {port}...")
+    print(f"🌐 Interfaz web: http://0.0.0.0:{port}")
+    print(f"📡 API docs: http://0.0.0.0:{port}/docs")
+    print(f"❤️ Health check: http://0.0.0.0:{port}/health")
+    print("=" * 60)
+    uvicorn.run(
+        app,
+        host="0.0.0.0",
+        port=port,
+        log_level="info"
+    )