Spaces:

TeszenAI
/

MTP-3.1.2

Sleeping

App Files Files Community

teszenofficial committed on Apr 9

Commit

facf38f

verified ·

1 Parent(s): c4b6ca5

Upload app.py

Browse files

Files changed (1) hide show

app.py +931 -0

app.py ADDED Viewed

	@@ -0,0 +1,931 @@

+import os
+import sys
+import torch
+import json
+import time
+import gc
+import re
+from fastapi import FastAPI, Request
+from fastapi.responses import HTMLResponse, StreamingResponse
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, Field
+from huggingface_hub import snapshot_download
+import uvicorn
+import math
+import torch.nn as nn
+import torch.nn.functional as F
+import sentencepiece as spm
+# ======================
+# CONFIGURACIÓN DE DISPOSITIVO
+# ======================
# Select the compute device once at import time; the rest of the file reads DEVICE.
if torch.cuda.is_available():
    DEVICE = "cuda"
    print("✅ GPU NVIDIA detectada. Usando CUDA.")
else:
    DEVICE = "cpu"
    print("⚠️ GPU no detectada. Usando CPU (puede ser más lento).")
if DEVICE == "cpu":
    # os.cpu_count() may return None when the core count cannot be determined;
    # fall back to 1 instead of raising TypeError on `None // 2`.
    torch.set_num_threads(max(1, (os.cpu_count() or 1) // 2))
# No training happens in this file, so autograd is switched off.
torch.set_grad_enabled(False)
# Hugging Face repository that holds the config, tokenizer and weights.
MODEL_REPO = "TeszenAI/MTP-3.1.1"  # <-- change this to your own repo
+# ======================
+# FUNCIONES DE LIMPIEZA Y CONTROL DE CALIDAD
+# ======================
# Patterns are compiled once at import time; clean_response runs on every request.
# Runs of 5+ identical NON-digit characters ("holaaaaa", "!!!!!!"). Digits are
# excluded on purpose so numbers such as 1000000 are not corrupted.
_REPEATED_CHAR_RE = re.compile(r'(\D)\1{4,}')
_DISALLOWED_CHARS_RE = re.compile(r'[^a-zA-ZáéíóúñüÁÉÍÓÚÑÜ0-9\s.,;:!?¿¡()\-"]+')
_CLOSING_QUESTION_RE = re.compile(
    r'(¿algo más\?|¿necesitas algo más\?|¿en qué más puedo ayudarte\?)',
    re.IGNORECASE,
)
_WHITESPACE_RE = re.compile(r'\s+')
_MAX_CONSECUTIVE_WORD_REPEATS = 3
_MIN_REPLY_CHARS = 10
_FALLBACK_REPLY = "Lo siento, no pude generar una respuesta clara. ¿Podrías reformular tu pregunta?"


def clean_response(text: str) -> str:
    """Post-process raw model output into a presentable reply.

    Steps, in order:
      1. Drop a word once it has appeared more than three times in a row.
      2. Collapse runs of five or more identical non-digit characters to two
         and strip characters outside the allowed Spanish/ASCII set.
      3. Truncate right after the first closing question ("¿necesitas algo
         más?" and similar) so trailing text after the sign-off is discarded.
      4. Return a fixed apology when fewer than 10 characters remain.
      5. Normalise whitespace.

    Args:
        text: Decoded model output.

    Returns:
        The cleaned reply, "" for empty input, or the fallback apology when
        the cleaned text is too short.
    """
    if not text:
        return ""

    # 1. Limit consecutive repetitions of the same word.
    kept_words = []
    previous_word = None
    run_length = 0
    for word in text.split():
        run_length = run_length + 1 if word == previous_word else 1
        previous_word = word
        if run_length <= _MAX_CONSECUTIVE_WORD_REPEATS:
            kept_words.append(word)
    text = " ".join(kept_words)

    # 2. Remove character-level noise.
    text = _REPEATED_CHAR_RE.sub(r'\1\1', text)
    text = _DISALLOWED_CHARS_RE.sub('', text)

    # 3. Cut after the first closing question. The previous implementation
    #    tried end-anchored patterns first and stopped at the first hit; those
    #    can only match at the very end (truncating nothing), so this cut was
    #    skipped whenever the text ended with punctuation.
    closing = _CLOSING_QUESTION_RE.search(text)
    if closing:
        text = text[:closing.end()]

    # 4. Too little content left to be a useful answer.
    if len(text.strip()) < _MIN_REPLY_CHARS:
        return _FALLBACK_REPLY

    # 5. Collapse whitespace.
    return _WHITESPACE_RE.sub(' ', text).strip()
_SENTENCE_END_RE = re.compile(r'[.!?]$')
# Last sentence of the text, including its terminal punctuation.
_LAST_SENTENCE_RE = re.compile(r'([^.!?]+[.!?]+)$')
_STOP_SIGNAL_RES = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'(gracias por tu pregunta|espero haberte ayudado|¿necesitas algo más\?)',
        r'(hasta luego|adiós|quedo atento|saludos cordiales)',
        r'(fin del mensaje|fin de la conversación)',
    )
)


def should_stop_generation(generated_text: str, min_length: int = 30, max_length: int = 300) -> bool:
    """Decide whether text generated so far looks like a finished reply.

    Args:
        generated_text: Decoded text produced so far.
        min_length: Texts shorter than this that do not end in '.', '!' or '?'
            never stop; a "complete sentence" stop also needs more than this
            many characters.
        max_length: Texts longer than this always stop.

    Returns:
        True when generation should stop, False otherwise.
    """
    # Hard cap on length.
    if len(generated_text) > max_length:
        return True

    # Trailing whitespace must not hide terminal punctuation.
    text = generated_text.rstrip()
    ends_sentence = bool(_SENTENCE_END_RE.search(text))

    # Too short and visibly unfinished: keep going.
    if len(generated_text) < min_length and not ends_sentence:
        return False

    # Explicit sign-off phrases.
    if any(signal.search(generated_text) for signal in _STOP_SIGNAL_RES):
        return True

    # The text ends on a complete sentence and there is enough content.
    # (Splitting on '.' as before made it impossible for the last fragment to
    # end in a period, so period-terminated replies were never detected.)
    if ends_sentence and len(generated_text) > min_length:
        last_sentence = _LAST_SENTENCE_RE.search(text)
        if last_sentence and len(last_sentence.group(1).strip()) > 5:
            return True
    return False
+# ======================
+# DEFINIR ARQUITECTURA DEL MODELO (MTP)
+# ======================
class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with learnable scale and shift.

    Uses ``Tensor.std`` (PyTorch's default estimator) and adds ``eps`` to the
    standard deviation itself rather than to the variance.
    """

    def __init__(self, d_model: int, eps: float = 1e-5):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(d_model))
        self.bias = nn.Parameter(torch.zeros(d_model))
        self.eps = eps

    def forward(self, x):
        # Statistics are taken per position, across the feature dimension.
        centered = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.eps
        return self.weight * centered / spread + self.bias
class MultiHeadAttention(nn.Module):
    """Scaled dot-product self-attention split across ``n_heads`` heads."""

    def __init__(self, d_model: int, n_heads: int, dropout: float = 0.1):
        super().__init__()
        assert d_model % n_heads == 0
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads
        # Query / key / value projections followed by the output projection.
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.scale = math.sqrt(self.d_k)

    def forward(self, x, mask=None):
        """Attend over ``x`` of shape (batch, seq, d_model).

        Positions where ``mask == 0`` are excluded from attention.
        """
        batch_size, seq_len, _ = x.shape

        def split_heads(projected):
            # (batch, seq, d_model) -> (batch, heads, seq, d_k)
            return projected.view(batch_size, seq_len, self.n_heads, self.d_k).transpose(1, 2)

        queries = split_heads(self.w_q(x))
        keys = split_heads(self.w_k(x))
        values = split_heads(self.w_v(x))

        scores = torch.matmul(queries, keys.transpose(-2, -1)) / self.scale
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))
        weights = self.dropout(F.softmax(scores, dim=-1))

        # Merge the heads back: (batch, heads, seq, d_k) -> (batch, seq, d_model)
        merged = torch.matmul(weights, values).transpose(1, 2).contiguous()
        merged = merged.view(batch_size, seq_len, self.d_model)
        return self.w_o(merged)
class FeedForward(nn.Module):
    """Position-wise MLP: Linear -> GELU -> Dropout -> Linear."""

    def __init__(self, d_model: int, d_ff: int, dropout: float = 0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        hidden = F.gelu(self.linear1(x))
        hidden = self.dropout(hidden)
        return self.linear2(hidden)
class TransformerBlock(nn.Module):
    """One post-norm transformer layer: attention and MLP, each followed by
    a residual add and a LayerNorm."""

    def __init__(self, d_model: int, n_heads: int, d_ff: int, dropout: float = 0.1):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, n_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # Attention sub-layer: residual connection, then normalisation.
        x = self.norm1(x + self.dropout1(self.attention(x, mask)))
        # Feed-forward sub-layer: residual connection, then normalisation.
        x = self.norm2(x + self.dropout2(self.feed_forward(x)))
        return x
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to token embeddings.

    Even feature indices receive ``sin(pos * w_i)`` and odd indices
    ``cos(pos * w_i)``. The table is stored in the ``pe`` buffer with shape
    (1, max_len, d_model).
    """

    def __init__(self, d_model: int, max_len: int = 5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so the cosine part must be trimmed to fit.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add positional information to ``x`` of shape (batch, seq, d_model).

        Raises:
            ValueError: if the sequence is longer than the precomputed table.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"sequence length {seq_len} exceeds positional table size {self.pe.size(1)}"
            )
        return x + self.pe[:, :seq_len, :]
class MTPModel(nn.Module):
    """Decoder-only transformer language model (embedding, sinusoidal
    positions, a stack of TransformerBlocks, final norm and LM head)."""

    def __init__(self, vocab_size: int, d_model: int = 256, n_heads: int = 8,
                 n_layers: int = 6, d_ff: int = 1024, dropout: float = 0.1, max_len: int = 512):
        super().__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.max_len = max_len
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        self.blocks = nn.ModuleList([
            TransformerBlock(d_model, n_heads, d_ff, dropout) for _ in range(n_layers)
        ])
        self.norm = LayerNorm(d_model)
        self.lm_head = nn.Linear(d_model, vocab_size)

    def forward(self, x, mask=None):
        """Return next-token logits of shape (batch, seq, vocab_size).

        Args:
            x: Token ids of shape (batch, seq).
            mask: Optional attention mask (0 = blocked). A causal
                lower-triangular mask is built when omitted.
        """
        if mask is None:
            seq_len = x.size(1)
            # Build the causal mask directly on the input's device.
            mask = torch.tril(torch.ones(seq_len, seq_len, device=x.device)).unsqueeze(0).unsqueeze(0)
        x = self.token_embedding(x) * math.sqrt(self.d_model)
        x = self.pos_encoding(x)
        for block in self.blocks:
            x = block(x, mask)
        x = self.norm(x)
        return self.lm_head(x)

    def generate(self, input_ids, max_new_tokens=150, temperature=0.8, top_k=50, top_p=0.9,
                 repetition_penalty=1.1, eos_token_id=3, stop_fn=None):
        """Autoregressively sample up to ``max_new_tokens`` tokens.

        Args:
            input_ids: Prompt token ids, shape (1, prompt_len).
            max_new_tokens: Upper bound on sampled tokens.
            temperature: Logits are divided by this before sampling.
            top_k: Keep only the k most likely tokens (0 disables).
            top_p: Nucleus sampling threshold (1.0 disables).
            repetition_penalty: Values > 1 make already-seen tokens less likely.
            eos_token_id: Sampling this id ends generation; the id itself is
                not appended. Defaults to 3, the value previously hard-coded.
            stop_fn: Optional callable taking the list of newly generated
                token ids and returning True to stop early. It is consulted
                every 10 steps once more than 10 steps have run. The model has
                no tokenizer, so any text-based stopping rule must be supplied
                by the caller through this hook.

        Returns:
            Tensor of shape (1, prompt_len + n_generated): prompt followed by
            the sampled tokens.
        """
        generated = input_ids
        prompt_len = input_ids.size(1)
        for step in range(max_new_tokens):
            # The positional table only covers max_len positions, so feed the
            # model at most the last max_len tokens.
            context = generated[:, -self.max_len:]
            with torch.no_grad():
                logits = self(context)
                next_logits = logits[0, -1, :] / temperature

            if repetition_penalty != 1.0:
                seen = torch.unique(generated[0])
                seen_logits = next_logits[seen]
                # Dividing a negative logit would raise its probability, so
                # negative logits are multiplied instead.
                next_logits[seen] = torch.where(
                    seen_logits < 0,
                    seen_logits * repetition_penalty,
                    seen_logits / repetition_penalty,
                )

            if top_k > 0:
                # topk() raises if k exceeds the vocabulary size.
                k = min(top_k, next_logits.size(-1))
                kth_value = torch.topk(next_logits, k)[0][..., -1, None]
                next_logits[next_logits < kth_value] = float('-inf')

            if top_p < 1.0:
                sorted_logits, sorted_indices = torch.sort(next_logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
                sorted_indices_to_remove = cumulative_probs > top_p
                # Shift right so the first token crossing the threshold is kept.
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                sorted_indices_to_remove[..., 0] = 0
                next_logits[sorted_indices[sorted_indices_to_remove]] = float('-inf')

            probs = F.softmax(next_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1).item()
            if next_token == eos_token_id:
                break
            generated = torch.cat(
                [generated, torch.tensor([[next_token]], device=generated.device)], dim=1
            )

            # Periodic caller-supplied stop check. The earlier version ran a
            # text heuristic on str(list_of_ids), which only measured the
            # length of the id repr and ended generation prematurely.
            if stop_fn is not None and step > 10 and step % 10 == 0:
                if stop_fn(generated[0, prompt_len:].tolist()):
                    break
        return generated
+# ======================
+# DESCARGA Y CARGA DEL MODELO
+# ======================
print(f"📦 Descargando modelo desde {MODEL_REPO}...")
repo_path = snapshot_download(
    repo_id=MODEL_REPO,
    repo_type="model",
    local_dir="mtp_repo"
)

# Load configuration. Defaults cover every MTPModel constructor argument, so a
# partial config.json still yields a complete architecture description.
_DEFAULT_MODEL_CONFIG = {
    "vocab_size": 5000,
    "d_model": 256,
    "n_heads": 8,
    "n_layers": 6,
    "d_ff": 1024,
    "dropout": 0.1,
    "max_len": 512
}
config = dict(_DEFAULT_MODEL_CONFIG)
config_path = os.path.join(repo_path, "config.json")
if os.path.exists(config_path):
    with open(config_path, "r", encoding="utf-8") as f:
        config.update(json.load(f))

# Load tokenizer
tokenizer_path = os.path.join(repo_path, "mtp_tokenizer.model")
sp = spm.SentencePieceProcessor()
sp.load(tokenizer_path)
VOCAB_SIZE = sp.get_piece_size()

# The tokenizer is the source of truth for the vocabulary size.
config["vocab_size"] = VOCAB_SIZE

print("🧠 Inicializando modelo MTP...")
print(f"   → Vocabulario: {VOCAB_SIZE}")
print(f"   → Dimensión: {config['d_model']}")
print(f"   → Capas: {config['n_layers']}")
print(f"   → Heads: {config['n_heads']}")

# Pass only the keys MTPModel accepts; extra metadata in config.json would
# otherwise raise TypeError.
model = MTPModel(**{key: config[key] for key in _DEFAULT_MODEL_CONFIG})
model.to(DEVICE)

# Load model weights
model_path = os.path.join(repo_path, "mtp_model.pt")
if os.path.exists(model_path):
    # NOTE(security): torch.load unpickles the file. The checkpoint comes from
    # a downloaded repo; consider weights_only=True if the installed torch
    # version supports it and the file holds a plain state_dict.
    state_dict = torch.load(model_path, map_location=DEVICE)
    model.load_state_dict(state_dict)
    print("✅ Pesos del modelo cargados")
else:
    print("⚠️ No se encontró mtp_model.pt, usando pesos aleatorios")
model.eval()

# Count parameters BEFORE quantization: dynamically quantized Linear layers no
# longer expose their weights through model.parameters(), which would make the
# reported size far too small on CPU.
param_count = sum(p.numel() for p in model.parameters())

# Dynamic quantization for CPU
if DEVICE == "cpu":
    print("⚡ Aplicando cuantización dinámica para CPU...")
    try:
        model = torch.quantization.quantize_dynamic(
            model,
            {nn.Linear},
            dtype=torch.qint8
        )
    except Exception as e:
        print(f"⚠️ No se pudo aplicar cuantización: {e}")

print(f"✅ Modelo cargado: {param_count:,} parámetros ({param_count/1e6:.1f}M)")
+# ======================
+# API CONFIG
+# ======================
# FastAPI application object; title, description and version are passed to FastAPI.
app = FastAPI(title="MTP API", description="API para modelo de lenguaje MTP", version="1.0")

# CORS: any origin may call the API with any method and any header.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_origins=["*"],
)
class PromptRequest(BaseModel):
    # Request body accepted by POST /generate. Bounds are enforced by pydantic
    # before the handler runs.

    # User message; required, at most 2000 characters.
    text: str = Field(..., description="Texto de entrada", max_length=2000)
    # Upper bound on newly generated tokens.
    max_tokens: int = Field(
        default=150, description="Tokens máximos a generar", ge=10, le=300
    )
    # Sampling temperature.
    temperature: float = Field(
        default=0.7, description="Temperatura de muestreo", ge=0.1, le=2.0
    )
    # Number of highest-probability tokens kept for sampling.
    top_k: int = Field(default=50, description="Top-k sampling", ge=1, le=100)
    # Cumulative-probability cutoff for nucleus sampling.
    top_p: float = Field(
        default=0.9, description="Top-p (nucleus) sampling", ge=0.1, le=1.0
    )
    # Penalty applied to tokens that already appeared.
    repetition_penalty: float = Field(
        default=1.1, description="Penalización por repetición", ge=1.0, le=2.0
    )
def build_prompt(user_input: str) -> str:
    """Wrap the user's message in the instruction/response template, ending
    right where the answer is expected to start."""
    template = "### Instrucción:\n{}\n\n### Respuesta:\n"
    return template.format(user_input)
# ======================
# LOAD MANAGEMENT
# ======================
# Number of /generate requests currently being handled; incremented and
# decremented by the endpoint and reported by /health.
ACTIVE_REQUESTS = 0
class MTPTokenizer:
    """Thin wrapper around a SentencePiece processor.

    Every method delegates directly to the wrapped object.
    """
    def __init__(self, sp_model):
        # sp_model: the loaded SentencePieceProcessor to delegate to.
        self.sp = sp_model
    def encode(self, text):
        """Return the wrapped processor's encoding of ``text``."""
        return self.sp.encode(text)
    def decode(self, tokens):
        """Return the wrapped processor's decoding of ``tokens``."""
        return self.sp.decode(tokens)
    def bos_id(self):
        """Return the beginning-of-sequence id reported by the processor."""
        return self.sp.bos_id()
    def eos_id(self):
        """Return the end-of-sequence id reported by the processor."""
        return self.sp.eos_id()
    def pad_id(self):
        """Return the padding id reported by the processor."""
        return self.sp.pad_id()
# Module-level tokenizer instance used by the /generate endpoint.
tokenizer_wrapper = MTPTokenizer(sp)
@app.post("/generate")
async def generate(req: PromptRequest):
    """Main text-generation endpoint.

    Builds the prompt, samples a continuation from the model and returns the
    cleaned reply.

    Returns:
        ``{"reply", "tokens_generated", "model"}`` on success,
        ``{"reply": "", "tokens_generated": 0}`` for blank input, or
        ``{"reply", "error"}`` when generation fails.
    """
    global ACTIVE_REQUESTS
    # Local import from a package this file already depends on.
    from fastapi.concurrency import run_in_threadpool

    ACTIVE_REQUESTS += 1
    # Everything after the increment lives inside try/finally so the counter
    # is always restored, even when tokenization raises.
    try:
        dyn_max_tokens = req.max_tokens
        dyn_temperature = req.temperature
        if ACTIVE_REQUESTS > 2:
            print(f"⚠️ Carga alta ({ACTIVE_REQUESTS} requests). Ajustando parámetros.")
            dyn_max_tokens = min(dyn_max_tokens, 120)
            dyn_temperature = max(0.5, dyn_temperature * 0.9)

        user_input = req.text.strip()
        if not user_input:
            return {"reply": "", "tokens_generated": 0}

        tokens = tokenizer_wrapper.encode(build_prompt(user_input))
        bos_id = tokenizer_wrapper.bos_id()
        # SentencePiece reports -1 for a disabled BOS token; never feed that
        # to the embedding layer.
        if bos_id >= 0:
            tokens = [bos_id] + tokens
        input_ids = torch.tensor([tokens], device=DEVICE)

        def _run_model():
            # Autograd mode is per-thread in PyTorch, so it is disabled
            # explicitly inside the worker thread.
            with torch.no_grad():
                return model.generate(
                    input_ids,
                    max_new_tokens=dyn_max_tokens,
                    temperature=dyn_temperature,
                    top_k=req.top_k,
                    top_p=req.top_p,
                    repetition_penalty=req.repetition_penalty
                )

        # Inference is blocking CPU/GPU work. Running it directly inside this
        # coroutine would stall the event loop (including /health) and
        # serialize all requests, so ACTIVE_REQUESTS could never exceed 1.
        output_ids = await run_in_threadpool(_run_model)

        gen_tokens = output_ids[0, len(tokens):].tolist()
        eos_id = tokenizer_wrapper.eos_id()
        safe_tokens = [
            t for t in gen_tokens
            if 0 <= t < VOCAB_SIZE and t != eos_id
        ]
        response = tokenizer_wrapper.decode(safe_tokens).strip()
        # Drop anything after the model starts a new "###" section.
        if "###" in response:
            response = response.split("###")[0].strip()
        response = clean_response(response)
        return {
            "reply": response,
            "tokens_generated": len(safe_tokens),
            "model": "MTP"
        }
    except Exception as e:
        print(f"❌ Error durante generación: {e}")
        return {
            "reply": "Lo siento, ocurrió un error al procesar tu solicitud.",
            "error": str(e)
        }
    finally:
        ACTIVE_REQUESTS -= 1
        if DEVICE == "cuda":
            torch.cuda.empty_cache()
        gc.collect()
+# ======================
+# ENDPOINTS DE INFORMACIÓN
+# ======================
@app.get("/health")
def health_check():
    """Report a static healthy status plus device, in-flight request count and
    vocabulary size."""
    status = {"status": "healthy", "model": "MTP"}
    status["device"] = DEVICE
    status["active_requests"] = ACTIVE_REQUESTS
    status["vocab_size"] = VOCAB_SIZE
    return status
@app.get("/info")
def model_info():
    """Describe the loaded model: name, version, architecture config,
    parameter count and device."""
    total_parameters = 0
    for parameter in model.parameters():
        total_parameters += parameter.numel()
    return {
        "model_name": "MTP",
        "version": "1.0",
        "architecture": config,
        "parameters": total_parameters,
        "device": DEVICE,
    }
+# ======================
+# INTERFAZ WEB (MODERNA CON LOGO INTEGRADO)
+# ======================
@app.get("/", response_class=HTMLResponse)
def chat_ui():
    """Serve the single-page chat interface.

    Returns one static HTML document (inline CSS and JavaScript). Its script
    POSTs the user's message as JSON to ``/generate`` and renders the
    ``reply`` field of the response with a typewriter effect.
    """
    return """
<!DOCTYPE html>
<html lang="es">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no">
<title>MTP - Asistente IA</title>
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600&display=swap" rel="stylesheet">
<style>
:root {
    --bg-color: #131314;
    --surface-color: #1E1F20;
    --accent-color: #4a9eff;
    --text-primary: #e3e3e3;
    --text-secondary: #9aa0a6;
    --user-bubble: #282a2c;
}
* { box-sizing: border-box; outline: none; -webkit-tap-highlight-color: transparent; }
body {
    margin: 0;
    background-color: var(--bg-color);
    font-family: 'Inter', sans-serif;
    color: var(--text-primary);
    height: 100dvh;
    display: flex;
    flex-direction: column;
    overflow: hidden;
}
header {
    padding: 12px 20px;
    display: flex;
    align-items: center;
    justify-content: space-between;
    background: rgba(19, 19, 20, 0.85);
    backdrop-filter: blur(12px);
    position: fixed;
    top: 0;
    width: 100%;
    z-index: 50;
    border-bottom: 1px solid rgba(255,255,255,0.05);
}
.brand-wrapper {
    display: flex;
    align-items: center;
    gap: 12px;
    cursor: pointer;
}
.brand-logo {
    width: 32px;
    height: 32px;
    border-radius: 50%;
    background-image: url('https://i.postimg.cc/c4BRhSnR/8F838209-6DD9-4E1C-96BB-621EC3B78E68.png');
    background-size: cover;
    background-position: center;
    background-repeat: no-repeat;
    border: 1px solid rgba(255,255,255,0.1);
}
.brand-text {
    font-weight: 500;
    font-size: 1.05rem;
    display: flex;
    align-items: center;
    gap: 8px;
}
.version-badge {
    font-size: 0.75rem;
    background: rgba(74, 158, 255, 0.15);
    color: #8ab4f8;
    padding: 2px 8px;
    border-radius: 12px;
    font-weight: 600;
}
.chat-scroll {
    flex: 1;
    overflow-y: auto;
    padding: 80px 20px 40px 20px;
    display: flex;
    flex-direction: column;
    gap: 30px;
    max-width: 850px;
    margin: 0 auto;
    width: 100%;
    scroll-behavior: smooth;
}
.msg-row {
    display: flex;
    gap: 16px;
    width: 100%;
    opacity: 0;
    transform: translateY(10px);
    animation: slideUpFade 0.4s cubic-bezier(0.2, 0.8, 0.2, 1) forwards;
}
.msg-row.user { justify-content: flex-end; }
.msg-row.bot { justify-content: flex-start; align-items: flex-start; }
.msg-content {
    line-height: 1.6;
    font-size: 1rem;
    word-wrap: break-word;
    max-width: 85%;
}
.user .msg-content {
    background-color: var(--user-bubble);
    padding: 10px 18px;
    border-radius: 18px;
    border-top-right-radius: 4px;
    color: #fff;
}
.bot .msg-content-wrapper {
    display: flex;
    flex-direction: column;
    gap: 8px;
    width: 100%;
}
.bot .msg-text {
    padding-top: 6px;
    color: var(--text-primary);
}
.bot-avatar {
    width: 34px;
    height: 34px;
    min-width: 34px;
    border-radius: 50%;
    background-image: url('https://i.postimg.cc/c4BRhSnR/8F838209-6DD9-4E1C-96BB-621EC3B78E68.png');
    background-size: cover;
    background-position: center;
    background-repeat: no-repeat;
    box-shadow: 0 2px 6px rgba(0,0,0,0.2);
}
.bot-actions {
    display: flex;
    gap: 10px;
    opacity: 0;
    transition: opacity 0.3s;
    margin-top: 5px;
}
.action-btn {
    background: transparent;
    border: none;
    color: var(--text-secondary);
    cursor: pointer;
    padding: 4px;
    border-radius: 4px;
    display: flex;
    align-items: center;
    transition: color 0.2s, background 0.2s;
}
.action-btn:hover {
    color: var(--text-primary);
    background: rgba(255,255,255,0.08);
}
.action-btn svg { width: 16px; height: 16px; fill: currentColor; }
.typing-cursor::after {
    content: '▊';
    display: inline-block;
    margin-left: 2px;
    animation: blink 1s infinite;
}
.footer-container {
    padding: 0 20px 20px 20px;
    background: linear-gradient(to top, var(--bg-color) 85%, transparent);
    position: relative;
    z-index: 60;
}
.input-box {
    max-width: 850px;
    margin: 0 auto;
    background: var(--surface-color);
    border-radius: 28px;
    padding: 8px 10px 8px 20px;
    display: flex;
    align-items: center;
    border: 1px solid rgba(255,255,255,0.1);
    transition: border-color 0.2s, box-shadow 0.2s;
}
.input-box:focus-within {
    border-color: rgba(74, 158, 255, 0.5);
    box-shadow: 0 0 0 2px rgba(74, 158, 255, 0.1);
}
#userInput {
    flex: 1;
    background: transparent;
    border: none;
    color: white;
    font-size: 1rem;
    font-family: inherit;
    padding: 10px 0;
}
#mainBtn {
    background: white;
    color: black;
    border: none;
    width: 36px;
    height: 36px;
    border-radius: 50%;
    display: flex;
    align-items: center;
    justify-content: center;
    cursor: pointer;
    margin-left: 8px;
    transition: transform 0.2s;
}
#mainBtn:hover { transform: scale(1.05); }
.disclaimer {
    text-align: center;
    font-size: 0.75rem;
    color: #666;
    margin-top: 12px;
}
@keyframes slideUpFade {
    from { opacity: 0; transform: translateY(15px); }
    to { opacity: 1; transform: translateY(0); }
}
@keyframes blink { 0%, 100% { opacity: 1; } 50% { opacity: 0; } }
@keyframes pulseAvatar {
    0% { box-shadow: 0 0 0 0 rgba(74, 158, 255, 0.4); }
    70% { box-shadow: 0 0 0 8px rgba(74, 158, 255, 0); }
    100% { box-shadow: 0 0 0 0 rgba(74, 158, 255, 0); }
}
.pulsing { animation: pulseAvatar 1.5s infinite; }
::-webkit-scrollbar { width: 8px; }
::-webkit-scrollbar-track { background: transparent; }
::-webkit-scrollbar-thumb { background: #333; border-radius: 4px; }
</style>
</head>
<body>
<header>
    <div class="brand-wrapper" onclick="location.reload()">
        <div class="brand-logo"></div>
        <div class="brand-text">
            MTP <span class="version-badge">v1</span>
        </div>
    </div>
</header>
<div id="chatScroll" class="chat-scroll">
    <div class="msg-row bot" style="animation-delay: 0.1s;">
        <div class="bot-avatar"></div>
        <div class="msg-content-wrapper">
            <div class="msg-text">
                ¡Hola! Soy MTP, tu asistente de IA. ¿En qué puedo ayudarte hoy?
            </div>
        </div>
    </div>
</div>
<div class="footer-container">
    <div class="input-box">
        <input type="text" id="userInput" placeholder="Escribe un mensaje..." autocomplete="off">
        <button id="mainBtn" onclick="handleBtnClick()">➤</button>
    </div>
    <div class="disclaimer">
        MTP puede cometer errores. Considera verificar la información importante.
    </div>
</div>
<script>
const chatScroll = document.getElementById('chatScroll');
const userInput = document.getElementById('userInput');
const mainBtn = document.getElementById('mainBtn');
let isGenerating = false;
let abortController = null;
let typingTimeout = null;
let lastUserPrompt = "";
function scrollToBottom() {
    chatScroll.scrollTop = chatScroll.scrollHeight;
}
function setBtnState(state) {
    if (state === 'sending') {
        mainBtn.innerHTML = '⏹';
        isGenerating = true;
    } else {
        mainBtn.innerHTML = '➤';
        isGenerating = false;
        abortController = null;
    }
}
function handleBtnClick() {
    if (isGenerating) {
        stopGeneration();
    } else {
        sendMessage();
    }
}
function stopGeneration() {
    if (abortController) abortController.abort();
    if (typingTimeout) clearTimeout(typingTimeout);
    const activeCursor = document.querySelector('.typing-cursor');
    if (activeCursor) activeCursor.classList.remove('typing-cursor');
    const activeAvatar = document.querySelector('.pulsing');
    if (activeAvatar) activeAvatar.classList.remove('pulsing');
    setBtnState('idle');
    userInput.focus();
}
async function sendMessage(textOverride = null) {
    const text = textOverride || userInput.value.trim();
    if (!text) return;
    lastUserPrompt = text;
    if (!textOverride) {
        userInput.value = '';
        addMessage(text, 'user');
    }
    setBtnState('sending');
    abortController = new AbortController();
    const botRow = document.createElement('div');
    botRow.className = 'msg-row bot';
    const avatar = document.createElement('div');
    avatar.className = 'bot-avatar pulsing';
    const wrapper = document.createElement('div');
    wrapper.className = 'msg-content-wrapper';
    const msgText = document.createElement('div');
    msgText.className = 'msg-text';
    wrapper.appendChild(msgText);
    botRow.appendChild(avatar);
    botRow.appendChild(wrapper);
    chatScroll.appendChild(botRow);
    scrollToBottom();
    try {
        const response = await fetch('/generate', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ text: text }),
            signal: abortController.signal
        });
        const data = await response.json();
        if (!isGenerating) return;
        avatar.classList.remove('pulsing');
        const reply = data.reply || "No entendí eso.";
        await typeWriter(msgText, reply);
        if (isGenerating) {
            addActions(wrapper, reply);
            setBtnState('idle');
        }
    } catch (error) {
        if (error.name === 'AbortError') {
            msgText.textContent += " [Detenido]";
        } else {
            avatar.classList.remove('pulsing');
            msgText.textContent = "Error de conexión.";
            msgText.style.color = "#ff8b8b";
            setBtnState('idle');
        }
    }
}
function addMessage(text, sender) {
    const row = document.createElement('div');
    row.className = `msg-row ${sender}`;
    const content = document.createElement('div');
    content.className = 'msg-content';
    content.textContent = text;
    row.appendChild(content);
    chatScroll.appendChild(row);
    scrollToBottom();
}
function typeWriter(element, text, speed = 12) {
    return new Promise(resolve => {
        let i = 0;
        element.classList.add('typing-cursor');
        function type() {
            if (!isGenerating) {
                element.classList.remove('typing-cursor');
                resolve();
                return;
            }
            if (i < text.length) {
                element.textContent += text.charAt(i);
                i++;
                scrollToBottom();
                typingTimeout = setTimeout(type, speed + Math.random() * 5);
            } else {
                element.classList.remove('typing-cursor');
                resolve();
            }
        }
        type();
    });
}
function addActions(wrapperElement, textToCopy) {
    const actionsDiv = document.createElement('div');
    actionsDiv.className = 'bot-actions';
    const copyBtn = document.createElement('button');
    copyBtn.className = 'action-btn';
    copyBtn.innerHTML = `<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><rect x="9" y="9" width="13" height="13" rx="2" ry="2"></rect><path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1"></path></svg>`;
    copyBtn.onclick = () => {
        navigator.clipboard.writeText(textToCopy);
    };
    const regenBtn = document.createElement('button');
    regenBtn.className = 'action-btn';
    regenBtn.innerHTML = `<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M23 4v6h-6"></path><path d="M1 20v-6h6"></path><path d="M3.51 9a9 9 0 0 1 14.85-3.36L23 10M1 14l4.64 4.36A9 9 0 0 0 20.49 15"></path></svg>`;
    regenBtn.onclick = () => {
        sendMessage(lastUserPrompt);
    };
    actionsDiv.appendChild(copyBtn);
    actionsDiv.appendChild(regenBtn);
    wrapperElement.appendChild(actionsDiv);
    requestAnimationFrame(() => actionsDiv.style.opacity = "1");
    scrollToBottom();
}
userInput.addEventListener('keydown', (e) => {
    if (e.key === 'Enter') handleBtnClick();
});
window.onload = () => userInput.focus();
</script>
</body>
</html>
"""
if __name__ == "__main__":
    # Script entry point: listen on $PORT (7860 when unset) on all interfaces.
    port = int(os.environ.get("PORT", 7860))
    base_url = f"http://0.0.0.0:{port}"
    print(f"\n🚀 Iniciando servidor MTP en puerto {port}...")
    print(f"🌐 Interfaz web: {base_url}")
    print(f"📡 API docs: {base_url}/docs")
    uvicorn.run(app, host="0.0.0.0", port=port, log_level="info")