Spaces:

TeszenAI
/

MTP-3.3.1

Sleeping

App Files Files Community

teszenofficial committed on Apr 11

Commit

d17c293

verified ·

1 Parent(s): 0e2fedd

Create app.py

Browse files

Files changed (1) hide show

app.py +700 -0

app.py ADDED Viewed

	@@ -0,0 +1,700 @@

+# Standard library
+import gc
+import json
+import math
+import os
+import re
+import sys
+import time
+import unicodedata
+from typing import Optional
+# Third-party
+import sentencepiece as spm
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import uvicorn
+from fastapi import FastAPI, Request
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import HTMLResponse, StreamingResponse
+from huggingface_hub import snapshot_download
+from pydantic import BaseModel, Field
+# Pick the compute device once at import time; the rest of the module reads DEVICE.
+if torch.cuda.is_available():
+    DEVICE = "cuda"
+    print("✅ GPU NVIDIA detectada. Usando CUDA.")
+    # Let cuDNN benchmark and pick convolution algorithms.
+    torch.backends.cudnn.benchmark = True
+else:
+    DEVICE = "cpu"
+    print("⚠️ GPU no detectada. Usando CPU.")
+    if hasattr(torch, '_dynamo'):
+        torch._dynamo.config.suppress_errors = True
+    # os.cpu_count() returns None when the count cannot be determined; fall
+    # back to 1 so the integer division below cannot raise TypeError.
+    torch.set_num_threads(max(1, (os.cpu_count() or 1) // 2))
+# This module only runs inference, so autograd is switched off globally.
+torch.set_grad_enabled(False)
+MODEL_REPO = "TeszenAI/MTP-3.3.1"
+class LayerNorm(nn.Module):
+    """Hand-written layer normalisation over the last dimension.
+
+    Divides by ``std + eps`` using ``Tensor.std`` with its default settings,
+    then applies a learned per-feature scale (``weight``) and shift (``bias``).
+    """
+    def __init__(self, d_model: int, eps: float = 1e-5):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(d_model))
+        self.bias = nn.Parameter(torch.zeros(d_model))
+        self.eps = eps
+    def forward(self, x):
+        """Normalise ``x`` along its last axis and apply scale and shift."""
+        centered = x - x.mean(-1, keepdim=True)
+        denom = x.std(-1, keepdim=True) + self.eps
+        return self.weight * centered / denom + self.bias
+class MultiHeadAttention(nn.Module):
+    """Scaled dot-product self-attention split across ``n_heads`` heads.
+
+    ``d_model`` must be divisible by ``n_heads``; each head works on
+    ``d_k = d_model // n_heads`` features.
+    """
+    def __init__(self, d_model: int, n_heads: int, dropout: float = 0.1):
+        super().__init__()
+        assert d_model % n_heads == 0
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.d_k = d_model // n_heads
+        # Projection layers are created in the same order as before so the
+        # parameter names and initialisation sequence are unchanged.
+        self.w_q = nn.Linear(d_model, d_model)
+        self.w_k = nn.Linear(d_model, d_model)
+        self.w_v = nn.Linear(d_model, d_model)
+        self.w_o = nn.Linear(d_model, d_model)
+        self.dropout = nn.Dropout(dropout)
+        self.scale = math.sqrt(self.d_k)
+    def forward(self, x, mask=None):
+        """Attend over ``x`` (batch, seq, d_model); positions where ``mask == 0`` are blocked."""
+        bsz, length, _ = x.shape
+        def split_heads(projected):
+            # (batch, seq, d_model) -> (batch, heads, seq, d_k)
+            return projected.view(bsz, length, self.n_heads, self.d_k).transpose(1, 2)
+        queries = split_heads(self.w_q(x))
+        keys = split_heads(self.w_k(x))
+        values = split_heads(self.w_v(x))
+        scores = torch.matmul(queries, keys.transpose(-2, -1)) / self.scale
+        if mask is not None:
+            scores = scores.masked_fill(mask == 0, float('-inf'))
+        weights = self.dropout(F.softmax(scores, dim=-1))
+        # Merge the heads back into a single d_model-wide feature axis.
+        context = torch.matmul(weights, values).transpose(1, 2).contiguous()
+        return self.w_o(context.view(bsz, length, self.d_model))
+class FeedForward(nn.Module):
+    """Position-wise MLP: Linear(d_model, d_ff) -> GELU -> dropout -> Linear(d_ff, d_model)."""
+    def __init__(self, d_model: int, d_ff: int, dropout: float = 0.1):
+        super().__init__()
+        self.linear1 = nn.Linear(d_model, d_ff)
+        self.linear2 = nn.Linear(d_ff, d_model)
+        self.dropout = nn.Dropout(dropout)
+    def forward(self, x):
+        """Apply the two-layer MLP to the last dimension of ``x``."""
+        hidden = F.gelu(self.linear1(x))
+        hidden = self.dropout(hidden)
+        return self.linear2(hidden)
+class TransformerBlock(nn.Module):
+    """One post-norm Transformer layer.
+
+    Each sub-layer (self-attention, then feed-forward) is applied as
+    ``norm(x + dropout(sublayer(x)))``.
+    """
+    def __init__(self, d_model: int, n_heads: int, d_ff: int, dropout: float = 0.1):
+        super().__init__()
+        self.attention = MultiHeadAttention(d_model, n_heads, dropout)
+        self.feed_forward = FeedForward(d_model, d_ff, dropout)
+        self.norm1 = LayerNorm(d_model)
+        self.norm2 = LayerNorm(d_model)
+        self.dropout1 = nn.Dropout(dropout)
+        self.dropout2 = nn.Dropout(dropout)
+    def forward(self, x, mask=None):
+        """Run attention and feed-forward with residual connections; ``mask`` goes to attention."""
+        # Residual + norm around self-attention.
+        x = self.norm1(x + self.dropout1(self.attention(x, mask)))
+        # Residual + norm around the feed-forward network.
+        x = self.norm2(x + self.dropout2(self.feed_forward(x)))
+        return x
+class PositionalEncoding(nn.Module):
+    """Fixed sinusoidal position table added to token embeddings.
+
+    The table has shape (1, max_len, d_model) and is stored as the buffer
+    ``pe``: even feature indices hold sines, odd indices hold cosines.
+    """
+    def __init__(self, d_model: int, max_len: int = 5000):
+        super().__init__()
+        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+        freqs = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
+        angles = positions * freqs
+        table = torch.zeros(max_len, d_model)
+        table[:, 0::2] = torch.sin(angles)
+        table[:, 1::2] = torch.cos(angles)
+        self.register_buffer('pe', table.unsqueeze(0))
+    def forward(self, x):
+        """Add the first ``x.size(1)`` rows of the table to ``x``."""
+        seq_len = x.size(1)
+        return x + self.pe[:, :seq_len, :]
+class MTPModel(nn.Module):
+    """Decoder-only Transformer language model with a sampling generator.
+
+    Args:
+        vocab_size: Number of token ids; sizes the embedding and output head.
+        d_model: Width of the residual stream.
+        n_heads: Attention heads per block.
+        n_layers: Number of stacked ``TransformerBlock`` layers.
+        d_ff: Hidden width of each feed-forward sub-layer.
+        dropout: Dropout probability passed to every block.
+        max_len: Length of the positional table, i.e. the longest sequence
+            ``forward`` can process.
+    """
+    def __init__(self, vocab_size: int, d_model: int = 512, n_heads: int = 8,
+                 n_layers: int = 8, d_ff: int = 2048, dropout: float = 0.1, max_len: int = 1024):
+        super().__init__()
+        self.vocab_size = vocab_size
+        self.d_model = d_model
+        self.max_len = max_len
+        self.token_embedding = nn.Embedding(vocab_size, d_model)
+        self.pos_encoding = PositionalEncoding(d_model, max_len)
+        self.blocks = nn.ModuleList([
+            TransformerBlock(d_model, n_heads, d_ff, dropout) for _ in range(n_layers)
+        ])
+        self.norm = LayerNorm(d_model)
+        self.lm_head = nn.Linear(d_model, vocab_size)
+    def forward(self, x, mask=None):
+        """Return next-token logits of shape (batch, seq, vocab_size).
+
+        Args:
+            x: Integer token ids of shape (batch, seq); seq must not exceed
+                ``max_len``.
+            mask: Optional attention mask (0 = blocked). A lower-triangular
+                causal mask is built when omitted.
+        """
+        if mask is None:
+            seq_len = x.size(1)
+            # Allocate the causal mask directly on the input's device instead
+            # of building it on CPU and copying it over on every call.
+            mask = torch.tril(torch.ones(seq_len, seq_len, device=x.device)).unsqueeze(0).unsqueeze(0)
+        # Embeddings are scaled by sqrt(d_model) before positions are added.
+        x = self.token_embedding(x) * math.sqrt(self.d_model)
+        x = self.pos_encoding(x)
+        for block in self.blocks:
+            x = block(x, mask)
+        x = self.norm(x)
+        return self.lm_head(x)
+    @torch.inference_mode()
+    def generate(self, input_ids, max_new_tokens=200, temperature=0.7, top_k=50, top_p=0.9,
+                 repetition_penalty=1.15, stop_token_ids=(2, 3)):
+        """Sample up to ``max_new_tokens`` tokens after ``input_ids``.
+
+        Only the first row of the batch is sampled (batch size 1 is expected).
+
+        Args:
+            input_ids: Token ids of shape (1, seq).
+            max_new_tokens: Upper bound on the number of sampled tokens.
+            temperature: Logit divisor; must be > 0.
+            top_k: Keep only the ``top_k`` highest logits (0 disables).
+            top_p: Nucleus threshold; applied only when strictly between 0 and 1.
+            repetition_penalty: Penalty for tokens seen in the last 50
+                positions (1.0 disables).
+            stop_token_ids: Ids that end generation without being appended.
+                The default (2, 3) matches the previously hard-coded values
+                (presumably end-of-sequence markers; confirm against the tokenizer).
+
+        Returns:
+            ``input_ids`` with the sampled tokens concatenated along dim 1.
+
+        Raises:
+            ValueError: If ``temperature`` is not positive.
+        """
+        if temperature <= 0:
+            raise ValueError("temperature must be > 0")
+        stop_ids = frozenset(stop_token_ids)
+        generated = input_ids
+        for _ in range(max_new_tokens):
+            # The positional table has max_len rows; feed only the most recent
+            # max_len tokens so long generations cannot hit a shape mismatch.
+            context = generated[:, -self.max_len:]
+            logits = self(context)
+            next_logits = logits[0, -1, :] / temperature
+            if repetition_penalty != 1.0:
+                recent = torch.tensor(
+                    sorted(set(generated[0].tolist()[-50:])),
+                    dtype=torch.long,
+                    device=next_logits.device,
+                )
+                seen = next_logits[recent]
+                # Dividing a negative logit would make the token MORE likely;
+                # penalise by sign: divide positives, multiply negatives.
+                next_logits[recent] = torch.where(
+                    seen > 0, seen / repetition_penalty, seen * repetition_penalty
+                )
+            if top_k > 0:
+                top_k_val = min(top_k, next_logits.size(-1))
+                indices_to_remove = next_logits < torch.topk(next_logits, top_k_val)[0][..., -1, None]
+                next_logits[indices_to_remove] = float('-inf')
+            if top_p < 1.0 and top_p > 0.0:
+                sorted_logits, sorted_indices = torch.sort(next_logits, descending=True)
+                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+                sorted_indices_to_remove = cumulative_probs > top_p
+                # Shift right so the token that crosses the threshold is kept,
+                # and always keep the single most likely token.
+                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+                sorted_indices_to_remove[..., 0] = 0
+                indices_to_remove = sorted_indices[sorted_indices_to_remove]
+                next_logits[indices_to_remove] = float('-inf')
+            probs = F.softmax(next_logits, dim=-1)
+            next_token = torch.multinomial(probs, num_samples=1).item()
+            if next_token in stop_ids:
+                break
+            generated = torch.cat([generated, torch.tensor([[next_token]], device=generated.device)], dim=1)
+        return generated
+# ---- Download the repository snapshot and build the model at import time ----
+print(f"📦 Descargando modelo desde {MODEL_REPO}...")
+repo_path = snapshot_download(
+    repo_id=MODEL_REPO,
+    repo_type="model",
+    local_dir="mtp_repo",
+    ignore_patterns=["*.h5", "*.ot", "*.msgpack"]
+)
+# Hyper-parameters used when config.json is absent or omits a key.
+_DEFAULT_CONFIG = {
+    "vocab_size": 8000,
+    "d_model": 512,
+    "n_heads": 8,
+    "n_layers": 8,
+    "d_ff": 2048,
+    "dropout": 0.1,
+    "max_len": 1024
+}
+config = dict(_DEFAULT_CONFIG)
+config_path = os.path.join(repo_path, "config.json")
+if os.path.exists(config_path):
+    with open(config_path, "r", encoding="utf-8") as f:
+        # Overlay the file on the defaults so a partial config.json cannot
+        # cause a KeyError further down.
+        config.update(json.load(f))
+tokenizer_path = os.path.join(repo_path, "mtp_tokenizer.model")
+if not os.path.exists(tokenizer_path):
+    print(f"❌ Tokenizador no encontrado en {tokenizer_path}")
+    sys.exit(1)
+sp = spm.SentencePieceProcessor()
+sp.load(tokenizer_path)
+# The tokenizer is the source of truth for the vocabulary size.
+VOCAB_SIZE = sp.get_piece_size()
+config["vocab_size"] = VOCAB_SIZE
+print(f"🧠 Inicializando modelo MTP...")
+print(f"   → Vocabulario: {VOCAB_SIZE}")
+print(f"   → Dimensión: {config['d_model']}")
+print(f"   → Capas: {config['n_layers']}")
+print(f"   → Heads: {config['n_heads']}")
+# Pass only the keys MTPModel.__init__ accepts; config.json may carry extra
+# metadata that would otherwise raise TypeError on unpacking.
+_model_kwargs = {key: config[key] for key in _DEFAULT_CONFIG}
+model = MTPModel(**_model_kwargs)
+model.to(DEVICE)
+model_path = os.path.join(repo_path, "mtp_model.pt")
+if os.path.exists(model_path):
+    # NOTE(security): torch.load unpickles the file, which can execute
+    # arbitrary code from an untrusted checkpoint. Consider
+    # weights_only=True once the checkpoint format is confirmed compatible.
+    state_dict = torch.load(model_path, map_location=DEVICE)
+    # strict=False tolerates mismatches; report them instead of hiding them,
+    # because missing keys leave those layers randomly initialised.
+    _load_result = model.load_state_dict(state_dict, strict=False)
+    if _load_result.missing_keys:
+        print(f"⚠️ Claves ausentes en el checkpoint: {_load_result.missing_keys}")
+    if _load_result.unexpected_keys:
+        print(f"⚠️ Claves inesperadas en el checkpoint: {_load_result.unexpected_keys}")
+    print("✅ Pesos del modelo cargados")
+else:
+    print(f"⚠️ No se encontró {model_path}, usando pesos aleatorios")
+model.eval()
+if DEVICE == "cuda":
+    model = torch.compile(model, mode="reduce-overhead")
+param_count = sum(p.numel() for p in model.parameters())
+print(f"✅ Modelo cargado: {param_count:,} parámetros ({param_count/1e6:.1f}M)")
+# FastAPI application object; the route decorators below register on it.
+app = FastAPI(
+    title="MTP API",
+    description="API para modelo de lenguaje MTP",
+    version="2.0",
+)
+# CORS is left fully open: any origin, method and header is accepted.
+app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
+class PromptRequest(BaseModel):
+    # Request body for POST /generate. Every sampling field has a default and
+    # is range-checked by the Field constraints below.
+    text: str = Field(
+        ..., description="Texto de entrada", max_length=2000
+    )
+    max_tokens: int = Field(
+        default=200, description="Tokens máximos a generar", ge=10, le=300
+    )
+    temperature: float = Field(
+        default=0.7, description="Temperatura de muestreo", ge=0.3, le=1.5
+    )
+    top_k: int = Field(
+        default=60, description="Top-k sampling", ge=1, le=100
+    )
+    top_p: float = Field(
+        default=0.92, description="Top-p sampling", ge=0.5, le=1.0
+    )
+    repetition_penalty: float = Field(
+        default=1.15, description="Penalización por repetición", ge=1.0, le=2.0
+    )
+def build_prompt(user_input: str) -> str:
+    """Wrap ``user_input`` in the instruction/response template fed to the model."""
+    template = "### Instrucción:\n{}\n\n### Respuesta:\n"
+    return template.format(user_input)
+# Count of /generate requests currently in flight; incremented and decremented
+# by the generate handler and reported by the /health endpoint.
+ACTIVE_REQUESTS = 0
+class MTPTokenizer:
+    """Thin wrapper that forwards every call to the wrapped tokenizer object."""
+    def __init__(self, sp_model):
+        """Store ``sp_model``; every method below delegates to it."""
+        self.sp = sp_model
+    def encode(self, text):
+        """Return ``self.sp.encode(text)``."""
+        return self.sp.encode(text)
+    def decode(self, tokens):
+        """Return ``self.sp.decode(tokens)``."""
+        return self.sp.decode(tokens)
+    def bos_id(self):
+        """Return the wrapped tokenizer's ``bos_id()``."""
+        return self.sp.bos_id()
+    def eos_id(self):
+        """Return the wrapped tokenizer's ``eos_id()``."""
+        return self.sp.eos_id()
+    def pad_id(self):
+        """Return the wrapped tokenizer's ``pad_id()``."""
+        return self.sp.pad_id()
+# Module-level wrapper around the SentencePiece processor loaded above.
+tokenizer_wrapper = MTPTokenizer(sp)
+# Canned replies keyed by a lower-case, accent-free trigger phrase. Lookup is
+# by substring, and the first matching key in insertion order wins.
+KNOWLEDGE_BASE = {
+    "inteligencia artificial": "La Inteligencia Artificial es un campo de la computación que crea sistemas capaces de realizar tareas que requieren inteligencia humana, como aprendizaje, razonamiento, percepción y procesamiento de lenguaje natural.",
+    "machine learning": "El Machine Learning o Aprendizaje Automático es una rama de la IA que permite a los sistemas aprender y mejorar desde la experiencia sin ser programados explícitamente, usando algoritmos que identifican patrones en datos.",
+    "redes neuronales": "Las redes neuronales artificiales son sistemas computacionales inspirados en el cerebro humano, compuestos por capas de neuronas artificiales que procesan información para reconocer patrones y hacer predicciones.",
+    "python": "Python es un lenguaje de programación de alto nivel, interpretado y de propósito general, conocido por su sintaxis clara y legible, ideal para ciencia de datos, IA y desarrollo web.",
+    "transformers": "Los Transformers son una arquitectura de deep learning basada en mecanismos de atención que revolucionó el NLP, siendo la base de modelos como GPT, BERT y MTP.",
+    "gpt": "GPT (Generative Pre-trained Transformer) es una familia de modelos de lenguaje desarrollados por OpenAI que generan texto coherente y contextualmente relevante.",
+    "hola": "¡Hola! Soy MTP, tu asistente de IA. ¿En qué puedo ayudarte hoy?",
+    "como estas": "¡Estoy funcionando de manera óptima! Como asistente de IA, siempre estoy listo para ayudarte. ¿En qué puedo asistirte?",
+    "quien eres": "Soy MTP (Mi Transformer Personalizado), un asistente de IA creado con arquitectura Transformer desde cero. Fui entrenado para responder preguntas, mantener conversaciones y ayudarte con diversas tareas.",
+    "que puedes hacer": "Puedo responder preguntas sobre diversos temas, ayudarte con programación, explicar conceptos científicos y tecnológicos, mantener conversaciones, y asistirte en tareas de procesamiento de lenguaje natural.",
+    "gracias": "¡De nada! Fue un placer ayudarte. Si necesitas algo más, aquí estoy. ¡Que tengas un excelente día!",
+    "adios": "¡Hasta luego! Fue un gusto conversar contigo. No dudes en volver si necesitas ayuda. ¡Que tengas un buen día!"
+}
+def _normalize_for_lookup(text: str) -> str:
+    """Lower-case ``text``, strip diacritics and collapse runs of whitespace."""
+    decomposed = unicodedata.normalize("NFKD", text.lower())
+    without_marks = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
+    return " ".join(without_marks.split())
+def get_fallback_response(user_input: str) -> Optional[str]:
+    """Return the canned reply whose trigger phrase occurs in ``user_input``.
+
+    Matching ignores case, accents and repeated whitespace, so correctly
+    accented input such as "¿Cómo estás?" matches the unaccented key
+    "como estas".
+
+    Args:
+        user_input: Raw text typed by the user.
+
+    Returns:
+        The reply for the first matching key, or ``None`` when nothing
+        matches or the input is empty.
+    """
+    if not user_input:
+        return None
+    normalized = _normalize_for_lookup(user_input)
+    for key, response in KNOWLEDGE_BASE.items():
+        if key in normalized:
+            return response
+    return None
+def clean_response(text: str, user_input: str = "") -> str:
+    """Tidy raw model output before it is returned to the client.
+
+    Steps, in order: collapse any character repeated five or more times to
+    two, drop ``<unk>`` / ``[UNK]`` markers, keep at most the first three
+    sentences, and squeeze whitespace. Sentence truncation keeps each
+    sentence's own terminator (``!``, ``?``, ``...``) instead of rewriting
+    them all as ``.``.
+
+    Args:
+        text: Decoded model output.
+        user_input: Original user message, used only to look up a canned
+            reply when the cleaned text is too short.
+
+    Returns:
+        The cleaned text; ``""`` for empty input; a fallback message when
+        fewer than 5 characters remain after cleaning.
+    """
+    if not text:
+        return ""
+    text = re.sub(r'(.)\1{4,}', r'\1\1', text)
+    text = re.sub(r'<unk>|\[UNK\]', '', text)
+    # Each match is one sentence together with its trailing punctuation (or
+    # the unterminated tail), so joining the pieces loses nothing.
+    sentences = re.findall(r'[^.!?]+(?:[.!?]+|$)', text)
+    if len(sentences) > 3:
+        text = ''.join(sentences[:3])
+    text = re.sub(r'\s+', ' ', text).strip()
+    if len(text) < 5:
+        fallback = get_fallback_response(user_input)
+        if fallback:
+            return fallback
+        return "Lo siento, no pude generar una respuesta clara. ¿Podrías reformular tu pregunta?"
+    return text
+@app.post("/generate")
+async def generate(req: PromptRequest):
+    """Generate a reply for ``req.text``.
+
+    Short inputs (under 30 characters) that match the knowledge base are
+    answered from it without running the model. Otherwise the prompt is
+    tokenised, truncated to fit the model, sampled from, decoded and cleaned.
+
+    Returns a dict with ``reply`` plus ``tokens_generated`` and either
+    ``source`` or ``model``; on failure, ``reply`` and ``error``.
+    """
+    global ACTIVE_REQUESTS
+    ACTIVE_REQUESTS += 1
+    # The outer try/finally guarantees the counter is decremented on every
+    # exit path, including exceptions raised before inference starts.
+    try:
+        user_input = req.text.strip()
+        if not user_input:
+            return {"reply": "", "tokens_generated": 0}
+        fallback_response = get_fallback_response(user_input)
+        if fallback_response and len(user_input) < 30:
+            return {"reply": fallback_response, "tokens_generated": 0, "source": "knowledge_base"}
+        full_prompt = build_prompt(user_input)
+        tokens = tokenizer_wrapper.encode(full_prompt)
+        max_input_tokens = model.max_len - 50
+        if len(tokens) > max_input_tokens:
+            # Keep the tail so the "### Respuesta:" marker is preserved.
+            tokens = tokens[-max_input_tokens:]
+        # Prompt plus generated tokens must fit in the model's positional
+        # table, so cap the generation budget by the remaining room.
+        max_new_tokens = max(0, min(req.max_tokens, 250, model.max_len - len(tokens)))
+        input_ids = torch.tensor([tokens], device=DEVICE)
+        try:
+            with torch.inference_mode():
+                output_ids = model.generate(
+                    input_ids,
+                    max_new_tokens=max_new_tokens,
+                    temperature=req.temperature,
+                    top_k=req.top_k,
+                    top_p=req.top_p,
+                    repetition_penalty=req.repetition_penalty
+                )
+            gen_tokens = output_ids[0, len(tokens):].tolist()
+            # Drop out-of-range ids and the special ids 0 and 1 before decoding.
+            safe_tokens = [t for t in gen_tokens if 0 <= t < VOCAB_SIZE and t not in (0, 1)]
+            if safe_tokens:
+                response = tokenizer_wrapper.decode(safe_tokens).strip()
+            else:
+                response = ""
+            response = clean_response(response, user_input)
+            # Safety net for very short replies.
+            if len(response) < 5:
+                fallback = get_fallback_response(user_input)
+                if fallback:
+                    response = fallback
+                else:
+                    response = "Entendido. ¿Podrías darme más detalles para ayudarte mejor?"
+            return {
+                "reply": response,
+                "tokens_generated": len(safe_tokens),
+                "model": "MTP-v2"
+            }
+        except Exception as e:
+            # Top-level boundary: log the failure and degrade to a canned reply.
+            print(f"❌ Error durante generación: {e}")
+            fallback = get_fallback_response(user_input)
+            if not fallback:
+                fallback = "Lo siento, ocurrió un error al procesar tu solicitud. Por favor, intenta de nuevo."
+            return {
+                "reply": fallback,
+                "error": str(e)
+            }
+        finally:
+            # Release memory only when inference was actually attempted.
+            if DEVICE == "cuda":
+                torch.cuda.empty_cache()
+            gc.collect()
+    finally:
+        ACTIVE_REQUESTS -= 1
+@app.get("/health")
+def health_check():
+    # Liveness probe: a fixed status plus the device, in-flight request count
+    # and vocabulary size.
+    payload = dict(status="healthy", model="MTP", device=DEVICE)
+    payload["active_requests"] = ACTIVE_REQUESTS
+    payload["vocab_size"] = VOCAB_SIZE
+    return payload
+@app.get("/info")
+def model_info():
+    # Static description of the loaded model: architecture hyper-parameters
+    # read from `config` (with defaults), parameter count and device.
+    architecture = {"vocab_size": VOCAB_SIZE}
+    for key, default in (("d_model", 512), ("n_layers", 8), ("n_heads", 8), ("max_len", 1024)):
+        architecture[key] = config.get(key, default)
+    total_params = sum(p.numel() for p in model.parameters())
+    return {
+        "model_name": "MTP",
+        "version": "2.0",
+        "architecture": architecture,
+        "parameters": total_params,
+        "device": DEVICE
+    }
+# Serves the single-page chat UI as one static HTML document (markup, CSS and
+# script inline). The script POSTs the typed message with fixed sampling
+# settings to /generate and renders the `reply` field of the JSON response.
+# NOTE: the page is a regular (non-raw) Python string, so the `\\n` inside the
+# script's regex reaches the browser as `\n`.
+@app.get("/", response_class=HTMLResponse)
+def chat_ui():
+    return """
+<!DOCTYPE html>
+<html lang="es">
+<head>
+<meta charset="UTF-8">
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<title>MTP - Asistente IA Inteligente</title>
+<style>
+* { margin: 0; padding: 0; box-sizing: border-box; }
+body {
+    background: #0a0a0a;
+    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', sans-serif;
+    height: 100vh;
+    display: flex;
+    flex-direction: column;
+}
+.chat-header {
+    padding: 20px 24px;
+    background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
+    border-bottom: 1px solid #2a2a4a;
+}
+.chat-header h1 {
+    color: white;
+    font-size: 1.3rem;
+    font-weight: 600;
+    display: flex;
+    align-items: center;
+    gap: 10px;
+}
+.chat-header p {
+    color: #888;
+    font-size: 0.8rem;
+    margin-top: 5px;
+}
+.chat-messages {
+    flex: 1;
+    overflow-y: auto;
+    padding: 24px;
+    display: flex;
+    flex-direction: column;
+    gap: 16px;
+}
+.message {
+    display: flex;
+    gap: 12px;
+    max-width: 85%;
+    animation: fadeIn 0.3s ease;
+}
+@keyframes fadeIn {
+    from { opacity: 0; transform: translateY(10px); }
+    to { opacity: 1; transform: translateY(0); }
+}
+.message.user {
+    align-self: flex-end;
+    flex-direction: row-reverse;
+}
+.message-content {
+    padding: 12px 18px;
+    border-radius: 20px;
+    font-size: 0.95rem;
+    line-height: 1.45;
+    word-wrap: break-word;
+    max-width: 100%;
+}
+.user .message-content {
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    color: white;
+    border-radius: 20px 4px 20px 20px;
+}
+.bot .message-content {
+    background: #1e1e2e;
+    color: #e0e0e0;
+    border-radius: 4px 20px 20px 20px;
+    border: 1px solid #2a2a4a;
+}
+.chat-input-container {
+    padding: 20px 24px;
+    background: #0f0f0f;
+    border-top: 1px solid #1a1a2e;
+}
+.input-wrapper {
+    display: flex;
+    gap: 12px;
+    max-width: 900px;
+    margin: 0 auto;
+}
+#messageInput {
+    flex: 1;
+    padding: 14px 18px;
+    background: #1a1a2e;
+    border: 1px solid #2a2a4a;
+    border-radius: 28px;
+    color: white;
+    font-size: 0.95rem;
+    outline: none;
+    transition: all 0.2s;
+}
+#messageInput:focus {
+    border-color: #667eea;
+    background: #1e1e3a;
+}
+#messageInput::placeholder {
+    color: #666;
+}
+#sendBtn {
+    padding: 14px 28px;
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    border: none;
+    border-radius: 28px;
+    color: white;
+    font-weight: 600;
+    cursor: pointer;
+    transition: transform 0.1s, opacity 0.2s;
+}
+#sendBtn:hover { opacity: 0.9; transform: scale(1.02); }
+#sendBtn:disabled {
+    opacity: 0.5;
+    transform: none;
+    cursor: not-allowed;
+}
+.typing {
+    display: flex;
+    gap: 6px;
+    padding: 12px 18px;
+}
+.typing span {
+    width: 8px;
+    height: 8px;
+    background: #888;
+    border-radius: 50%;
+    animation: bounce 1.4s infinite ease-in-out;
+}
+.typing span:nth-child(1) { animation-delay: -0.32s; }
+.typing span:nth-child(2) { animation-delay: -0.16s; }
+@keyframes bounce {
+    0%, 80%, 100% { transform: scale(0); }
+    40% { transform: scale(1); }
+}
+.status-badge {
+    display: inline-block;
+    width: 10px;
+    height: 10px;
+    border-radius: 50%;
+    background: #4ade80;
+    margin-right: 8px;
+    animation: pulse 2s infinite;
+}
+@keyframes pulse {
+    0%, 100% { opacity: 1; }
+    50% { opacity: 0.5; }
+}
+@media (max-width: 768px) {
+    .message { max-width: 95%; }
+    .chat-messages { padding: 16px; }
+    .chat-header { padding: 16px; }
+}
+</style>
+</head>
+<body>
+<div class="chat-header">
+    <h1>
+        <span class="status-badge"></span>
+        🤖 MTP - Asistente IA Inteligente
+    </h1>
+    <p>Modelo Transformer personalizado | Respuestas coherentes y contextuales</p>
+</div>
+<div class="chat-messages" id="chatMessages">
+    <div class="message bot">
+        <div class="message-content">¡Hola! Soy MTP, tu asistente de IA inteligente. Puedo responder preguntas, ayudarte con programación, explicar conceptos y mantener conversaciones. ¿En qué puedo ayudarte hoy?</div>
+    </div>
+</div>
+<div class="chat-input-container">
+    <div class="input-wrapper">
+        <input type="text" id="messageInput" placeholder="Escribe tu mensaje aquí..." autocomplete="off">
+        <button id="sendBtn">Enviar</button>
+    </div>
+</div>
+<script>
+const chatMessages = document.getElementById('chatMessages');
+const messageInput = document.getElementById('messageInput');
+const sendBtn = document.getElementById('sendBtn');
+let isLoading = false;
+function addMessage(text, isUser) {
+    const div = document.createElement('div');
+    div.className = `message ${isUser ? 'user' : 'bot'}`;
+    const escapedText = text.replace(/</g, '&lt;').replace(/>/g, '&gt;').replace(/\\n/g, '<br>');
+    div.innerHTML = `<div class="message-content">${escapedText}</div>`;
+    chatMessages.appendChild(div);
+    chatMessages.scrollTop = chatMessages.scrollHeight;
+    return div;
+}
+function addTypingIndicator() {
+    const div = document.createElement('div');
+    div.className = 'message bot';
+    div.id = 'typingIndicator';
+    div.innerHTML = `<div class="typing"><span></span><span></span><span></span></div>`;
+    chatMessages.appendChild(div);
+    chatMessages.scrollTop = chatMessages.scrollHeight;
+}
+function removeTypingIndicator() {
+    const indicator = document.getElementById('typingIndicator');
+    if (indicator) indicator.remove();
+}
+async function sendMessage() {
+    const text = messageInput.value.trim();
+    if (!text || isLoading) return;
+    messageInput.value = '';
+    addMessage(text, true);
+    isLoading = true;
+    sendBtn.disabled = true;
+    addTypingIndicator();
+    try {
+        const response = await fetch('/generate', {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({
+                text: text,
+                max_tokens: 200,
+                temperature: 0.7,
+                top_k: 60,
+                top_p: 0.92,
+                repetition_penalty: 1.15
+            })
+        });
+        const data = await response.json();
+        removeTypingIndicator();
+        const reply = data.reply || "Lo siento, no pude generar una respuesta.";
+        addMessage(reply, false);
+    } catch (error) {
+        removeTypingIndicator();
+        addMessage('Error de conexión. Por favor, intenta de nuevo.', false);
+    } finally {
+        isLoading = false;
+        sendBtn.disabled = false;
+        messageInput.focus();
+    }
+}
+messageInput.addEventListener('keypress', (e) => {
+    if (e.key === 'Enter' && !e.shiftKey) {
+        e.preventDefault();
+        sendMessage();
+    }
+});
+sendBtn.addEventListener('click', sendMessage);
+messageInput.focus();
+</script>
+</body>
+</html>
+"""
+if __name__ == "__main__":
+    # Script entry point: read the port from $PORT (default 7860), print the
+    # start-up banner and hand the app to uvicorn on all interfaces.
+    port = int(os.environ.get("PORT", 7860))
+    banner = (
+        f"\n🚀 Iniciando servidor MTP Inteligente en puerto {port}...",
+        f"🌐 Interfaz web: http://0.0.0.0:{port}",
+        f"📡 API docs: http://0.0.0.0:{port}/docs",
+        f"📊 Endpoint POST: http://0.0.0.0:{port}/generate",
+    )
+    for line in banner:
+        print(line)
+    uvicorn.run(app, host="0.0.0.0", port=port, log_level="warning")