Spaces:

Chompi10
/

asddsad

Runtime error

File size: 5,624 Bytes

# ==============================
# IMPORTS
# ==============================
import json  # JSON handling
import os
import threading  # Runs model generation in a background thread
import warnings

import torch  # Model execution engine
from flask import Flask, request, Response  # Web server and streaming responses
from transformers import (  # Model classes, tokenizer and token streamer
    AutoModel,
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
)


# ==============================
# CONFIGURACIÓN DEL MODELO
# ==============================
# Hub identifier of the causal language model to serve. It can be overridden
# through the MODEL_NAME environment variable without editing this file.
# NOTE: GGUF-only repositories (for example
# "unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF") need the `gguf_file` argument
# of `from_pretrained`; pointing MODEL_NAME at one without it will not work.
MODEL_NAME = os.environ.get("MODEL_NAME", "microsoft/phi-2")

# Detect the device first so the weight dtype below can be chosen to match it.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)  # Downloads/loads the tokenizer

# The model is loaded exactly once, through the causal-LM class, because
# `generate_stream` relies on `model.generate`. Half precision is used only
# when a GPU is available.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)
model.to(device)  # Move the weights to the selected device



# ==============================
# CREAR SERVIDOR FLASK
# ==============================

app = Flask(__name__)  # Flask application object; the routes below are registered on it


# ==============================
# FUNCION STREAMING IA
# ==============================

def generate_stream(prompt):
    """
    Stream the model's completion for ``prompt`` as text fragments.

    Generation runs in a background thread that feeds a
    ``TextIteratorStreamer``; this generator yields every decoded fragment as
    soon as the streamer makes it available.

    Args:
        prompt: Full prompt text to send to the model.

    Yields:
        str: Newly generated text (the prompt itself and special tokens are
        skipped).
    """
    # Tokenize the prompt and move the resulting tensors to the model's device.
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,  # Do not echo the prompt back
        skip_special_tokens=True,  # Keep markers such as the EOS token out of the reply
    )

    generation_kwargs = {
        "inputs": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "max_new_tokens": 300,  # Upper bound on generated tokens
        "temperature": 0.5,
        "top_p": 0.5,
        "do_sample": True,  # Sampling must be on for temperature/top_p to apply
        "streamer": streamer,
    }

    def _run_generation():
        """Run ``model.generate`` and close the stream if it fails."""
        try:
            model.generate(**generation_kwargs)
        except Exception:
            # Without a stop signal the consumer loop below would block
            # forever, leaving the HTTP response hanging.
            streamer.end()
            raise

    worker = threading.Thread(target=_run_generation)
    worker.start()

    # Relay fragments to the caller as they are produced.
    for new_text in streamer:
        yield new_text

    # The stream is exhausted, so generation is finishing; reap the thread.
    worker.join()


# ==============================
# API CHAT (POST /chat)
# ==============================

@app.route("/chat", methods=["POST"])
def chat():
    """
    Receive a chat message as JSON and stream the model's reply.

    Expects a JSON object with a ``"message"`` field.

    Returns:
        A streaming ``text/plain`` response with the generated reply, or a
        400 ``text/plain`` response when the body is not a JSON object or
        carries no non-blank message.
    """
    # silent=True yields None instead of raising on a missing/invalid JSON body.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        payload = {}

    raw_message = payload.get("message", "")
    user_message = "" if raw_message is None else str(raw_message)

    # The frontend never submits empty input; reject it here too instead of
    # spending a full generation on a blank prompt.
    if not user_message.strip():
        return Response(
            "Missing 'message' in request body.",
            status=400,
            mimetype="text/plain"
        )

    # Multi-language prompt
    prompt = f"""
You are a professional AI assistant.
Detect the language of the user automatically and answer in the same language.
Be clear and structured.

User: {user_message}
Assistant:
"""

    return Response(
        generate_stream(prompt),
        mimetype="text/plain"  # Plain-text streaming body
    )


# ==============================
# FRONTEND CHAT ESTILO CHATGPT
# ==============================

@app.route("/")
def index():
    """
    Return the complete HTML page of the chat frontend.

    The page posts the user's message as JSON to ``/chat`` and appends the
    streamed plain-text reply to the conversation as it arrives. The stream is
    decoded incrementally so that multi-byte UTF-8 characters split across
    network chunks are rendered correctly.
    """

    return """
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>AI Chat</title>

<style>
body {
    margin:0;
    font-family: Arial;
    background-color:#343541;
    color:white;
    display:flex;
    flex-direction:column;
    height:100vh;
}

#chat {
    flex:1;
    padding:20px;
    overflow-y:auto;
}

.message {
    margin-bottom:15px;
    padding:10px 15px;
    border-radius:10px;
    max-width:70%;
    white-space:pre-wrap;
}

.user {
    background:#0b93f6;
    align-self:flex-end;
}

.bot {
    background:#444654;
    align-self:flex-start;
}

#input-area {
    display:flex;
    padding:15px;
    background:#202123;
}

input {
    flex:1;
    padding:10px;
    border-radius:5px;
    border:none;
    font-size:16px;
}

button {
    margin-left:10px;
    padding:10px 20px;
    border:none;
    border-radius:5px;
    background:#19c37d;
    color:white;
    font-weight:bold;
    cursor:pointer;
}
</style>
</head>

<body>

<div id="chat"></div>

<div id="input-area">
    <input id="message" placeholder="Escribe tu mensaje..." />
    <button onclick="send()">Enviar</button>
</div>

<script>

async function send() {

    const input = document.getElementById("message");
    const text = input.value;
    if (!text) return;

    input.value = "";

    const chat = document.getElementById("chat");

    // Mostrar mensaje usuario
    const userDiv = document.createElement("div");
    userDiv.className = "message user";
    userDiv.textContent = text;
    chat.appendChild(userDiv);

    // Crear mensaje bot vacío
    const botDiv = document.createElement("div");
    botDiv.className = "message bot";
    botDiv.textContent = "";
    chat.appendChild(botDiv);

    chat.scrollTop = chat.scrollHeight;

    // Enviar al backend
    const response = await fetch("/chat", {
        method:"POST",
        headers:{"Content-Type":"application/json"},
        body: JSON.stringify({message:text})
    });

    const reader = response.body.getReader();
    const decoder = new TextDecoder();

    // Streaming en tiempo real
    while (true) {
        const {done, value} = await reader.read();
        if (done) break;
        botDiv.textContent += decoder.decode(value, {stream:true});
        chat.scrollTop = chat.scrollHeight;
    }

    botDiv.textContent += decoder.decode();
    chat.scrollTop = chat.scrollHeight;
}

</script>

</body>
</html>
"""


# ==============================
# INICIAR SERVIDOR
# ==============================

if __name__ == "__main__":
    # Listen on every interface; the port is read from PORT, defaulting to 7860.
    app.run(host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))