# asddsad / app.py
# Chompi10's picture
# Update app.py
# 2980a05 verified
# ==============================
# IMPORTS
# ==============================
import json  # JSON data handling
import os
import threading  # Runs the model in a background thread
import warnings

import torch  # Model execution engine
from flask import Flask, request, Response  # Web server and streaming
from transformers import (  # AI model
    AutoModel,
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
)
# ==============================
# CONFIGURACIÓN DEL MODELO
# ==============================
# Hugging Face model id to serve. It can be overridden with the MODEL_NAME
# environment variable; the default is the model named in the original
# configuration comment.
#
# NOTE(review): an earlier
# `AutoModel.from_pretrained("unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF", ...)`
# call was dropped here. Its result was immediately overwritten by the
# AutoModelForCausalLM load below, and a bare AutoModel has no language-model
# head for generate(). GGUF repositories presumably also need an explicit
# `gguf_file=` argument — confirm before switching to that checkpoint.
MODEL_NAME = os.environ.get("MODEL_NAME", "microsoft/phi-2")

# Detect the GPU once and reuse the answer for both dtype and placement.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)  # Downloads/loads the tokenizer
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    # Half precision on GPU, full precision on CPU.
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)
model.to(device)  # Move the model to the selected device
# ==============================
# CREAR SERVIDOR FLASK
# ==============================
app = Flask(__name__) # Create the Flask application that serves the chat UI and API
# ==============================
# FUNCION STREAMING IA
# ==============================
def generate_stream(prompt):
    """
    Stream the model's completion for ``prompt`` chunk by chunk.

    ``model.generate`` runs in a background thread and pushes decoded text
    into a ``TextIteratorStreamer``; this generator yields each piece of text
    as soon as the streamer makes it available.

    Args:
        prompt: Full prompt text to tokenize and feed to the model.

    Yields:
        str: Newly generated text, without the prompt and without special
        tokens.
    """
    # Tokenize the prompt and move the tensors to the model's device.
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,  # Do not echo the prompt back
        # Strip special tokens (e.g. end-of-sequence markers) so they never
        # reach the user-visible stream.
        skip_special_tokens=True,
    )

    generation_kwargs = {
        "inputs": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "max_new_tokens": 300,  # Upper bound on generated tokens
        "temperature": 0.5,  # Sampling temperature
        "top_p": 0.5,  # Nucleus-sampling threshold
        # Sampling must be enabled, otherwise temperature/top_p are ignored.
        "do_sample": True,
        "streamer": streamer,  # Route generated tokens to the streamer
    }

    # generate() blocks until completion, so it runs in a worker thread while
    # this generator drains the streamer.
    worker = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    worker.start()

    # Forward text to the caller as it is produced.
    for new_text in streamer:
        yield new_text

    # The streamer is exhausted, so generation is finishing; reap the thread.
    worker.join()
# ==============================
# API CHAT (POST /chat)
# ==============================
@app.route("/chat", methods=["POST"])
def chat():
    """
    Handle POST /chat: read the user's message and stream the model's reply.

    Expects a JSON object body such as ``{"message": "..."}``. A missing
    ``message`` key is treated as an empty message.

    Returns:
        Response: a ``text/plain`` streaming response fed by
        ``generate_stream``, or a 400 response when the JSON body is not an
        object.
    """
    data = request.json  # Parsed JSON body

    # Valid JSON that is not an object (null, a list, a string, ...) has no
    # .get(); reject it explicitly instead of failing with a server error.
    if not isinstance(data, dict):
        return Response(
            'Request body must be a JSON object like {"message": "..."}',
            status=400,
            mimetype="text/plain",
        )

    user_message = data.get("message", "")

    # Multi-language prompt: the model is told to answer in the user's language.
    prompt = f"""
You are a professional AI assistant.
Detect the language of the user automatically and answer in the same language.
Be clear and structured.
User: {user_message}
Assistant:
"""
    return Response(
        generate_stream(prompt),
        mimetype="text/plain",  # Plain-text streaming body
    )
# ==============================
# FRONTEND CHAT ESTILO CHATGPT
# ==============================
@app.route("/")
def index():
    """
    Serve the single-page chat UI.

    The page posts the typed message to ``/chat`` as JSON and appends the
    streamed response to the bot bubble as chunks arrive. Chunks are decoded
    with ``TextDecoder`` in streaming mode so that a multi-byte UTF-8
    character split across two network chunks is not corrupted, and the
    decoder is flushed once the stream ends.

    Returns:
        str: The complete HTML document (markup, styles and script).
    """
    return """
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>AI Chat</title>
<style>
body {
margin:0;
font-family: Arial;
background-color:#343541;
color:white;
display:flex;
flex-direction:column;
height:100vh;
}
#chat {
flex:1;
padding:20px;
overflow-y:auto;
}
.message {
margin-bottom:15px;
padding:10px 15px;
border-radius:10px;
max-width:70%;
white-space:pre-wrap;
}
.user {
background:#0b93f6;
align-self:flex-end;
}
.bot {
background:#444654;
align-self:flex-start;
}
#input-area {
display:flex;
padding:15px;
background:#202123;
}
input {
flex:1;
padding:10px;
border-radius:5px;
border:none;
font-size:16px;
}
button {
margin-left:10px;
padding:10px 20px;
border:none;
border-radius:5px;
background:#19c37d;
color:white;
font-weight:bold;
cursor:pointer;
}
</style>
</head>
<body>
<div id="chat"></div>
<div id="input-area">
<input id="message" placeholder="Escribe tu mensaje..." />
<button onclick="send()">Enviar</button>
</div>
<script>
async function send() {
const input = document.getElementById("message");
const text = input.value;
if (!text) return;
input.value = "";
const chat = document.getElementById("chat");
// Mostrar mensaje usuario
const userDiv = document.createElement("div");
userDiv.className = "message user";
userDiv.textContent = text;
chat.appendChild(userDiv);
// Crear mensaje bot vacío
const botDiv = document.createElement("div");
botDiv.className = "message bot";
botDiv.textContent = "";
chat.appendChild(botDiv);
chat.scrollTop = chat.scrollHeight;
// Enviar al backend
const response = await fetch("/chat", {
method:"POST",
headers:{"Content-Type":"application/json"},
body: JSON.stringify({message:text})
});
const reader = response.body.getReader();
const decoder = new TextDecoder();
// Streaming en tiempo real
while (true) {
const {done, value} = await reader.read();
if (done) break;
botDiv.textContent += decoder.decode(value, {stream:true});
chat.scrollTop = chat.scrollHeight;
}
botDiv.textContent += decoder.decode();
}
</script>
</body>
</html>
"""
# ==============================
# INICIAR SERVIDOR
# ==============================
if __name__ == "__main__":
    # Start the Flask server on all interfaces. The port is read from the
    # PORT environment variable and falls back to 7860 when it is unset.
    listen_port = int(os.getenv("PORT", 7860))
    app.run(port=listen_port, host="0.0.0.0")