# ============================== # IMPORTS # ============================== import os import warnings from flask import Flask, request, Response # Servidor web y streaming from transformers import AutoTokenizer, AutoModel, TextIteratorStreamer # Modelo IA import torch # Motor de ejecución del modelo import threading # Para ejecutar el modelo en segundo plano import json # Para manejar datos JSON # ============================== # CONFIGURACIÓN DEL MODELO # ============================== # Load model directly model = AutoModel.from_pretrained("unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF", dtype="auto") #MODEL_NAME = "microsoft/phi-2" # Modelo que vamos a usar tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) # Descarga el tokenizador model = AutoModelForCausalLM.from_pretrained( MODEL_NAME, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32 # Usa GPU si existe ) device = "cuda" if torch.cuda.is_available() else "cpu" # Detecta GPU model.to(device) # Mueve el modelo al dispositivo # ============================== # CREAR SERVIDOR FLASK # ============================== app = Flask(__name__) # Inicializa el servidor # ============================== # FUNCION STREAMING IA # ============================== def generate_stream(prompt): """ Genera texto en streaming token por token """ inputs = tokenizer(prompt, return_tensors="pt").to(device) # Convierte texto en tensores streamer = TextIteratorStreamer( tokenizer, skip_prompt=True, # No repite el prompt skip_special_tokens=False # Quita tokens especiales ) # Ejecuta el modelo en segundo plano thread = threading.Thread( target=model.generate, kwargs={ "inputs": inputs["input_ids"], # Texto convertido "attention_mask": inputs["attention_mask"], "max_new_tokens": 300, # Máximo de tokens a generar "temperature": 0.5, # Creatividad "top_p": 0.5, # Diversidad "do_sample": False, # Activa aleatoriedad "streamer": streamer # Activa streaming } ) thread.start() # Inicia generación # Devuelve token por token en tiempo real for new_text in streamer: yield new_text # ============================== # API CHAT (POST /chat) # ============================== @app.route("/chat", methods=["POST"]) def chat(): """ Endpoint que recibe mensaje y responde en streaming """ data = request.json # Lee JSON enviado user_message = data.get("message", "") # Extrae mensaje # Prompt multi-lenguaje prompt = f""" You are a professional AI assistant. Detect the language of the user automatically and answer in the same language. Be clear and structured. User: {user_message} Assistant: """ return Response( generate_stream(prompt), mimetype="text/plain" # Streaming tipo texto ) # ============================== # FRONTEND CHAT ESTILO CHATGPT # ============================== @app.route("/") def index(): """ Devuelve HTML completo del chat """ return """