import os
import sys
import torch
import pickle
from fastapi import FastAPI
from fastapi.responses import HTMLResponse
from pydantic import BaseModel
from huggingface_hub import snapshot_download
import uvicorn

# ======================
# DEVICE CONFIGURATION (GPU/CPU)
# ======================
# Automatically detect whether an NVIDIA GPU is available
if torch.cuda.is_available():
    DEVICE = "cuda"
    print("✅ NVIDIA GPU detected. Using CUDA.")
else:
    DEVICE = "cpu"
    print("⚠️ No GPU detected. Using CPU (may be slower).")

MODEL_REPO = "teszenofficial/mtptz"

# ======================
# MODEL DOWNLOAD
# ======================
print("--- MTP 1.1 SYSTEM ---")
print(f"Downloading/verifying model from {MODEL_REPO}...")
repo_path = snapshot_download(
    repo_id=MODEL_REPO,
    repo_type="model",
    local_dir="mtptz_repo"
)

# Make the downloaded repo importable (it ships model.py and tokenizer.py)
sys.path.insert(0, repo_path)

try:
    from model import MTPMiniModel
    from tokenizer import MTPTokenizer
except ImportError:
    print("Warning: check the model repo's file structure.")
    raise  # the server cannot run without these classes

# ======================
# MODEL LOADING
# ======================
print("Loading model into memory...")
with open(os.path.join(repo_path, "mtp_mini.pkl"), "rb") as f:
    model_data = pickle.load(f)

tokenizer = MTPTokenizer(
    os.path.join(repo_path, "mtp_tokenizer.model")
)

config = model_data["config"]
model = MTPMiniModel(
    vocab_size=model_data["vocab_size"],
    d_model=config["model"]["d_model"],
    n_layers=config["model"]["n_layers"],
    n_heads=config["model"]["n_heads"],
    d_ff=config["model"]["d_ff"],
    max_seq_len=config["model"]["max_seq_len"],
    dropout=0.0  # no dropout at inference time
)

# Load the weights and move the model to the selected device
model.load_state_dict(model_data["model_state_dict"])
model.to(DEVICE)
model.eval()

print(f"🚀 MTP 1.1 ready and running on: {DEVICE.upper()}")

# ======================
# FASTAPI API
# ======================
app = FastAPI(title="MTP 1.1 API")

class Prompt(BaseModel):
    text: str

@app.post("/generate")
def generate(prompt: Prompt):
    user_input = prompt.text.strip()
    if not user_input:
        return {"reply": ""}

    # Wrap the input in the Spanish instruction template the model was trained on
    full_prompt = f"### Instrucción:\n{user_input}\n\n### Respuesta:\n"
    tokens = [tokenizer.bos_id()] + tokenizer.encode(full_prompt)

    # IMPORTANT: move the inputs to the same device as the model
    input_ids = torch.tensor([tokens], device=DEVICE)

    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            max_new_tokens=150,
            temperature=0.7,
            top_k=50,
            top_p=0.9
        )

    # Keep only the newly generated tokens and cut at the end-of-sequence token
    gen_tokens = output_ids[0, len(tokens):].tolist()
    if tokenizer.eos_id() in gen_tokens:
        gen_tokens = gen_tokens[:gen_tokens.index(tokenizer.eos_id())]

    response = tokenizer.decode(gen_tokens).strip()
    # Trim any spillover into a new "###" section header
    if "###" in response:
        response = response.split("###")[0].strip()

    return {"reply": response}

# ======================
# WEB UI (IMPROVED FRONTEND)
# ======================
@app.get("/", response_class=HTMLResponse)
def chat_ui():
    return """