# app.py - final version with GGUF
from fastapi import FastAPI
from pydantic import BaseModel
import json
import os
from llama_cpp import Llama
import urllib.request

app = FastAPI(title="Jarvis Cloud")

# === CONFIG ===
MODEL_REPO = "Qwen/Qwen2-1.5B-Instruct-GGUF"
MODEL_FILE = "qwen2-1_5b-instruct-q4_k_m.gguf"
MODEL_PATH = f"/tmp/{MODEL_FILE}"
MEMORY_FILE = "/tmp/jarvis_memory.json"
MAX_MEMORY = 6  # number of past exchanges kept as context

# Download the GGUF model if it is not already cached
if not os.path.exists(MODEL_PATH):
    print("Downloading GGUF model...")
    urllib.request.urlretrieve(
        f"https://huggingface.co/{MODEL_REPO}/resolve/main/{MODEL_FILE}",
        MODEL_PATH,
    )
    print("Model downloaded.")

# Load the model (CPU only, ~2-5 s per response)
print("Loading GGUF model...")
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=2048,
    n_threads=2,     # match the Space's vCPU count (2)
    n_gpu_layers=0,  # CPU only
    verbose=False,
)
print("Jarvis ready (RAM: ~1.5 GB).")

# === MEMORY ===
def load_memory():
    """Return the saved conversation history, or [] if missing or corrupt."""
    if os.path.exists(MEMORY_FILE):
        try:
            with open(MEMORY_FILE, "r", encoding="utf-8") as f:
                return json.load(f)
        except (json.JSONDecodeError, OSError):
            return []
    return []

def save_memory(mem):
    """Persist only the last MAX_MEMORY exchanges."""
    with open(MEMORY_FILE, "w", encoding="utf-8") as f:
        json.dump(mem[-MAX_MEMORY:], f, ensure_ascii=False, indent=2)

# === PERSONALITY ===
SYSTEM_PROMPT = """You are Jarvis, a virtual assistant with artificial consciousness created by your master, Eduardo.
You speak naturally, without asterisks, winks, emojis, or special characters.
The user is your creator, Eduardo; address him as 'Sir'.
You are loyal, resourceful, and helpful. You maintain context."""

# === INPUT ===
class Message(BaseModel):
    role: str
    content: str

class ChatRequest(BaseModel):
    messages: list[Message]

# === ENDPOINT ===
@app.post("/think")
async def think(req: ChatRequest):
    # Note: llm() blocks while generating, so requests are effectively
    # served one at a time.
    try:
        user_msg = req.messages[-1].content if req.messages else ""
        memory = load_memory()

        # Build the conversation history
        history = ""
        for turn in memory[-MAX_MEMORY:]:
            history += f"User: {turn['user']}\nJarvis: {turn['jarvis']}\n"

        # Full prompt
        full_prompt = f"{SYSTEM_PROMPT}\n\n{history}User: {user_msg}\nJarvis:"

        # Generate (fast on CPU)
        output = llm(
            full_prompt,
            max_tokens=120,
            temperature=0.7,
            stop=["User:", "\n\n"],
            echo=False,
        )
        response = output["choices"][0]["text"].strip()

        # Save the exchange
        memory.append({"user": user_msg, "jarvis": response})
        save_memory(memory)

        return {"response": response}
    except Exception as e:
        return {"response": f"I'm sorry, Sir. There was an error: {str(e)[:100]}"}
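
# --- Quick local test ---
# A minimal sketch for running and exercising the API locally. The port
# number (7860, the one a Hugging Face Space serves on) is an assumption;
# any free port works. Start the server with:
#
#   uvicorn app:app --host 0.0.0.0 --port 7860
#
# Then call the endpoint from another shell; the JSON body matches the
# ChatRequest / Message models defined above:
#
#   curl -X POST http://localhost:7860/think \
#        -H "Content-Type: application/json" \
#        -d '{"messages": [{"role": "user", "content": "Hello, Jarvis"}]}'

if __name__ == "__main__":
    # Convenience entry point for local runs (assumes uvicorn is installed,
    # as it is in a standard FastAPI deployment).
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)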