"""
Modal LLM inference endpoint for Retro Alpha.

Serves the fine-tuned Nemotron-3 GGUF on A10G GPU for fast inference.
Kept warm (scaledown_window=3600) to avoid cold starts.

Usage:
    modal serve modal_app.py    # local tunnel for testing
    modal deploy modal_app.py   # deploy to Modal cloud

After deployment, set MODAL_INFERENCE_URL to the returned URL in .env.
"""

import os
import time
from pathlib import Path

import modal
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse

MODEL_REPO = os.getenv("MODEL_REPO", "sankalphs/retro-alpha-nemotron-gguf")
MODEL_FILE = os.getenv("MODEL_FILE", "NVIDIA-Nemotron-3-Nano-4B.Q4_K_M.gguf")
MODEL_DIR = "/models"

# A file smaller than this is treated as missing/truncated and re-downloaded.
MIN_MODEL_BYTES = 100_000_000
MAX_DOWNLOAD_ATTEMPTS = 3

# NOTE(review): `.env(...)` is applied *after* `.pip_install(...)`, so
# CMAKE_ARGS / FORCE_CMAKE are presumably not set while llama-cpp-python is
# being compiled, which would yield a CPU-only build. Simply moving `.env`
# above `.pip_install` is likely not enough either, because debian_slim does
# not ship the CUDA toolkit. Confirm the intended build and base image.
image = (
    modal.Image.debian_slim(python_version="3.11")
    .apt_install(
        "git",
        "curl",
        "cmake",
        "build-essential",
        "libgomp1",
        "libgfortran5",
    )
    .pip_install(
        "fastapi",
        "llama-cpp-python",
        "python-dotenv",
        "huggingface_hub",
    )
    .env({
        "CMAKE_ARGS": "-DGGML_CUDA=on",
        "FORCE_CMAKE": "1",
    })
    .run_commands("mkdir -p " + MODEL_DIR)
)

app = modal.App("retro-alpha-inference", image=image)

hf_secret = modal.Secret.from_name("huggingface-secret")


def _bad_request(message: str) -> JSONResponse:
    """Return a 400 JSON response of the form ``{"error": message}``."""
    return JSONResponse({"error": message}, status_code=400)


def _download_model() -> None:
    """Download MODEL_FILE from MODEL_REPO into MODEL_DIR, with retries.

    Sleeps 2**attempt seconds between failed attempts.

    Raises:
        RuntimeError: if every attempt fails, chained to the last error.
    """
    # Imported lazily: the package is only needed when a download happens.
    from huggingface_hub import hf_hub_download

    last_error: Exception | None = None
    for attempt in range(1, MAX_DOWNLOAD_ATTEMPTS + 1):
        try:
            print(
                f"Downloading {MODEL_FILE} from {MODEL_REPO} "
                f"(attempt {attempt}/{MAX_DOWNLOAD_ATTEMPTS})..."
            )
            # `local_dir_use_symlinks` is deprecated in huggingface_hub and
            # is therefore no longer passed.
            hf_hub_download(
                repo_id=MODEL_REPO,
                filename=MODEL_FILE,
                local_dir=MODEL_DIR,
            )
        except Exception as e:  # retry boundary: any failure is retried
            last_error = e
            print(f"Download attempt {attempt} failed: {e}")
            if attempt < MAX_DOWNLOAD_ATTEMPTS:
                time.sleep(2 ** attempt)
        else:
            break
    else:
        # Every attempt failed: stop here instead of letting the model
        # loader fail later on a missing file.
        raise RuntimeError(
            f"Could not download {MODEL_FILE} from {MODEL_REPO} "
            f"after {MAX_DOWNLOAD_ATTEMPTS} attempts"
        ) from last_error


@app.cls(gpu="A10G", scaledown_window=3600, secrets=[hf_secret])
class Nemotron:
    """Modal class that loads the GGUF model and serves it over FastAPI."""

    @modal.enter()
    def load(self):
        """Ensure the model file is present, then load it into ``self.llm``.

        Reads LLAMA_GPU_LAYERS (default -1) and LLAMA_CTX (default 2048)
        from the environment.

        Raises:
            RuntimeError: if the model file has to be downloaded and all
                download attempts fail.
        """
        local_path = Path(MODEL_DIR) / MODEL_FILE

        if local_path.exists() and local_path.stat().st_size > MIN_MODEL_BYTES:
            size_gb = local_path.stat().st_size / 1e9
            print(f"Model cached: {local_path} ({size_gb:.2f} GB)")
        else:
            _download_model()

        from llama_cpp import Llama

        n_gpu = int(os.getenv("LLAMA_GPU_LAYERS", "-1"))
        n_ctx = int(os.getenv("LLAMA_CTX", "2048"))
        # Log the context size actually used rather than a fixed literal.
        print(f"Loading {local_path} (n_gpu_layers={n_gpu}, n_ctx={n_ctx})...")
        self.llm = Llama(
            model_path=str(local_path),
            n_ctx=n_ctx,
            n_gpu_layers=n_gpu,
            verbose=False,
        )
        print("Model loaded.")

    def _build_app(self):
        """Build the FastAPI app exposing ``POST /chat`` and ``GET /health``."""
        web_app = FastAPI()

        @web_app.post("/chat")
        async def chat(request: Request):
            """Run a chat completion for the JSON request body.

            Body fields: ``messages`` (required, non-empty list),
            ``max_tokens`` (default 256) and ``temperature`` (default 0.7).
            Malformed input yields a 400 response with an ``error`` field.
            """
            try:
                data = await request.json()
            except ValueError:
                # Covers JSON decoding errors and undecodable request bytes.
                return _bad_request("Request body must be valid JSON")
            if not isinstance(data, dict):
                return _bad_request("Request body must be a JSON object")

            messages = data.get("messages", [])
            if not messages:
                return _bad_request("No messages provided")
            if not isinstance(messages, list):
                return _bad_request("messages must be a list")

            try:
                max_tokens = int(data.get("max_tokens", 256))
                temperature = float(data.get("temperature", 0.7))
            except (TypeError, ValueError):
                return _bad_request(
                    "max_tokens and temperature must be numeric"
                )

            # NOTE(review): this call is synchronous inside an async handler,
            # so other requests wait while a completion runs; confirm that
            # this serialisation is intended.
            result = self.llm.create_chat_completion(
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature,
            )
            return result

        @web_app.get("/health")
        async def health():
            """Report whether the model file exists on disk and its size."""
            local_path = Path(MODEL_DIR) / MODEL_FILE
            try:
                size_bytes = local_path.stat().st_size
            except OSError:
                size_bytes = None

            found = size_bytes is not None
            return {
                "status": "ok" if found else "model_missing",
                "model_path": str(local_path),
                "model_size_gb": round(size_bytes / 1e9, 2) if found else 0,
            }

        return web_app

    @modal.asgi_app()
    def fastapi_app(self):
        """Expose the FastAPI application as the Modal ASGI entry point."""
        return self._build_app()