Spaces:
Running
Running
| """ | |
| Modal LLM inference endpoint for Retro Alpha. | |
| Serves the fine-tuned Nemotron-3 GGUF on A10G GPU for fast inference. | |
| Kept warm (scaledown_window=3600) to avoid cold starts. | |
| Usage: | |
| modal serve modal_app.py # local tunnel for testing | |
| modal deploy modal_app.py # deploy to Modal cloud | |
| After deployment, set MODAL_INFERENCE_URL to the returned URL in .env. | |
| """ | |
| import os | |
| import modal | |
| from fastapi import FastAPI, Request | |
| from fastapi.responses import JSONResponse | |
# Hugging Face repository and file name of the GGUF weights. Both can be
# overridden through environment variables of the same name.
MODEL_REPO = os.getenv("MODEL_REPO", "sankalphs/retro-alpha-nemotron-gguf")
MODEL_FILE = os.getenv("MODEL_FILE", "NVIDIA-Nemotron-3-Nano-4B.Q4_K_M.gguf")
# Directory inside the container where the model file is looked up and, if
# absent, downloaded to.
MODEL_DIR = "/models"

# Container image: Debian slim with Python 3.11, a C/C++ toolchain (cmake,
# build-essential) for compiling llama-cpp-python, and the Python packages the
# service imports at runtime.
#
# NOTE(review): `.env(...)` is chained AFTER `.pip_install(...)`. Image steps
# are applied in order, so CMAKE_ARGS / FORCE_CMAKE are presumably not set
# while llama-cpp-python is being built, which would yield a CPU-only build
# that ignores n_gpu_layers. Moving `.env(...)` ahead of `.pip_install(...)`
# alone is likely not enough: debian_slim does not appear to ship the CUDA
# toolkit, so a CUDA build would probably also need a CUDA "devel" base image.
# Confirm on a deployed container before changing the order.
image = (
    modal.Image.debian_slim(python_version="3.11")
    .apt_install("git", "curl", "cmake", "build-essential", "libgomp1", "libgfortran5")
    .pip_install(
        "fastapi",
        "llama-cpp-python",
        "python-dotenv",
        "huggingface_hub",
    )
    .env({
        "CMAKE_ARGS": "-DGGML_CUDA=on",
        "FORCE_CMAKE": "1",
    })
    # Make sure the model directory exists in the image.
    .run_commands("mkdir -p " + MODEL_DIR)
)

# Modal application; `image` is the default image for everything it runs.
app = modal.App("retro-alpha-inference", image=image)
# Named Modal secret "huggingface-secret"; presumably carries the Hugging Face
# token used for the model download -- confirm in the Modal dashboard.
hf_secret = modal.Secret.from_name("huggingface-secret")
# Registration follows the module docstring (A10G GPU, scaledown_window=3600);
# the Hugging Face secret is attached so the model download can authenticate.
@app.cls(gpu="A10G", scaledown_window=3600, secrets=[hf_secret])
class Nemotron:
    """Modal container class that loads the GGUF model once and serves it over HTTP.

    Lifecycle:
      * ``load`` runs when the container starts and populates ``self.llm``.
      * ``fastapi_app`` exposes the FastAPI application built by ``_build_app``.
    """

    @modal.enter()
    def load(self):
        """Ensure the model file is present locally, then load it with llama.cpp.

        The file is downloaded from ``MODEL_REPO`` (up to three attempts with
        exponential back-off) unless a copy larger than 100 MB already exists
        under ``MODEL_DIR``.

        Environment variables:
            LLAMA_GPU_LAYERS: value passed as ``n_gpu_layers`` (default ``-1``).
            LLAMA_CTX: context window size passed as ``n_ctx`` (default ``2048``).

        Raises:
            RuntimeError: if every download attempt fails.
            FileNotFoundError: if the download reports success but the expected
                file is not at ``MODEL_DIR/MODEL_FILE``.
        """
        import time
        from pathlib import Path

        local_path = Path(MODEL_DIR) / MODEL_FILE
        # The size threshold guards against treating a tiny/truncated file as a
        # valid cached model.
        if local_path.exists() and local_path.stat().st_size > 100_000_000:
            size_gb = local_path.stat().st_size / 1e9
            print(f"Model cached: {local_path} ({size_gb:.2f} GB)")
        else:
            from huggingface_hub import hf_hub_download

            max_attempts = 3
            for attempt in range(1, max_attempts + 1):
                try:
                    print(
                        f"Downloading {MODEL_FILE} from {MODEL_REPO} "
                        f"(attempt {attempt}/{max_attempts})..."
                    )
                    # `local_dir_use_symlinks` is deprecated in huggingface_hub
                    # and no longer needed: with `local_dir` set, the file is
                    # written to that directory.
                    hf_hub_download(
                        repo_id=MODEL_REPO,
                        filename=MODEL_FILE,
                        local_dir=MODEL_DIR,
                    )
                    break
                except Exception as e:
                    print(f"Download attempt {attempt} failed: {e}")
                    if attempt == max_attempts:
                        # Fail here with the real cause instead of falling
                        # through to a confusing model-load error below.
                        raise RuntimeError(
                            f"Could not download {MODEL_FILE} from {MODEL_REPO} "
                            f"after {max_attempts} attempts"
                        ) from e
                    time.sleep(2 ** attempt)

            if not local_path.is_file():
                raise FileNotFoundError(
                    f"Download finished but {local_path} does not exist"
                )

        from llama_cpp import Llama

        n_gpu = int(os.getenv("LLAMA_GPU_LAYERS", "-1"))
        n_ctx = int(os.getenv("LLAMA_CTX", "2048"))
        # Log the values actually used rather than a hard-coded context size.
        print(f"Loading {local_path} (n_gpu_layers={n_gpu}, n_ctx={n_ctx})...")
        self.llm = Llama(
            model_path=str(local_path),
            n_ctx=n_ctx,
            n_gpu_layers=n_gpu,
            verbose=False,
        )
        print("Model loaded.")

    def _build_app(self):
        """Build the FastAPI application with the chat and health endpoints.

        Returns:
            FastAPI: application with both handlers registered.
        """
        web_app = FastAPI()

        # NOTE(review): the route paths are assumed from the handler names --
        # confirm they match what the client behind MODAL_INFERENCE_URL calls.
        @web_app.post("/chat")
        async def chat(request: Request):
            """Run a chat completion.

            Expects a JSON object with ``messages`` (required, non-empty),
            ``max_tokens`` (default 256) and ``temperature`` (default 0.7).
            Returns the llama.cpp completion result, or a 400 JSON error for a
            malformed request.
            """
            try:
                data = await request.json()
            except ValueError:
                # Covers JSON and text-decoding errors in the request body.
                return JSONResponse(
                    {"error": "Request body must be valid JSON"}, status_code=400
                )
            if not isinstance(data, dict):
                return JSONResponse(
                    {"error": "Request body must be a JSON object"}, status_code=400
                )

            messages = data.get("messages", [])
            try:
                max_tokens = int(data.get("max_tokens", 256))
                temperature = float(data.get("temperature", 0.7))
            except (TypeError, ValueError):
                return JSONResponse(
                    {"error": "max_tokens and temperature must be numeric"},
                    status_code=400,
                )
            if not messages:
                return JSONResponse({"error": "No messages provided"}, status_code=400)

            # create_chat_completion is a synchronous call made from an async
            # handler, so the event loop is blocked while it generates.
            result = self.llm.create_chat_completion(
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature,
            )
            return result

        @web_app.get("/health")
        async def health():
            """Report whether the model file is present on disk, and its size."""
            from pathlib import Path

            local_path = Path(MODEL_DIR) / MODEL_FILE
            # Check existence once so status and size cannot disagree.
            present = local_path.exists()
            return {
                "status": "ok" if present else "model_missing",
                "model_path": str(local_path),
                "model_size_gb": round(local_path.stat().st_size / 1e9, 2) if present else 0,
            }

        return web_app

    @modal.asgi_app()
    def fastapi_app(self):
        """Return the ASGI application that Modal serves for this class."""
        return self._build_app()