"""PregoPal × MiniCPM-o-4_5 — Modal deployment (pre-built llama-cpp-python).

Architecture:
    FastAPI (ASGI) ←→ llama-cpp-python (CUDA via pre-built wheel)
                          ↕
                  Modal Volume: GGUF models

Usage:
    pip install modal                    # install the Modal CLI
    modal token new                      # log in to Modal
    modal deploy modal_deploy.deploy     # deploy (~1 min)

Test:
    modal run modal_deploy.deploy::test_inference

API:
    POST /v1/chat/completions — OpenAI-compatible (supports streaming)
    POST /v1/completions      — text completion
    POST /v1/embeddings       — embeddings
    POST /v1/vision           — multimodal (image + text)
    GET  /health              — health check
    GET  /v1/models           — model list
"""
import os

import modal
from modal import Image, App, Volume, asgi_app

# ════════════════════════════════════════════════════════════════════
# 1. IMAGE — pre-built CUDA wheel (no source build; image builds in < 1 min)
# ════════════════════════════════════════════════════════════════════
# NOTE(review): the upstream llama-cpp-python README documents its wheel index
# under abetlen.github.io — confirm that the ggml-org URL below really serves
# cu121 wheels; if it does not, pip will not pick up a pre-built CUDA wheel.
_image = (
    Image.debian_slim(python_version="3.11")
    .pip_install("fastapi", "uvicorn[standard]", "httpx", "numpy", "Pillow")
    .pip_install(
        "llama-cpp-python",
        extra_index_url="https://ggml-org.github.io/llama-cpp-python/whl/cu121",
    )
    .run_commands(
        # Fail the image build early if the wheel cannot even be imported.
        "python -c 'import llama_cpp; print(\"llama-cpp-python OK\")'",
    )
)

# ════════════════════════════════════════════════════════════════════
# 2. CONSTANTS
# ════════════════════════════════════════════════════════════════════
MODEL_DIR = "/models"  # mount point of the Modal Volume inside the container
MODEL_SUBDIR = f"{MODEL_DIR}/MiniCPM-o-4_5-gguf"
MAIN_GGUF = "MiniCPM-o-4_5-Q4_K_M.gguf"
VISION_MMPROJ = "vision/MiniCPM-o-4_5-vision-F16.gguf"  # relative to MODEL_SUBDIR

model_volume = Volume.from_name("minicpm-o-4_5-models", create_if_missing=True)
app = App("prego-pal-minicpm")


def get_model_paths(base_dir: str) -> dict[str, str]:
    """Build the model file paths under *base_dir* and print whether each exists.

    The paths are not validated: a missing file is only reported on stdout, so
    callers must check ``os.path.isfile`` themselves before loading anything.

    Args:
        base_dir: Directory that is expected to hold the GGUF files.

    Returns:
        A dict with the keys ``"main"`` (path built from ``MAIN_GGUF``) and
        ``"vision"`` (path built from ``VISION_MMPROJ``).
    """
    paths = {
        "main": os.path.join(base_dir, MAIN_GGUF),
        "vision": os.path.join(base_dir, VISION_MMPROJ),
    }
    for key, path in paths.items():
        print(f"[PregoPal] {key}: {path} (exists={os.path.isfile(path)})")
    return paths


# ════════════════════════════════════════════════════════════════════
# 3. ASGI APP — multimodal API
# ════════════════════════════════════════════════════════════════════
@app.function(
    image=_image,
    volumes={MODEL_DIR: model_volume},
    gpu="T4",
    timeout=1200,
    scaledown_window=300,
)
@modal.concurrent(max_inputs=10)
@asgi_app()
def serve():
    """Load the GGUF model once and return the FastAPI app that serves it.

    Every handler hands its blocking llama-cpp-python call to a worker thread
    and holds one shared lock while that call runs. This keeps the event loop
    responsive and guarantees that the concurrent inputs allowed by
    ``@modal.concurrent`` never drive the single ``Llama`` instance at the
    same time.

    Returns:
        The configured FastAPI application.

    Raises:
        Exception: whatever ``Llama(...)`` raises when the model cannot be
            loaded; the error is logged and re-raised.
    """
    import asyncio
    import json
    import logging
    import traceback

    from fastapi import FastAPI, Request
    from fastapi.responses import StreamingResponse, JSONResponse
    from fastapi.middleware.cors import CORSMiddleware
    from llama_cpp import Llama

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("prego-pal")

    web_app = FastAPI(title="PregoPal MiniCPM-o-4_5 API")
    # NOTE(review): FastAPI's CORS docs say wildcard origins must not be
    # combined with allow_credentials=True. Left unchanged because existing
    # browser clients may rely on it — confirm and tighten.
    web_app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # ── Model Loading ──────────────────────────────────────────────
    paths = get_model_paths(MODEL_SUBDIR)
    model_path = paths["main"]
    vision_path = paths["vision"]

    llama_kwargs: dict = {
        "model_path": model_path,
        "n_gpu_layers": -1,  # offload every layer to the GPU
        "n_ctx": 8192,
        "verbose": False,
        "n_threads": os.cpu_count() or 4,
    }
    if os.path.isfile(vision_path):
        # NOTE(review): stock llama-cpp-python documents multimodal support via
        # ``chat_handler=<...ChatHandler(clip_model_path=...)>`` and its
        # ``Llama.__init__`` appears to swallow unknown keyword arguments, so
        # ``mmproj`` may be silently ignored — confirm vision actually works.
        llama_kwargs["mmproj"] = vision_path
        logger.info("[PregoPal] [OK] Vision mmproj enabled")
    else:
        logger.warning(f"[PregoPal] [WARN] mmproj not found at {vision_path} — vision disabled")

    logger.info("[PregoPal] Loading model (30-90s)...")
    try:
        llm = Llama(**llama_kwargs)
        logger.info("[PregoPal] [OK] Model loaded!")
    except Exception as e:
        logger.error(f"[PregoPal] [FAIL] Failed to load model: {e}")
        raise

    # ── Shared helpers ─────────────────────────────────────────────
    # Serializes all access to ``llm`` across the concurrent requests that
    # share this container.
    model_lock = asyncio.Lock()

    async def run_model(fn, **call_kwargs):
        """Run one blocking model call in a worker thread while holding the lock."""
        async with model_lock:
            return await asyncio.to_thread(fn, **call_kwargs)

    async def read_json_object(request):
        """Parse the request body; return ``(body, None)`` or ``(None, 400 response)``."""
        try:
            body = await request.json()
        except ValueError as e:  # JSONDecodeError / UnicodeDecodeError
            logger.error(f"[PregoPal] JSON parse error: {e}")
            return None, JSONResponse({"error": "Invalid JSON"}, status_code=400)
        if not isinstance(body, dict):
            # The handlers call ``body.get``; a JSON list/string/number would crash.
            return None, JSONResponse(
                {"error": "Request body must be a JSON object"}, status_code=400
            )
        return body, None

    def inference_error(label, exc):
        """Log *exc* with its traceback and build the 500 response (call inside ``except``)."""
        logger.error(f"[PregoPal] {label} error: {exc}\n{traceback.format_exc()}")
        return JSONResponse({"error": str(exc)}, status_code=500)

    async def sse_chat_stream(params):
        """Yield chat-completion chunks as server-sent events, ending with ``[DONE]``."""
        exhausted = object()  # sentinel: StopIteration must not cross the thread boundary
        try:
            # The lock is held for the whole generation so no other request
            # can touch the model between two chunks.
            async with model_lock:
                chunks = await asyncio.to_thread(
                    llm.create_chat_completion, stream=True, **params
                )
                while True:
                    chunk = await asyncio.to_thread(next, chunks, exhausted)
                    if chunk is exhausted:
                        break
                    yield f"data: {json.dumps(chunk)}\n\n"
        except Exception as e:
            # Headers are already sent, so report the failure in-band.
            logger.error(f"[PregoPal] Chat completion error: {e}\n{traceback.format_exc()}")
            yield f"data: {json.dumps({'error': str(e)})}\n\n"
        yield "data: [DONE]\n\n"

    # ── Endpoints ──────────────────────────────────────────────────
    @web_app.post("/v1/chat/completions")
    async def chat_completions(request: Request):
        """Chat completion; streams server-sent events when ``stream`` is true."""
        body, bad_request = await read_json_object(request)
        if bad_request is not None:
            return bad_request

        params = {
            "messages": body.get("messages", []),
            "max_tokens": body.get("max_tokens", 512),
            "temperature": body.get("temperature", 0.7),
            "top_p": body.get("top_p", 0.9),
        }
        if body.get("stream", False):
            return StreamingResponse(sse_chat_stream(params), media_type="text/event-stream")

        try:
            result = await run_model(llm.create_chat_completion, stream=False, **params)
        except Exception as e:
            return inference_error("Chat completion", e)
        return JSONResponse(result)

    @web_app.post("/v1/completions")
    async def completions(request: Request):
        """Plain text completion (never streamed)."""
        body, bad_request = await read_json_object(request)
        if bad_request is not None:
            return bad_request
        try:
            result = await run_model(
                llm.create_completion,
                prompt=body.get("prompt", ""),
                max_tokens=body.get("max_tokens", 256),
                temperature=body.get("temperature", 0.7),
                stream=False,
            )
        except Exception as e:
            return inference_error("Completion", e)
        return JSONResponse(result)

    @web_app.post("/v1/embeddings")
    async def embeddings(request: Request):
        """Embed the ``input`` field of the request body."""
        body, bad_request = await read_json_object(request)
        if bad_request is not None:
            return bad_request
        try:
            # NOTE(review): llama-cpp-python appears to require the model to be
            # created with ``embedding=True`` for this call — confirm; until
            # then a failure is reported as a JSON 500 rather than a crash.
            result = await run_model(
                llm.create_embedding,
                input=body.get("input", ""),
                model=body.get("model", "MiniCPM-o-4_5"),
            )
        except Exception as e:
            return inference_error("Embedding", e)
        return JSONResponse(result)

    @web_app.post("/v1/vision")
    async def vision(request: Request):
        """Multimodal inference: accepts chat messages carrying base64 images."""
        body, bad_request = await read_json_object(request)
        if bad_request is not None:
            return bad_request
        if not os.path.isfile(vision_path):
            return JSONResponse(
                {"error": "Vision mmproj not loaded"},
                status_code=400,
            )
        try:
            result = await run_model(
                llm.create_chat_completion,
                messages=body.get("messages", []),
                max_tokens=body.get("max_tokens", 512),
                temperature=body.get("temperature", 0.7),
                stream=False,
            )
        except Exception as e:
            return inference_error("Vision", e)
        return JSONResponse(result)

    @web_app.get("/health")
    async def health():
        """Report service status and the files found in the model directory."""
        try:
            vol_files = os.listdir(MODEL_SUBDIR) if os.path.isdir(MODEL_SUBDIR) else []
        except OSError:
            # Best effort: an unreadable volume is reported as empty.
            vol_files = []
        return {
            "status": "ok",
            "model": "MiniCPM-o-4_5",
            "cuda": True,  # hard-coded; not probed at runtime
            "vision": os.path.isfile(vision_path),
            "volume_files": vol_files,
        }

    @web_app.get("/v1/models")
    async def list_models():
        """Return the single served model in OpenAI list format."""
        return {
            "object": "list",
            "data": [{
                "id": "MiniCPM-o-4_5",
                "object": "model",
                "created": 1,
                "owned_by": "prego-pal",
            }],
        }

    @web_app.get("/")
    async def root():
        """Return service metadata and the endpoint index."""
        return {
            "service": "PregoPal MiniCPM-o-4_5 API",
            "version": "2.0.0",
            "model": MAIN_GGUF,
            "endpoints": {
                "chat": "POST /v1/chat/completions",
                "completions": "POST /v1/completions",
                "embeddings": "POST /v1/embeddings",
                "vision": "POST /v1/vision (多模态)",
                "models": "GET /v1/models",
                "health": "GET /health",
            },
        }

    return web_app


# ════════════════════════════════════════════════════════════════════
# 4. MODEL UPLOAD — instructions
# ════════════════════════════════════════════════════════════════════
@app.function(
    image=_image,
    volumes={MODEL_DIR: model_volume},
    timeout=3600,
)
def upload_models():
    """Print the upload instructions and report which model files the Volume holds."""
    print("=" * 60)
    print("[UPLOAD] 上传模型至 Modal Volume 指引:")
    print()
    print(" modal volume put minicpm-o-4_5-models \\")
    print(" ./models/MiniCPM-o-4_5-gguf /MiniCPM-o-4_5-gguf")
    print()
    print(" # 验证:")
    print(" modal volume ls minicpm-o-4_5-models /MiniCPM-o-4_5-gguf")
    print("=" * 60)

    # Check which of the expected files are already present in the volume.
    print("\n当前 Volume 状态:")
    all_present = True
    for name in (MAIN_GGUF, VISION_MMPROJ):
        path = os.path.join(MODEL_SUBDIR, name)
        present = os.path.isfile(path)
        size = os.path.getsize(path) if present else "N/A"
        print(f" {name}: [{'OK' if present else 'FAIL'}] ({size} bytes)")
        all_present = all_present and present

    if all_present:
        print("\n[OK] 模型就绪,可以执行 deploy!")
    else:
        print("\n[FAIL] 模型上传不完整,请重新上传")


# ════════════════════════════════════════════════════════════════════
# 5. TEST INFERENCE
# ════════════════════════════════════════════════════════════════════
@app.function(
    image=_image,
    volumes={MODEL_DIR: model_volume},
    gpu="T4",
    timeout=600,
)
def test_inference():
    """Smoke-test inference on Modal: load the model and run two short prompts.

    Prints the load time, the model file sizes and, for each prompt, the
    response together with its latency.

    Raises:
        FileNotFoundError: if the main GGUF file is not present in the volume,
            so that the run fails visibly instead of ending silently.
    """
    import time

    from llama_cpp import Llama

    print("[PregoPal] ========== TEST INFERENCE ==========")

    paths = get_model_paths(MODEL_SUBDIR)
    main_path = paths["main"]
    vision_path = paths["vision"]
    if not os.path.isfile(main_path):
        print(f"[PregoPal] [FAIL] Model not found at {main_path}")
        raise FileNotFoundError(main_path)
    has_vision = os.path.isfile(vision_path)

    llama_kwargs = {
        "model_path": main_path,
        "n_gpu_layers": -1,  # offload every layer to the GPU
        "n_ctx": 4096,
        "verbose": False,
    }
    if has_vision:
        # NOTE(review): stock llama-cpp-python documents multimodal support via
        # ``chat_handler=...``; ``mmproj`` may be silently ignored — confirm.
        llama_kwargs["mmproj"] = vision_path

    print("[PregoPal] Loading model...")
    started = time.perf_counter()  # monotonic clock for elapsed-time measurement
    llm = Llama(**llama_kwargs)
    load_time = time.perf_counter() - started

    print(f"[PregoPal] [OK] Model loaded in {load_time:.1f}s")
    print(f" Main: {os.path.getsize(main_path) / 1024**3:.1f} GiB")
    if has_vision:
        print(f" Vision: {os.path.getsize(vision_path) / 1024**3:.1f} GiB")

    # Test 1 asks in Chinese, test 2 in English.
    cases = (
        ("\n[Test 1] 中文提问...", "用中文说你好,不超过10个字"),
        ("\n[Test 2] 英文提问...", "What is the capital of France? Answer in 5 words."),
    )
    for banner, question in cases:
        print(banner)
        started = time.perf_counter()
        result = llm.create_chat_completion(
            messages=[{"role": "user", "content": question}],
            max_tokens=30,
            temperature=0.1,
        )
        elapsed = time.perf_counter() - started
        # Tolerate a missing or empty ``choices`` list instead of raising IndexError.
        choices = result.get("choices") or [{}]
        content = choices[0].get("message", {}).get("content", "")
        print(f"Response ({elapsed:.1f}s): {content}")

    print(f"\n{'='*50}")
    print(f"[OK] Test complete! Loading: {load_time:.1f}s")
    print(f"{'='*50}")