PregoPal / modal_deploy /deploy.py
J.B-Lin
modal: ้™็บง GPU A100 -> T4 ่Š‚็œๆˆๆœฌ
2d207a7
Raw
History Blame Contribute Delete
13.6 kB
"""
PregoPal × MiniCPM-o-4_5 — Modal deployment (prebuilt llama-cpp-python)

Architecture:
    FastAPI (ASGI) ←→ llama-cpp-python (CUDA via pre-built wheel)
                            ↕
                  Modal Volume: GGUF models

Usage:
    pip install modal                  # install the Modal CLI
    modal token new                    # log in to Modal
    modal deploy modal_deploy.deploy   # deploy (~1 min)

Test:
    modal run modal_deploy.deploy::test_inference

API:
    POST /v1/chat/completions — OpenAI-compatible (supports streaming)
    POST /v1/completions      — Text completion
    POST /v1/embeddings       — Embeddings
    POST /v1/vision           — Multimodal (image + text)
    GET  /health              — Health check
    GET  /v1/models           — Model list
"""
import os
import modal
from modal import Image, App, Volume, asgi_app
# โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
# 1. IMAGE โ€” ้ข„็ผ–่ฏ‘ CUDA wheel (ไธไปŽๅคด็ผ–่ฏ‘๏ผŒๆž„ๅปบ < 1 min)
# โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
# Container image, built step by step: Debian slim with Python 3.11, the web
# stack, then llama-cpp-python resolved with an additional wheel index.
_image = Image.debian_slim(python_version="3.11")
_image = _image.pip_install(
    "fastapi",
    "uvicorn[standard]",
    "httpx",
    "numpy",
    "Pillow",
)
# NOTE(review): upstream llama-cpp-python documents its CUDA wheel index under
# a different host — confirm this URL really serves cu121 wheels, otherwise pip
# may fall back to the default index.
_image = _image.pip_install(
    "llama-cpp-python",
    extra_index_url="https://ggml-org.github.io/llama-cpp-python/whl/cu121",
)
# Import smoke test, run as an image build step.
_image = _image.run_commands(
    "python -c 'import llama_cpp; print(\"llama-cpp-python OK\")'",
)
# โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
# 2. CONSTANTS
# โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
# Path at which the model volume is mounted inside the container.
MODEL_DIR = "/models"
# Directory on the volume that holds the GGUF files.
MODEL_SUBDIR = MODEL_DIR + "/MiniCPM-o-4_5-gguf"
# File names below are joined onto MODEL_SUBDIR by get_model_paths().
MAIN_GGUF = "MiniCPM-o-4_5-Q4_K_M.gguf"
VISION_MMPROJ = "vision/MiniCPM-o-4_5-vision-F16.gguf"

# Named volume that stores the model files; created on first use if absent.
model_volume = Volume.from_name(
    "minicpm-o-4_5-models",
    create_if_missing=True,
)
app = App("prego-pal-minicpm")
def get_model_paths(base_dir: str) -> dict:
    """Build the main-model and vision-projector paths under *base_dir*.

    Each path is printed together with whether it currently exists as a
    regular file.  Nothing is raised for a missing file, so callers must
    check existence themselves before loading.

    Returns:
        A dict with the keys ``"main"`` and ``"vision"``.
    """
    resolved = {
        label: os.path.join(base_dir, filename)
        for label, filename in (("main", MAIN_GGUF), ("vision", VISION_MMPROJ))
    }
    for label in resolved:
        location = resolved[label]
        present = os.path.isfile(location)
        print(f"[PregoPal] {label}: {location} (exists={present})")
    return resolved
# โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
# 3. ASGI APP โ€” ๅคšๆจกๆ€ API
# โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
@app.function(
    image=_image,
    volumes={MODEL_DIR: model_volume},
    gpu="T4",
    timeout=1200,
    scaledown_window=300,
)
@modal.concurrent(max_inputs=10)
@asgi_app()
def serve():
    """Load the GGUF model and return the FastAPI app that exposes it.

    The model is loaded once, when this function runs; every endpoint closes
    over that single ``Llama`` instance.  Up to ten inputs may be in flight on
    one container (``max_inputs=10``), so all inference is serialized through
    ``llm_lock`` and executed in worker threads: requests cannot interleave on
    the shared model, and a long generation does not stall the event loop.

    Returns:
        The configured ``FastAPI`` application.

    Raises:
        Exception: whatever ``Llama(...)`` raises when the model cannot be
            loaded; the failure is logged and re-raised.
    """
    import asyncio
    import json
    import logging
    import threading

    from fastapi import FastAPI, Request
    from fastapi.responses import JSONResponse, StreamingResponse
    from fastapi.middleware.cors import CORSMiddleware
    from llama_cpp import Llama

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("prego-pal")

    web_app = FastAPI(title="PregoPal MiniCPM-o-4_5 API")
    # NOTE(review): wildcard origins combined with allow_credentials=True is
    # very permissive — tighten this if cookies or auth headers are ever used.
    web_app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # ── Model loading ──────────────────────────────────────────────
    paths = get_model_paths(MODEL_SUBDIR)
    model_path = paths["main"]
    vision_path = paths["vision"]
    # Captured once so that /health and /v1/vision report what was decided at
    # load time rather than re-probing the file system on every request.
    vision_enabled = os.path.isfile(vision_path)

    kwargs: dict = dict(
        model_path=model_path,
        n_gpu_layers=-1,
        n_ctx=8192,
        verbose=False,
        n_threads=os.cpu_count() or 4,
    )
    if vision_enabled:
        # NOTE(review): llama-cpp-python documents multimodal input through a
        # `chat_handler` built from the projector file; confirm the installed
        # version honours an `mmproj` keyword, otherwise /v1/vision is
        # effectively text-only.
        kwargs["mmproj"] = vision_path
        logger.info("[PregoPal] [OK] Vision mmproj enabled")
    else:
        logger.warning(
            f"[PregoPal] [WARN] mmproj not found at {vision_path} — vision disabled"
        )

    logger.info("[PregoPal] Loading model (30-90s)...")
    try:
        llm = Llama(**kwargs)
    except Exception as e:
        logger.error(f"[PregoPal] [FAIL] Failed to load model: {e}")
        raise
    logger.info("[PregoPal] [OK] Model loaded!")

    # One model instance is shared by all concurrent requests; this lock makes
    # sure only one generation drives it at a time.  It is only ever acquired
    # from worker threads, never from the event loop.
    llm_lock = threading.Lock()

    # ── Helpers ────────────────────────────────────────────────────
    async def read_json_object(request):
        """Parse the request body; return ``(body, None)`` or ``(None, error)``."""
        try:
            body = await request.json()
        except Exception as exc:
            logger.error(f"[PregoPal] JSON parse error: {exc}")
            return None, JSONResponse({"error": "Invalid JSON"}, status_code=400)
        if not isinstance(body, dict):
            # The handlers use body.get(...); a JSON array/scalar would crash.
            return None, JSONResponse(
                {"error": "Request body must be a JSON object"},
                status_code=400,
            )
        return body, None

    def call_locked(fn, **params):
        """Run ``fn(**params)`` while holding the model lock."""
        with llm_lock:
            return fn(**params)

    async def run_inference(label, fn, **params):
        """Run a blocking model call off the event loop and wrap the outcome.

        Returns a ``JSONResponse`` with the model result, or a 500 response
        carrying the error message when the call raises.
        """
        try:
            result = await asyncio.to_thread(call_locked, fn, **params)
        except Exception as exc:
            logger.exception(f"[PregoPal] {label} error")
            return JSONResponse({"error": str(exc)}, status_code=500)
        return JSONResponse(result)

    def sse_events(**params):
        """Yield one SSE ``data:`` line per streamed chunk, then ``[DONE]``.

        This is deliberately a plain (sync) generator: StreamingResponse
        iterates sync iterators in a worker thread, so both the blocking
        token generation and the lock wait stay off the event loop.
        """
        try:
            with llm_lock:
                for chunk in llm.create_chat_completion(stream=True, **params):
                    yield f"data: {json.dumps(chunk)}\n\n"
        except Exception as exc:
            # Headers are already sent, so the error can only be reported
            # in-band as a final event.
            logger.exception("[PregoPal] Streaming chat completion error")
            yield f"data: {json.dumps({'error': str(exc)})}\n\n"
        yield "data: [DONE]\n\n"

    # ── Endpoints ──────────────────────────────────────────────────
    @web_app.post("/v1/chat/completions")
    async def chat_completions(request: Request):
        """Chat completion; streams SSE when the body sets ``stream``."""
        body, error = await read_json_object(request)
        if error is not None:
            return error
        params = dict(
            messages=body.get("messages", []),
            max_tokens=body.get("max_tokens", 512),
            temperature=body.get("temperature", 0.7),
            top_p=body.get("top_p", 0.9),
        )
        if body.get("stream", False):
            return StreamingResponse(
                sse_events(**params),
                media_type="text/event-stream",
            )
        return await run_inference(
            "Chat completion",
            llm.create_chat_completion,
            stream=False,
            **params,
        )

    @web_app.post("/v1/completions")
    async def completions(request: Request):
        """Plain text completion (non-streaming)."""
        body, error = await read_json_object(request)
        if error is not None:
            return error
        return await run_inference(
            "Completion",
            llm.create_completion,
            prompt=body.get("prompt", ""),
            max_tokens=body.get("max_tokens", 256),
            temperature=body.get("temperature", 0.7),
            stream=False,
        )

    @web_app.post("/v1/embeddings")
    async def embeddings(request: Request):
        """Embedding endpoint.

        NOTE(review): llama-cpp-python documents ``embedding=True`` at load
        time as a prerequisite for ``create_embedding``; the model above is
        loaded without it, so this endpoint is expected to answer with the
        500 error payload until that is decided — confirm.
        """
        body, error = await read_json_object(request)
        if error is not None:
            return error
        return await run_inference(
            "Embedding",
            llm.create_embedding,
            input=body.get("input", ""),
            model=body.get("model", "MiniCPM-o-4_5"),
        )

    @web_app.post("/v1/vision")
    async def vision(request: Request):
        """Multimodal inference: chat messages that carry base64 images."""
        body, error = await read_json_object(request)
        if error is not None:
            return error
        if not vision_enabled:
            return JSONResponse(
                {"error": "Vision mmproj not loaded"},
                status_code=400,
            )
        return await run_inference(
            "Vision",
            llm.create_chat_completion,
            messages=body.get("messages", []),
            max_tokens=body.get("max_tokens", 512),
            temperature=body.get("temperature", 0.7),
            stream=False,
        )

    @web_app.get("/health")
    async def health():
        """Report service status and the files visible in the model directory."""
        try:
            vol_files = os.listdir(MODEL_SUBDIR) if os.path.isdir(MODEL_SUBDIR) else []
        except OSError:
            vol_files = []
        return {
            "status": "ok",
            "model": "MiniCPM-o-4_5",
            "cuda": True,
            "vision": vision_enabled,
            "volume_files": vol_files,
        }

    @web_app.get("/v1/models")
    async def list_models():
        """Static OpenAI-style model listing."""
        return {
            "object": "list",
            "data": [{
                "id": "MiniCPM-o-4_5",
                "object": "model",
                "created": 1,
                "owned_by": "prego-pal",
            }],
        }

    @web_app.get("/")
    async def root():
        """Service descriptor with the list of available endpoints."""
        return {
            "service": "PregoPal MiniCPM-o-4_5 API",
            "version": "2.0.0",
            "model": MAIN_GGUF,
            "endpoints": {
                "chat": "POST /v1/chat/completions",
                "completions": "POST /v1/completions",
                "embeddings": "POST /v1/embeddings",
                "vision": "POST /v1/vision (多模态)",
                "models": "GET /v1/models",
                "health": "GET /health",
            },
        }

    return web_app
# โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
# 4. MODEL UPLOAD ๆŒ‡ๅผ•
# โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
@app.function(
    image=_image,
    volumes={MODEL_DIR: model_volume},
    timeout=3600,
)
def upload_models():
    """Print upload instructions and report which model files are on the volume.

    Despite its name this function uploads nothing: it prints the
    ``modal volume`` commands to run locally, then checks whether the main
    GGUF and the vision projector exist under ``MODEL_SUBDIR`` and prints
    their sizes.
    """

    def report(relative_name):
        """Print one status line for a model file; return True if it exists."""
        path = os.path.join(MODEL_SUBDIR, relative_name)
        present = os.path.isfile(path)
        size = os.path.getsize(path) if present else "N/A"
        print(f" {relative_name}: [{'OK' if present else 'FAIL'}] ({size} bytes)")
        return present

    print("=" * 60)
    print("[UPLOAD] 上传模型至 Modal Volume 指引:")
    print()
    print(" modal volume put minicpm-o-4_5-models \\")
    print(" ./models/MiniCPM-o-4_5-gguf /MiniCPM-o-4_5-gguf")
    print()
    print(" # 验证:")
    print(" modal volume ls minicpm-o-4_5-models /MiniCPM-o-4_5-gguf")
    print("=" * 60)

    # Check which files are already present on the volume.
    print("\n当前 Volume 状态:")
    main_ok = report(MAIN_GGUF)
    vision_ok = report(VISION_MMPROJ)
    if main_ok and vision_ok:
        print("\n[OK] 模型就绪,可以执行 deploy!")
    else:
        print("\n[FAIL] 模型上传不完整,请重新上传")
# โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
# 5. TEST INFERENCE
# โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
@app.function(
    image=_image,
    volumes={MODEL_DIR: model_volume},
    gpu="T4",
    timeout=600,
)
def test_inference():
    """Smoke-test inference on Modal: load the model and run two short chats.

    Prints the load time, the model file sizes, and for each prompt (one
    Chinese, one English) the elapsed time and the generated text.  When the
    main model file is missing, a failure line is printed and the function
    returns without loading anything.
    """
    import time

    from llama_cpp import Llama

    print("[PregoPal] ========== TEST INFERENCE ==========")
    paths = get_model_paths(MODEL_SUBDIR)
    main_path = paths["main"]
    vision_path = paths["vision"]
    if not os.path.isfile(main_path):
        print(f"[PregoPal] [FAIL] Model not found at {main_path}")
        return
    has_vision = os.path.isfile(vision_path)

    kwargs = dict(
        model_path=main_path,
        n_gpu_layers=-1,
        n_ctx=4096,
        verbose=False,
    )
    if has_vision:
        kwargs["mmproj"] = vision_path

    # perf_counter is monotonic, so the measured durations cannot be skewed
    # by wall-clock adjustments.
    load_started = time.perf_counter()
    print("[PregoPal] Loading model...")
    llm = Llama(**kwargs)
    load_time = time.perf_counter() - load_started
    print(f"[PregoPal] [OK] Model loaded in {load_time:.1f}s")
    print(f" Main: {os.path.getsize(main_path) / 1024**3:.1f} GiB")
    if has_vision:
        print(f" Vision: {os.path.getsize(vision_path) / 1024**3:.1f} GiB")

    # (banner, prompt) pairs: one Chinese prompt, one English prompt.
    cases = (
        ("[Test 1] 中文提问...", "用中文说你好,不超过10个字"),
        ("[Test 2] 英文提问...", "What is the capital of France? Answer in 5 words."),
    )
    for banner, prompt in cases:
        print(f"\n{banner}")
        started = time.perf_counter()
        result = llm.create_chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=30,
            temperature=0.1,
        )
        elapsed = time.perf_counter() - started
        # An empty "choices" list must not raise; treat it as empty output.
        choices = result.get("choices") or [{}]
        content = choices[0].get("message", {}).get("content", "")
        print(f"Response ({elapsed:.1f}s): {content}")

    print(f"\n{'='*50}")
    print(f"[OK] Test complete! Loading: {load_time:.1f}s")
    print(f"{'='*50}")