Spaces:
Runtime error
Runtime error
"""
PregoPal × MiniCPM-o-4_5 - Modal deployment (pre-built llama-cpp-python)

Architecture:
    FastAPI (ASGI) -- llama-cpp-python (CUDA via pre-built wheel)
          |
    Modal Volume: GGUF models

Usage:
    pip install modal                   # install the Modal CLI
    modal token new                     # log in to Modal
    modal deploy modal_deploy.deploy    # deploy (~1 min)

Test:
    modal run modal_deploy.deploy::test_inference

API:
    POST /v1/chat/completions - OpenAI-compatible (supports streaming)
    POST /v1/completions      - text completion
    POST /v1/embeddings       - embeddings
    POST /v1/vision           - multimodal (image + text)
    GET  /health              - health check
    GET  /v1/models           - model list
"""
| import os | |
| import modal | |
| from modal import Image, App, Volume, asgi_app | |
# ------------------------------------------------------------------
# 1. IMAGE - pre-built CUDA wheel (not compiled from source; build < 1 min)
# ------------------------------------------------------------------
# Container image: Debian slim + Python 3.11, the web stack, and a pre-built
# CUDA 12.1 wheel of llama-cpp-python.
#
# The extra index must be the wheel index published by the llama-cpp-python
# project itself (abetlen.github.io); pip only falls back to the plain PyPI
# package when the wheel cannot be found on the extra index.
#
# NOTE(review): debian_slim ships no CUDA runtime libraries of its own -
# confirm the cu121 wheel can load them in this image.
_image = (
    Image.debian_slim(python_version="3.11")
    .pip_install("fastapi", "uvicorn[standard]", "httpx", "numpy", "Pillow")
    .pip_install(
        "llama-cpp-python",
        extra_index_url="https://abetlen.github.io/llama-cpp-python/whl/cu121",
    )
    # Import check executed while the image is being built, so a broken
    # install fails the build instead of the first request.
    .run_commands(
        "python -c 'import llama_cpp; print(\"llama-cpp-python OK\")'",
    )
)
# ------------------------------------------------------------------
# 2. CONSTANTS
# ------------------------------------------------------------------
# Root directory for model files inside the container.
# NOTE(review): presumably the path `model_volume` is mounted at - the mount
# itself is not visible in this file; confirm.
MODEL_DIR = "/models"

# Directory holding the MiniCPM-o-4_5 GGUF files, and the file names below it.
MODEL_SUBDIR = MODEL_DIR + "/MiniCPM-o-4_5-gguf"
MAIN_GGUF = "MiniCPM-o-4_5-Q4_K_M.gguf"
VISION_MMPROJ = "vision/MiniCPM-o-4_5-vision-F16.gguf"

# Named Modal Volume for the model files (created on first use) and the
# Modal application object.
model_volume = Volume.from_name("minicpm-o-4_5-models", create_if_missing=True)
app = App("prego-pal-minicpm")
def get_model_paths(base_dir: str) -> dict:
    """Build the main-model and vision-projector paths under *base_dir*.

    For each path a line is printed stating whether a regular file exists
    there. Missing files are only reported; nothing is raised.

    Args:
        base_dir: Directory that contains the GGUF files.

    Returns:
        A dict with the keys ``"main"`` and ``"vision"`` mapping to the
        joined file paths.
    """
    paths = {
        "main": os.path.join(base_dir, MAIN_GGUF),
        "vision": os.path.join(base_dir, VISION_MMPROJ),
    }
    for label in paths:
        location = paths[label]
        present = os.path.isfile(location)
        print(f"[PregoPal] {label}: {location} (exists={present})")
    return paths
# ------------------------------------------------------------------
# 3. ASGI APP - multimodal API
# ------------------------------------------------------------------
def serve():
    """Build the FastAPI application that serves the model via llama-cpp-python.

    The GGUF model under ``MODEL_SUBDIR`` is loaded once, while this function
    runs. The following routes are registered on the returned application:

        POST /v1/chat/completions   chat completion (optionally streamed as SSE)
        POST /v1/completions        text completion
        POST /v1/embeddings         embeddings
        POST /v1/vision             chat completion for image + text messages
        GET  /health                status and model-directory listing
        GET  /v1/models             model list
        GET  /                      service description

    Every POST route answers 400 with ``{"error": "Invalid JSON"}`` when the
    body is not a JSON object, and 500 with ``{"error": <message>}`` when the
    model call raises.

    Returns:
        The configured ``FastAPI`` instance.

    Raises:
        Exception: whatever ``Llama(...)`` raises when the model cannot be
            loaded (logged, then re-raised).
    """
    # NOTE(review): no Modal decorators (app.function / asgi_app) are visible
    # on this function in the reviewed source, although `asgi_app` is imported
    # at module level - confirm they were not lost.
    import json
    import logging
    import traceback

    from fastapi import FastAPI, Request
    from fastapi.responses import StreamingResponse, JSONResponse
    from fastapi.middleware.cors import CORSMiddleware
    from llama_cpp import Llama

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("prego-pal")

    web_app = FastAPI(title="PregoPal MiniCPM-o-4_5 API")
    # NOTE(review): wildcard origins combined with allow_credentials=True is a
    # permissive CORS setup - confirm that credentialed cross-origin requests
    # are really required.
    web_app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # -- Model loading -------------------------------------------------
    paths = get_model_paths(MODEL_SUBDIR)
    model_path = paths["main"]
    vision_path = paths["vision"]

    kwargs: dict = dict(
        model_path=model_path,
        n_gpu_layers=-1,
        n_ctx=8192,
        verbose=False,
        n_threads=os.cpu_count() or 4,
    )
    if os.path.isfile(vision_path):
        # NOTE(review): confirm that Llama() honours an `mmproj` keyword;
        # llama-cpp-python documents multimodal support through a chat
        # handler constructed with `clip_model_path`.
        kwargs["mmproj"] = vision_path
        logger.info("[PregoPal] [OK] Vision mmproj enabled")
    else:
        logger.warning(f"[PregoPal] [WARN] mmproj not found at {vision_path} โ vision disabled")

    logger.info("[PregoPal] Loading model (30-90s)...")
    try:
        llm = Llama(**kwargs)
        logger.info("[PregoPal] [OK] Model loaded!")
    except Exception as e:
        logger.error(f"[PregoPal] [FAIL] Failed to load model: {e}")
        raise

    # -- Shared helpers ------------------------------------------------
    def _error(message, status_code):
        """Return the JSON error envelope used by every endpoint."""
        return JSONResponse({"error": message}, status_code=status_code)

    async def _read_body(request):
        """Return the request's JSON object, or None if it is not one."""
        try:
            body = await request.json()
        except Exception as e:
            logger.error(f"[PregoPal] JSON parse error: {e}")
            return None
        if not isinstance(body, dict):
            # Valid JSON such as a list or a string has no .get(); treat it
            # like malformed input instead of failing later.
            logger.error("[PregoPal] JSON parse error: body is not a JSON object")
            return None
        return body

    def _infer(label, method, **params):
        """Call one llama-cpp-python method and wrap result or error as JSON.

        The call is synchronous and runs on the event-loop thread, so the
        application handles one model call at a time.
        """
        try:
            return JSONResponse(method(**params))
        except Exception as e:
            logger.error(f"[PregoPal] {label} error: {e}\n{traceback.format_exc()}")
            return _error(str(e), 500)

    # -- Endpoints -----------------------------------------------------
    @web_app.post("/v1/chat/completions")
    async def chat_completions(request: Request):
        """Chat completion; streams server-sent events when ``stream`` is true."""
        body = await _read_body(request)
        if body is None:
            return _error("Invalid JSON", 400)

        params = dict(
            messages=body.get("messages", []),
            max_tokens=body.get("max_tokens", 512),
            temperature=body.get("temperature", 0.7),
            top_p=body.get("top_p", 0.9),
        )

        if body.get("stream", False):
            async def event_stream():
                try:
                    for chunk in llm.create_chat_completion(stream=True, **params):
                        yield f"data: {json.dumps(chunk)}\n\n"
                except Exception as e:
                    # The response has already started, so the failure is
                    # reported as an event rather than as a status code.
                    logger.error(f"[PregoPal] Chat completion error: {e}\n{traceback.format_exc()}")
                    yield f"data: {json.dumps({'error': str(e)})}\n\n"
                yield "data: [DONE]\n\n"

            return StreamingResponse(event_stream(), media_type="text/event-stream")

        return _infer(
            "Chat completion",
            llm.create_chat_completion,
            stream=False,
            **params,
        )

    @web_app.post("/v1/completions")
    async def completions(request: Request):
        """Plain text completion for the ``prompt`` field."""
        body = await _read_body(request)
        if body is None:
            return _error("Invalid JSON", 400)
        return _infer(
            "Completion",
            llm.create_completion,
            prompt=body.get("prompt", ""),
            max_tokens=body.get("max_tokens", 256),
            temperature=body.get("temperature", 0.7),
            stream=False,
        )

    @web_app.post("/v1/embeddings")
    async def embeddings(request: Request):
        """Embeddings for the ``input`` field."""
        body = await _read_body(request)
        if body is None:
            return _error("Invalid JSON", 400)
        # NOTE(review): the model above is created without `embedding=True`;
        # confirm create_embedding works in that configuration.
        return _infer(
            "Embedding",
            llm.create_embedding,
            input=body.get("input", ""),
            model=body.get("model", "MiniCPM-o-4_5"),
        )

    @web_app.post("/v1/vision")
    async def vision(request: Request):
        """Multimodal inference; messages are expected to carry base64 images."""
        body = await _read_body(request)
        if body is None:
            return _error("Invalid JSON", 400)
        if not os.path.isfile(vision_path):
            return _error("Vision mmproj not loaded", 400)
        return _infer(
            "Vision",
            llm.create_chat_completion,
            messages=body.get("messages", []),
            max_tokens=body.get("max_tokens", 512),
            temperature=body.get("temperature", 0.7),
            stream=False,
        )

    @web_app.get("/health")
    async def health():
        """Report status plus the file names found in the model directory."""
        try:
            vol_files = os.listdir(MODEL_SUBDIR) if os.path.isdir(MODEL_SUBDIR) else []
        except OSError:
            vol_files = []
        return {
            "status": "ok",
            "model": "MiniCPM-o-4_5",
            # Hard-coded: the GPU is not probed here.
            "cuda": True,
            "vision": os.path.isfile(vision_path),
            "volume_files": vol_files,
        }

    @web_app.get("/v1/models")
    async def list_models():
        """Return the single served model as a model list."""
        return {
            "object": "list",
            "data": [{
                "id": "MiniCPM-o-4_5",
                "object": "model",
                "created": 1,
                "owned_by": "prego-pal",
            }],
        }

    @web_app.get("/")
    async def root():
        """Describe the service and its endpoints."""
        return {
            "service": "PregoPal MiniCPM-o-4_5 API",
            "version": "2.0.0",
            "model": MAIN_GGUF,
            "endpoints": {
                "chat": "POST /v1/chat/completions",
                "completions": "POST /v1/completions",
                "embeddings": "POST /v1/embeddings",
                "vision": "POST /v1/vision (ๅคๆจกๆ)",
                "models": "GET /v1/models",
                "health": "GET /health",
            },
        }

    return web_app
# ------------------------------------------------------------------
# 4. MODEL UPLOAD guide
# ------------------------------------------------------------------
def upload_models():
    """Print model upload instructions and the current state of the model files.

    First prints the `modal volume` commands for uploading and listing the
    model directory. Then checks whether the main GGUF file and the vision
    projector exist under ``MODEL_SUBDIR``, prints one status line (with size
    in bytes, or ``N/A``) per file, and finishes with an overall verdict.
    """
    rule = "=" * 60
    instructions = (
        rule,
        "[UPLOAD] ไธไผ ๆจกๅ่ณ Modal Volume ๆๅผ:",
        "",
        " modal volume put minicpm-o-4_5-models \\",
        " ./models/MiniCPM-o-4_5-gguf /MiniCPM-o-4_5-gguf",
        "",
        " # ้ช่ฏ:",
        " modal volume ls minicpm-o-4_5-models /MiniCPM-o-4_5-gguf",
        rule,
    )
    for line in instructions:
        print(line)

    # Check which of the expected files are already present.
    names = (MAIN_GGUF, VISION_MMPROJ)
    locations = [os.path.join(MODEL_SUBDIR, name) for name in names]
    present = [os.path.isfile(location) for location in locations]

    print("\nๅฝๅ Volume ็ถๆ:")
    for name, location, found in zip(names, locations, present):
        tag = "OK" if found else "FAIL"
        size = os.path.getsize(location) if found else "N/A"
        print(f" {name}: [{tag}] ({size} bytes)")

    if all(present):
        print("\n[OK] ๆจกๅๅฐฑ็ปช๏ผๅฏไปฅๆง่ก deploy!")
    else:
        print("\n[FAIL] ๆจกๅไธไผ ไธๅฎๆด๏ผ่ฏท้ๆฐไธไผ ")
# ------------------------------------------------------------------
# 5. TEST INFERENCE
# ------------------------------------------------------------------
def test_inference():
    """Load the model and run two short chat completions as a smoke test.

    Prints the resolved model paths, the load time and file sizes, then the
    response and latency for one Chinese and one English prompt. Returns early
    (after printing a failure line) when the main GGUF file is missing.
    """
    import time
    import json
    from llama_cpp import Llama

    print("[PregoPal] ========== TEST INFERENCE ==========")

    resolved = get_model_paths(MODEL_SUBDIR)
    main_path, vision_path = resolved["main"], resolved["vision"]

    if not os.path.isfile(main_path):
        print(f"[PregoPal] [FAIL] Model not found at {main_path}")
        return

    load_started = time.time()
    llama_args = {
        "model_path": main_path,
        "n_gpu_layers": -1,
        "n_ctx": 4096,
        "verbose": False,
    }
    if os.path.isfile(vision_path):
        # NOTE(review): confirm that Llama() honours an `mmproj` keyword.
        llama_args["mmproj"] = vision_path

    print("[PregoPal] Loading model...")
    llm = Llama(**llama_args)
    load_time = time.time() - load_started

    gib = 1024**3
    print(f"[PregoPal] [OK] Model loaded in {load_time:.1f}s")
    print(f" Main: {os.path.getsize(main_path) / gib:.1f} GiB")
    if os.path.isfile(vision_path):
        print(f" Vision: {os.path.getsize(vision_path) / gib:.1f} GiB")

    # (announcement, prompt) pairs: test 1 is a Chinese prompt, test 2 an
    # English one.
    cases = (
        ("\n[Test 1] ไธญๆๆ้ฎ...", "็จไธญๆ่ฏดไฝ ๅฅฝ๏ผไธ่ถ ่ฟ10ไธชๅญ"),
        ("\n[Test 2] ่ฑๆๆ้ฎ...", "What is the capital of France? Answer in 5 words."),
    )
    for announcement, prompt in cases:
        print(announcement)
        started = time.time()
        reply = llm.create_chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=30,
            temperature=0.1,
        )
        elapsed = time.time() - started
        first_choice = reply.get("choices", [{}])[0]
        content = first_choice.get("message", {}).get("content", "")
        print(f"Response ({elapsed:.1f}s): {content}")

    bar = "=" * 50
    print(f"\n{bar}")
    print(f"[OK] Test complete! Loading: {load_time:.1f}s")
    print(f"{bar}")