"""
PregoPal × MiniCPM-o-4_5 — Modal 部署 (预编译 llama-cpp-python)

架构:
  FastAPI (ASGI) ←→ llama-cpp-python (CUDA via pre-built wheel)
                         ↕
                  Modal Volume: GGUF models

用法:
    pip install modal      # 安装 Modal CLI
    modal token new        # 登录 Modal
    modal deploy modal_deploy.deploy    # 部署 (~1 min)

测试:
    modal run modal_deploy.deploy::test_inference

API:
    POST /v1/chat/completions   — OpenAI 兼容 (支持 streaming)
    POST /v1/completions        — Text completion
    POST /v1/embeddings         — Embeddings
    POST /v1/vision             — 多模态 (图片+文字)
    GET  /health                — 健康检查
    GET  /v1/models             — 模型列表
"""

import os
import modal
from modal import Image, App, Volume, asgi_app

# ════════════════════════════════════════════════════════════════════
# 1. IMAGE — 预编译 CUDA wheel (不从头编译，构建 < 1 min)
# ════════════════════════════════════════════════════════════════════

# Container image shared by every Modal function in this file.
# Built on Debian slim with Python 3.11; each builder call below adds one
# step to the image definition.
_image = (
    Image.debian_slim(python_version="3.11")
    # Web-serving and general-purpose dependencies.
    .pip_install("fastapi", "uvicorn[standard]", "httpx", "numpy", "Pillow")
    # llama-cpp-python is resolved with an extra package index so that pip can
    # pick a pre-built wheel instead of compiling from source.
    # NOTE(review): the index path ends in "cu121", presumably CUDA 12.1
    # builds — confirm the index is reachable and matches the GPU runtime.
    .pip_install(
        "llama-cpp-python",
        extra_index_url="https://ggml-org.github.io/llama-cpp-python/whl/cu121",
    )
    # Build-time smoke test: importing llama_cpp here makes the image build
    # fail early if the installed package cannot be imported.
    .run_commands(
        "python -c 'import llama_cpp; print(\"llama-cpp-python OK\")'",
    )
)

# ════════════════════════════════════════════════════════════════════
# 2. CONSTANTS
# ════════════════════════════════════════════════════════════════════

# Mount point of the model volume inside every container (see the
# `volumes={MODEL_DIR: model_volume}` arguments below).
MODEL_DIR = "/models"
# Directory inside the volume that holds this model's GGUF files.
MODEL_SUBDIR = f"{MODEL_DIR}/MiniCPM-o-4_5-gguf"
# File name of the main language-model weights, relative to MODEL_SUBDIR.
MAIN_GGUF = "MiniCPM-o-4_5-Q4_K_M.gguf"
# Path of the vision projector file, relative to MODEL_SUBDIR.
VISION_MMPROJ = "vision/MiniCPM-o-4_5-vision-F16.gguf"

# Named Modal volume that stores the model files; created on first use.
model_volume = Volume.from_name("minicpm-o-4_5-models", create_if_missing=True)
# Modal application that all functions in this file are registered on.
app = App("prego-pal-minicpm")


def get_model_paths(base_dir: str) -> dict:
    """Build the model file paths under *base_dir* and report their presence.

    Args:
        base_dir: Directory that is expected to contain the GGUF files.

    Returns:
        A dict with the keys ``"main"`` (language-model weights) and
        ``"vision"`` (vision projector), each mapped to its joined path.

    The paths are not validated: a missing file is only reported through the
    ``exists=`` flag in the printed diagnostic line, never rejected.
    """
    resolved = {
        "main": os.path.join(base_dir, MAIN_GGUF),
        "vision": os.path.join(base_dir, VISION_MMPROJ),
    }
    for label in resolved:
        location = resolved[label]
        present = os.path.isfile(location)
        print(f"[PregoPal] {label}: {location} (exists={present})")
    return resolved


# ════════════════════════════════════════════════════════════════════
# 3. ASGI APP — 多模态 API
# ════════════════════════════════════════════════════════════════════

@app.function(
    image=_image,
    volumes={MODEL_DIR: model_volume},
    gpu="T4",
    timeout=1200,
    scaledown_window=300,
)
@modal.concurrent(max_inputs=10)
@asgi_app()
def serve():
    """Load the model once and build the FastAPI application that serves it.

    The container accepts up to 10 concurrent inputs, but all of them share
    one ``Llama`` instance. Inference therefore runs in worker threads (so the
    event loop stays responsive for ``/health`` and friends) and is serialized
    with a lock (so two generations never interleave on the shared model).

    Returns:
        The configured ``FastAPI`` application.

    Raises:
        Exception: whatever ``Llama(...)`` raises when the model cannot be
            loaded; it is logged and re-raised so the container start fails.
    """
    import asyncio
    import json
    import logging
    import threading
    from fastapi import FastAPI, Request
    from fastapi.responses import StreamingResponse, JSONResponse
    from fastapi.middleware.cors import CORSMiddleware
    from llama_cpp import Llama

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("prego-pal")

    web_app = FastAPI(title="PregoPal MiniCPM-o-4_5 API")
    # NOTE(review): wildcard origins combined with allow_credentials=True is
    # very permissive — confirm this is intended for a public endpoint.
    web_app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # ── Model Loading ──────────────────────────────────────────────
    paths = get_model_paths(MODEL_SUBDIR)
    model_path = paths["main"]
    vision_path = paths["vision"]

    kwargs: dict = dict(
        model_path=model_path,
        n_gpu_layers=-1,
        n_ctx=8192,
        verbose=False,
        n_threads=os.cpu_count() or 4,
    )
    if os.path.isfile(vision_path):
        # NOTE(review): upstream llama-cpp-python documents multimodal loading
        # via `chat_handler=<Handler>(clip_model_path=...)`. Confirm that the
        # installed wheel honours an `mmproj` keyword; if it is swallowed as an
        # extra kwarg, vision is silently disabled.
        kwargs["mmproj"] = vision_path
        logger.info("[PregoPal] [OK] Vision mmproj enabled")
    else:
        logger.warning(f"[PregoPal] [WARN] mmproj not found at {vision_path} — vision disabled")

    logger.info("[PregoPal] Loading model (30-90s)...")
    try:
        llm = Llama(**kwargs)
        logger.info("[PregoPal] [OK] Model loaded!")
    except Exception as e:
        logger.error(f"[PregoPal] [FAIL] Failed to load model: {e}")
        raise

    # ── Shared helpers ─────────────────────────────────────────────

    # Guards every call into `llm`. A threading lock (not an asyncio lock) is
    # used because the holder is always a worker thread, which keeps the lock
    # held until the native call really finishes even if the request is
    # cancelled in the meantime.
    llm_lock = threading.Lock()

    def _locked_call(fn, params):
        """Run ``fn(**params)`` while holding the model lock."""
        with llm_lock:
            return fn(**params)

    async def _read_json_object(request: Request):
        """Parse the request body; return ``(body, None)`` or ``(None, error_response)``."""
        try:
            body = await request.json()
        except Exception as e:
            logger.error(f"[PregoPal] JSON parse error: {e}")
            return None, JSONResponse({"error": "Invalid JSON"}, status_code=400)
        if not isinstance(body, dict):
            # Handlers use body.get(...); anything but an object is a client error.
            return None, JSONResponse(
                {"error": "Request body must be a JSON object"},
                status_code=400,
            )
        return body, None

    async def _respond(label: str, fn, **params):
        """Run a blocking model call off the event loop and wrap it as JSON."""
        try:
            result = await asyncio.to_thread(_locked_call, fn, params)
        except Exception as e:
            logger.exception(f"[PregoPal] {label} error: {e}")
            return JSONResponse({"error": str(e)}, status_code=500)
        return JSONResponse(result)

    async def _event_stream(params: dict):
        """Yield server-sent-event frames for one streamed chat completion.

        A worker thread owns the model for the whole generation and hands
        chunks to this coroutine through a queue. When the client goes away
        the `finally` clause flags the worker, which stops at its next chunk
        and releases the model.
        """
        loop = asyncio.get_running_loop()
        frames: asyncio.Queue = asyncio.Queue()
        cancelled = threading.Event()

        def emit(kind, payload=None):
            # asyncio.Queue is not thread-safe; hop back onto the event loop.
            loop.call_soon_threadsafe(frames.put_nowait, (kind, payload))

        def produce():
            try:
                with llm_lock:
                    # The client may have left while we waited for the lock.
                    if cancelled.is_set():
                        return
                    for chunk in llm.create_chat_completion(stream=True, **params):
                        if cancelled.is_set():
                            return
                        emit("chunk", chunk)
                emit("done")
            except Exception as e:
                logger.exception(f"[PregoPal] Chat completion stream error: {e}")
                emit("error", str(e))

        loop.run_in_executor(None, produce)
        try:
            while True:
                kind, payload = await frames.get()
                if kind == "chunk":
                    yield f"data: {json.dumps(payload)}\n\n"
                elif kind == "error":
                    error_frame = json.dumps({"error": payload})
                    yield f"data: {error_frame}\n\n"
                    return
                else:
                    yield "data: [DONE]\n\n"
                    return
        finally:
            cancelled.set()

    # ── Endpoints ──────────────────────────────────────────────────

    @web_app.post("/v1/chat/completions")
    async def chat_completions(request: Request):
        """Chat completion; streams server-sent events when ``stream`` is true."""
        body, failure = await _read_json_object(request)
        if failure is not None:
            return failure
        params = dict(
            messages=body.get("messages", []),
            max_tokens=body.get("max_tokens", 512),
            temperature=body.get("temperature", 0.7),
            top_p=body.get("top_p", 0.9),
        )

        if body.get("stream", False):
            return StreamingResponse(_event_stream(params), media_type="text/event-stream")

        return await _respond(
            "Chat completion",
            llm.create_chat_completion,
            stream=False,
            **params,
        )

    @web_app.post("/v1/completions")
    async def completions(request: Request):
        """Plain text completion (non-streaming)."""
        body, failure = await _read_json_object(request)
        if failure is not None:
            return failure
        return await _respond(
            "Completion",
            llm.create_completion,
            prompt=body.get("prompt", ""),
            max_tokens=body.get("max_tokens", 256),
            temperature=body.get("temperature", 0.7),
            stream=False,
        )

    @web_app.post("/v1/embeddings")
    async def embeddings(request: Request):
        """Embedding request.

        NOTE(review): upstream ``create_embedding`` expects the model to be
        constructed with ``embedding=True``, which is not done above — this
        route is expected to answer with the 500 error payload until that is
        decided. Confirm before relying on it.
        """
        body, failure = await _read_json_object(request)
        if failure is not None:
            return failure
        return await _respond(
            "Embedding",
            llm.create_embedding,
            input=body.get("input", ""),
            model=body.get("model", "MiniCPM-o-4_5"),
        )

    @web_app.post("/v1/vision")
    async def vision(request: Request):
        """Multimodal inference: accepts chat messages carrying base64 images."""
        body, failure = await _read_json_object(request)
        if failure is not None:
            return failure

        if not os.path.isfile(vision_path):
            return JSONResponse(
                {"error": "Vision mmproj not loaded"},
                status_code=400,
            )

        return await _respond(
            "Vision",
            llm.create_chat_completion,
            messages=body.get("messages", []),
            max_tokens=body.get("max_tokens", 512),
            temperature=body.get("temperature", 0.7),
            stream=False,
        )

    @web_app.get("/health")
    async def health():
        """Liveness probe that also lists the files found in the model directory."""
        try:
            vol_files = os.listdir(MODEL_SUBDIR) if os.path.isdir(MODEL_SUBDIR) else []
        except Exception:
            # Best effort: a listing failure must not fail the health check.
            vol_files = []
        return {
            "status": "ok",
            "model": "MiniCPM-o-4_5",
            # Static value; it is not probed from the runtime.
            "cuda": True,
            "vision": os.path.isfile(vision_path),
            "volume_files": vol_files,
        }

    @web_app.get("/v1/models")
    async def list_models():
        """Model listing with the single served model."""
        return {
            "object": "list",
            "data": [{
                "id": "MiniCPM-o-4_5",
                "object": "model",
                "created": 1,
                "owned_by": "prego-pal",
            }],
        }

    @web_app.get("/")
    async def root():
        """Service banner with an index of the available endpoints."""
        return {
            "service": "PregoPal MiniCPM-o-4_5 API",
            "version": "2.0.0",
            "model": MAIN_GGUF,
            "endpoints": {
                "chat": "POST /v1/chat/completions",
                "completions": "POST /v1/completions",
                "embeddings": "POST /v1/embeddings",
                "vision": "POST /v1/vision (多模态)",
                "models": "GET /v1/models",
                "health": "GET /health",
            },
        }

    return web_app


# ════════════════════════════════════════════════════════════════════
# 4. MODEL UPLOAD 指引
# ════════════════════════════════════════════════════════════════════

@app.function(
    image=_image,
    volumes={MODEL_DIR: model_volume},
    timeout=3600,
)
def upload_models():
    """Print the upload instructions and report which model files the volume holds.

    Nothing is uploaded here: the function prints the ``modal volume`` commands
    to run, then checks the mounted volume for the main GGUF and the vision
    projector and prints each file's status and size.
    """
    banner = "=" * 60
    guide = (
        banner,
        "[UPLOAD] 上传模型至 Modal Volume 指引:",
        "",
        "  modal volume put minicpm-o-4_5-models \\",
        "      ./models/MiniCPM-o-4_5-gguf /MiniCPM-o-4_5-gguf",
        "",
        "  # 验证:",
        "  modal volume ls minicpm-o-4_5-models /MiniCPM-o-4_5-gguf",
        banner,
    )
    for line in guide:
        print(line)

    # Inspect what is currently stored in the volume.
    inventory = []
    for filename in (MAIN_GGUF, VISION_MMPROJ):
        location = os.path.join(MODEL_SUBDIR, filename)
        inventory.append((filename, location, os.path.isfile(location)))

    print("\n当前 Volume 状态:")
    for filename, location, present in inventory:
        if present:
            tag, size = "OK", os.path.getsize(location)
        else:
            tag, size = "FAIL", "N/A"
        print(f"  {filename}: [{tag}] ({size} bytes)")

    if all(present for _, _, present in inventory):
        print("\n[OK] 模型就绪，可以执行 deploy!")
    else:
        print("\n[FAIL] 模型上传不完整，请重新上传")


# ════════════════════════════════════════════════════════════════════
# 5. TEST INFERENCE
# ════════════════════════════════════════════════════════════════════

@app.function(
    image=_image,
    volumes={MODEL_DIR: model_volume},
    gpu="T4",
    timeout=600,
)
def test_inference():
    """Load the model on a Modal GPU worker and time two short chat prompts.

    Prints the load time, the file sizes, and the reply plus latency for one
    Chinese and one English question. Returns early, after printing a failure
    line, when the main GGUF file is not present in the volume.
    """
    import time
    from llama_cpp import Llama

    print("[PregoPal] ========== TEST INFERENCE ==========")

    located = get_model_paths(MODEL_SUBDIR)
    main_path, vision_path = located["main"], located["vision"]

    if not os.path.isfile(main_path):
        print(f"[PregoPal] [FAIL] Model not found at {main_path}")
        return

    # The clock starts before the arguments are assembled, as load timing
    # covers everything up to the constructed model.
    started = time.time()
    llama_args = {
        "model_path": main_path,
        "n_gpu_layers": -1,
        "n_ctx": 4096,
        "verbose": False,
    }
    if os.path.isfile(vision_path):
        llama_args["mmproj"] = vision_path

    print("[PregoPal] Loading model...")
    llm = Llama(**llama_args)
    load_time = time.time() - started
    print(f"[PregoPal] [OK] Model loaded in {load_time:.1f}s")

    gib = 1024**3
    print(f"   Main: {os.path.getsize(main_path) / gib:.1f} GiB")
    if os.path.isfile(vision_path):
        print(f"   Vision: {os.path.getsize(vision_path) / gib:.1f} GiB")

    # (heading printed before the call, user prompt sent to the model)
    cases = (
        ("\n[Test 1] 中文提问...", "用中文说你好，不超过10个字"),
        ("\n[Test 2] 英文提问...", "What is the capital of France? Answer in 5 words."),
    )
    for heading, question in cases:
        print(heading)
        tick = time.time()
        reply = llm.create_chat_completion(
            messages=[{"role": "user", "content": question}],
            max_tokens=30,
            temperature=0.1,
        )
        took = time.time() - tick
        first_choice = reply.get("choices", [{}])[0]
        text = first_choice.get("message", {}).get("content", "")
        print(f"Response ({took:.1f}s): {text}")

    rule = "=" * 50
    print(f"\n{rule}")
    print(f"[OK] Test complete! Loading: {load_time:.1f}s")
    print(rule)