from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel, Field
from typing import List, Optional
import os

from llama_cpp import Llama

# Load the quantized model once at startup.
MODEL_PATH = "/app/models/Qwen2.5-3B-Instruct-Q4_K_M.gguf"
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"Model not found at {MODEL_PATH}")

llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=4096,
    n_threads=4,
    chat_format="chatml",  # Qwen2.5 uses the ChatML prompt template
    verbose=False,
)

app = FastAPI(title="Qwen2.5-3B API", version="0.1.0")


class Message(BaseModel):
    role: str = Field(..., description="Role: 'system', 'user', or 'assistant'")
    content: str = Field(..., description="Message content")


class ChatRequest(BaseModel):
    model: str = Field(..., description="Model identifier (ignored, single model)")
    messages: List[Message] = Field(..., description="List of messages")
    max_tokens: Optional[int] = Field(None, description="Maximum tokens to generate")
    stream: Optional[bool] = Field(False, description="Stream response (not supported)")


@app.post("/v1/chat/completions")
def chat_completion(req: ChatRequest):
    # Plain `def` so FastAPI runs the blocking llama.cpp call in its threadpool
    # instead of stalling the event loop.
    if req.stream:
        # create_chat_completion would return a generator here, which cannot be
        # serialized by JSONResponse, so reject streaming requests explicitly.
        raise HTTPException(status_code=400, detail="Streaming is not supported")
    # The requested model name is ignored; the single loaded model is used.
    try:
        result = llm.create_chat_completion(
            messages=[m.dict() for m in req.messages],
            max_tokens=req.max_tokens,
        )
        return JSONResponse(content=result)
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))


@app.get("/healthz")
async def healthz():
    return {"status": "ok"}


# For HF Spaces compatibility: Spaces provides PORT (default 7860).
if __name__ == "__main__":
    import uvicorn

    port = int(os.getenv("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=port)
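
# Example client call, kept as a comment sketch: it assumes the server is running
# locally on the default port 7860 and that the `requests` package is installed.
# The "model" field can be any string since the server ignores it; the response
# follows the OpenAI-style layout returned by create_chat_completion.
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:7860/v1/chat/completions",
#       json={
#           "model": "qwen2.5-3b",  # ignored by this server
#           "messages": [{"role": "user", "content": "Hello!"}],
#           "max_tokens": 64,
#       },
#   )
#   print(resp.json()["choices"][0]["message"]["content"])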