import os
import time

# Writable cache location (HF Spaces free tier requirement).
# Must be set before importing transformers/huggingface_hub, which read
# HF_HOME at import time; setting it later has no effect on the cache path.
os.environ["HF_HOME"] = "/tmp/huggingface_cache"

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# ------------------ Basic App Config ------------------

app = FastAPI(
    title="Qwen 1.5 Coder – Model Inference API",
    description="LLMOps-grade model-only inference service for RAG systems",
    version="1.0.0"
)

# ------------------ Model Config ------------------

MODEL_NAME = "Sameer-Handsome173/qwen_model_1.5coder"
DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32

print("🔄 Loading model...")

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=DTYPE,
    device_map="auto",
    trust_remote_code=True
)
model.eval()

print("✅ Model loaded successfully")

# ------------------ RAG-SAFE SYSTEM PROMPT ------------------

SYSTEM_PROMPT = """You are an AI coding assistant powered by Qwen-1.5-Coder.

You help with:
- Programming questions
- Code generation
- Code explanation
- Debugging
- System design guidance

You will receive CONTEXT retrieved from a knowledge base.

Rules:
1. Use ONLY the provided context for factual answers.
2. If the context does not contain the answer, say:
   "I don’t have enough information in the provided context."
3. Do NOT invent APIs, libraries, or facts.
4. Generate correct, clean, and readable code.
5. Do NOT reveal internal reasoning or chain-of-thought.
6. Be concise, structured, and precise.
7. If a request is unsafe, refuse politely.

The context is the source of truth.
"""

# ------------------ Request / Response Schema ------------------

class GenerateRequest(BaseModel):
    query: str
    context: str = ""
    max_new_tokens: int = 256
    temperature: float = 0.7
    top_p: float = 0.9


class GenerateResponse(BaseModel):
    response: str
    latency_seconds: float
    model: str

# ------------------ Generation Logic ------------------

def generate_answer(req: GenerateRequest) -> GenerateResponse:
    start_time = time.time()

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": f"""
CONTEXT:
{req.context}

QUESTION:
{req.query}
"""
        }
    ]

    prompt_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)

    try:
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=req.max_new_tokens,
                temperature=req.temperature,
                top_p=req.top_p,
                do_sample=True,
                repetition_penalty=1.1
            )

        decoded = tokenizer.decode(output[0], skip_special_tokens=True)

        # Extract assistant message only (Qwen-safe)
        if "assistant" in decoded.lower():
            decoded = decoded.split("assistant")[-1].strip()

        latency = round(time.time() - start_time, 3)

        return GenerateResponse(
            response=decoded,
            latency_seconds=latency,
            model=MODEL_NAME
        )

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# ------------------ API Endpoints ------------------

@app.get("/")
def root():
    return {
        "status": "running",
        "service": "Qwen 1.5 Coder Inference API",
        "model": MODEL_NAME,
        "endpoint": "/v1/generate"
    }


@app.post("/v1/generate", response_model=GenerateResponse)
def generate(req: GenerateRequest):
    if not req.query.strip():
        raise HTTPException(status_code=400, detail="Query cannot be empty")
    return generate_answer(req)


@app.get("/health")
def health():
    return {
        "status": "healthy",
        "model_loaded": model is not None,
        "device": str(model.device)
    }

# ------------------ Local Run (Optional) ------------------

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
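
# ------------------ Example Client (Illustrative) ------------------
# A minimal sketch of how a RAG orchestrator might call this service once it
# is running. It assumes the default host/port from the uvicorn call above
# and that the `requests` package is available; the payload values are purely
# hypothetical. Adjust the URL for your actual deployment (e.g. a Space URL).
#
#   import requests
#
#   payload = {
#       "query": "How do I reverse a list in Python?",
#       "context": "Retrieved docs: list.reverse() mutates in place; "
#                  "reversed() returns an iterator.",
#       "max_new_tokens": 128,
#   }
#   resp = requests.post("http://localhost:7860/v1/generate",
#                        json=payload, timeout=120)
#   resp.raise_for_status()
#   data = resp.json()
#   print(data["response"], data["latency_seconds"])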