"""edyx-phy: RAG question-answering service.

Pipeline: embed the question (fastembed), retrieve context from Qdrant,
answer with the Groq-hosted primary model, and fall back to a local
llama.cpp model when the primary call fails.
"""

import hmac
import os
import threading
from typing import Optional

import httpx
import requests
from fastapi import FastAPI, Header, HTTPException, Security
from fastapi.concurrency import run_in_threadpool
from fastembed import TextEmbedding
from llama_cpp import Llama
from pydantic import BaseModel

app = FastAPI()

# --- Configuration -------------------------------------------------------
# Qdrant credentials are required: a missing env var raises KeyError at
# startup, which is preferable to failing on the first request.
QDRANT_URL = os.environ["QDRANT_URL"].rstrip("/")
QDRANT_API_KEY = os.environ["QDRANT_API_KEY"]
COLLECTION = "well_vectors"

# Primary (hosted) model; optional — when absent every request falls back
# to the local model.
SERVICE_API_KEY = os.environ.get("SERVICE_API_KEY")
SERVICE_API_URL = "https://api.groq.com/openai/v1/chat/completions"
SERVICE_MODEL = "llama-3.3-70b-versatile"

# Optional shared-secret auth token; when unset, auth is disabled.
EDYX_ACCESS_TOKEN = os.environ.get("EDYX_ACCESS_TOKEN")

PHYSICS_SYSTEM_PROMPT = """You are an expert physics researcher and teacher.
You are given retrieved scientific material from a physics knowledge base.

Your job:
- Use the retrieved material as grounding evidence
- Ignore irrelevant technical artifacts (paths, array shapes, file names)
- If information is incomplete, use your physics knowledge to complete the explanation
- Do NOT invent specific papers, experiments, or citations
- Produce a clean, coherent, human-readable explanation

Style: Clear, structured, graduate-level physics understanding."""

# Lazily-loaded local fallback model; guarded by a lock because FastAPI may
# serve concurrent requests and the original check-then-set was racy.
local_llm = None
_local_llm_lock = threading.Lock()


def get_local_llm():
    """Return the local llama.cpp model, loading it on first use.

    Double-checked locking so concurrent first requests don't load the
    (large) GGUF model twice.
    """
    global local_llm
    if local_llm is None:
        with _local_llm_lock:
            if local_llm is None:
                print("Loading local fallback model...")
                local_llm = Llama(
                    model_path="/app/model.gguf",
                    n_ctx=4096,
                    n_threads=2,
                    n_batch=128,
                )
    return local_llm


# Embedding model is instantiated at import time (downloads weights on
# first run); must match the model used to populate the Qdrant collection.
embedder = TextEmbedding(
    model_name="BAAI/bge-large-en-v1.5",
)


class QueryRequest(BaseModel):
    """Request body for /v1/query."""

    question: str
    top_k: Optional[int] = 5          # number of Qdrant hits to retrieve
    max_tokens: Optional[int] = 512   # generation budget for the answer


async def verify_token(x_edyx_token: str = Header(None)):
    """Dependency: reject requests whose X-Edyx-Token header is wrong.

    Uses a constant-time comparison to avoid leaking the token via timing.
    Auth is a no-op when EDYX_ACCESS_TOKEN is not configured.
    """
    if EDYX_ACCESS_TOKEN and not hmac.compare_digest(
        x_edyx_token or "", EDYX_ACCESS_TOKEN
    ):
        raise HTTPException(status_code=403, detail="Unauthorized: Invalid Access Token")
    return x_edyx_token


@app.get("/")
def root():
    """Liveness probe."""
    return {"status": "edyx-phy running", "mode": "accelerated-primary"}


def search_qdrant(question: str, top_k: int):
    """Embed *question* and run a vector search against Qdrant.

    Returns (context, hit_count) on success, or (None, error_message) on a
    non-200 response. Blocking (requests + embedding) — callers inside the
    async path must dispatch this to a thread pool.
    """
    vector = [float(x) for x in next(embedder.embed(question))]
    r = requests.post(
        f"{QDRANT_URL}/collections/{COLLECTION}/points/search",
        headers={
            "Content-Type": "application/json",
            "api-key": QDRANT_API_KEY,
        },
        json={
            "vector": vector,
            "limit": top_k,
            "with_payload": True,
        },
        timeout=30,
    )
    if r.status_code != 200:
        return None, f"Qdrant search failed: {r.text}"

    hits = r.json().get("result", [])
    collected = []
    for h in hits:
        payload = h.get("payload", {})
        # Payloads may store text under either key; collect both when present.
        if "content" in payload:
            collected.append(str(payload["content"]))
        if "text" in payload:
            collected.append(str(payload["text"]))

    # Cap the context so the prompt stays within model context limits.
    context = "\n\n".join(collected)[:12000]
    return context, len(hits)


async def call_service_api(question: str, context: str, max_tokens: int):
    """Answer via the hosted primary model (OpenAI-compatible chat API).

    Raises on missing API key or any non-200 response so the caller can
    fall back to the local model.
    """
    if not SERVICE_API_KEY:
        raise Exception("Service API key not configured")

    user_prompt = f"""CONTEXT (retrieved evidence):
{context}

QUESTION: {question}

Now produce a high-quality physics explanation that a serious learner would trust."""

    async with httpx.AsyncClient(timeout=60.0) as client:
        response = await client.post(
            SERVICE_API_URL,
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {SERVICE_API_KEY}"
            },
            json={
                "model": SERVICE_MODEL,
                "messages": [
                    {"role": "system", "content": PHYSICS_SYSTEM_PROMPT},
                    {"role": "user", "content": user_prompt}
                ],
                "max_tokens": max_tokens,
                "temperature": 0.2
            }
        )

    if response.status_code != 200:
        raise Exception(f"Service API error: {response.status_code} - {response.text}")

    data = response.json()
    return data["choices"][0]["message"]["content"]


def call_local_model(question: str, context: str, max_tokens: int):
    """Answer via the local llama.cpp fallback model.

    Blocking CPU inference — callers inside the async path must dispatch
    this to a thread pool.
    """
    llm = get_local_llm()

    prompt = f"""
You are an expert physics researcher and teacher.

You are given raw, fragmented scientific material retrieved from a large physics knowledge base.

This material may include:
- incomplete sentences
- dataset paths or filenames
- low-level implementation details
- broken or partial explanations

Your job:
- Use the retrieved material as grounding evidence
- Ignore irrelevant technical artifacts (paths, array shapes, file names)
- If the retrieved information is incomplete, use your physics knowledge to complete the explanation
- Do NOT invent specific papers, experiments, or citations
- Do NOT mention datasets, storage paths, or indexing systems
- Produce a clean, coherent, human-readable explanation

Style rules:
- Clear, structured explanation
- Intuitive where possible
- Graduate-level physics understanding
- Text-first (formulas only if they genuinely help)
- No raw fragments, no broken sentences

CONTEXT (retrieved evidence):
{context}

QUESTION: {question}

Now produce a high-quality physics explanation that a serious learner would trust.
"""

    out = llm(
        prompt,
        max_tokens=max_tokens,
        temperature=0.2,
        top_p=0.9,
        stop=["SOURCE:", "QUESTION:"],
    )
    return out["choices"][0]["text"].strip()


@app.post("/v1/query", dependencies=[Security(verify_token)])
async def query(req: QueryRequest):
    """Answer a physics question grounded in retrieved Qdrant context.

    Tries the hosted primary model first; on any failure, falls back to the
    local model; reports an error payload if both fail.
    """
    # Pydantic allows explicit null for these Optional fields; restore the
    # documented defaults so downstream calls never see None.
    top_k = req.top_k if req.top_k is not None else 5
    max_tokens = req.max_tokens if req.max_tokens is not None else 512

    # Embedding + HTTP search are blocking; keep them off the event loop.
    context, sources = await run_in_threadpool(search_qdrant, req.question, top_k)
    if context is None:
        return {"error": "Qdrant search failed", "details": sources}
    if not context:
        return {"answer": "No relevant scientific data found.", "sources_used": 0}

    try:
        answer = await call_service_api(req.question, context, max_tokens)
        return {
            "answer": answer,
            "sources_used": sources,
            "source": "primary"
        }
    except Exception as primary_err:
        print(f"Service API failed: {primary_err}, falling back to local model...")
        try:
            # llama.cpp inference is CPU-bound and blocking.
            answer = await run_in_threadpool(
                call_local_model, req.question, context, max_tokens
            )
            return {
                "answer": answer,
                "sources_used": sources,
                "source": "fallback"
            }
        except Exception as fallback_err:
            return {
                "answer": f"Error: Both primary and fallback failed. {str(fallback_err)}",
                "sources_used": 0,
                "source": "error"
            }