"""Davidic Sermon Intelligence API.

FastAPI service exposing three model-backed endpoints:
  /embed   -- MiniLM sentence embeddings
  /rerank  -- cross-encoder relevance scores for (query, document) pairs
  /insight -- long-form commentary generated by TinyLlama-1.1B-Chat
"""
import os  # retained: unused here but may be relied on by deployment tooling

import torch
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from sentence_transformers import CrossEncoder, SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

app = FastAPI(title="Davidic Sermon Intelligence API")

# NOTE(review): browsers reject `Access-Control-Allow-Origin: *` on
# credentialed requests, so allow_credentials=True is ineffective with a
# wildcard origin. If cookie/auth requests are needed, list the concrete
# origins instead of "*". Behavior left as-is pending that decision.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Models are loaded once at import time so every request reuses them.
print("Loading Embedding model...")
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

print("Loading Reranker model...")
reranker_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

print("Loading Tiny LLM (TinyLlama-1.1B)...")
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)
llm_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float32,  # fp32: CPU-friendly; half precision would need a GPU
    low_cpu_mem_usage=True
)

# Pipeline WITHOUT generation config to avoid warnings; every generation
# parameter is instead passed explicitly per call in /insight.
llm_pipeline = pipeline(
    "text-generation",
    model=llm_model,
    tokenizer=tokenizer
)
print("All models loaded Ready.")


class EmbedRequest(BaseModel):
    # Raw text to embed.
    text: str


class RerankRequest(BaseModel):
    # Search query plus the candidate documents to score against it.
    query: str
    documents: list[str]


class InsightRequest(BaseModel):
    # User question plus retrieved transcript context handed to the LLM.
    query: str
    context: str


@app.get("/")
def health_check():
    """Liveness probe: returns a static status payload."""
    return {"status": "running"}


@app.post("/embed")
def embed(request: EmbedRequest):
    """Return the MiniLM embedding of ``request.text`` as a plain list of floats."""
    try:
        return embedding_model.encode(request.text).tolist()
    except Exception as e:
        # HTTP boundary: surface any model failure as a 500 with detail.
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/rerank")
def rerank(request: RerankRequest):
    """Score each document against the query with the cross-encoder.

    Returns a list of floats, one per document, in the input order.
    """
    try:
        pairs = [[request.query, doc] for doc in request.documents]
        return reranker_model.predict(pairs).tolist()
    except Exception as e:
        # HTTP boundary: surface any model failure as a 500 with detail.
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/insight")
def generate_insight(request: InsightRequest):
    """Generate long-form commentary on the context, grounded in the question.

    Returns ``{"insight": <generated text>}``; any failure becomes a 500.
    """
    try:
        print(f"Generating insight for: {request.query}")

        # TinyLlama's chat format terminates each turn with </s>. Build the
        # prompt through the tokenizer's own chat template so the separators
        # match exactly what the model was trained on (the previous
        # hand-rolled prompt was missing the </s> turn delimiters).
        messages = [
            {
                "role": "system",
                "content": (
                    "You are a helpful spiritual assistant for Davidic Generation Church. "
                    "Explain the spiritual context of the videos below based on their transcripts.\n"
                    "RULES:\n"
                    "1. Refer to videos like this: 'In [Video 1], Pastor explains...'.\n"
                    "2. Summarize WHY this moment is relevant to the question.\n"
                    "3. Do NOT just repeat the transcript. Explain the meaning.\n"
                    "4. Be thorough and long-form."
                ),
            },
            {
                "role": "user",
                "content": f"CONTEXT:\n{request.context}\n\nQUESTION: {request.query}",
            },
        ]
        prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        # Explicitly set ALL generation parameters here (the pipeline was
        # deliberately built without a generation config).
        output = llm_pipeline(
            prompt,
            max_new_tokens=512,
            temperature=0.7,
            do_sample=True,
            top_k=50,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
        result = output[0]['generated_text']

        # The pipeline echoes the prompt; keep only the assistant's turn.
        if "<|assistant|>" in result:
            insight = result.split("<|assistant|>")[-1].strip()
        else:
            insight = result[len(prompt):].strip()
        return {"insight": insight}
    except Exception as e:
        print(f"Error: {e}")
        raise HTTPException(status_code=500, detail=str(e))


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)