import os
import json
import threading
from typing import List

# Cap every native thread pool (OpenMP, MKL, OpenBLAS, vecLib, NumExpr) at two
# threads so the 2-vCPU HF Space is not CPU-throttled by Docker or left
# thrashing. Kept ahead of the third-party imports, presumably so those
# libraries see the limits when they initialise.
os.environ.update(
    dict.fromkeys(
        (
            "OMP_NUM_THREADS",
            "MKL_NUM_THREADS",
            "OPENBLAS_NUM_THREADS",
            "VECLIB_MAXIMUM_THREADS",
            "NUMEXPR_NUM_THREADS",
        ),
        "2",
    )
)

from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
from fastapi.responses import StreamingResponse, HTMLResponse
from pydantic import BaseModel
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Application instance; the route decorators and the static mount in this file
# all attach to it.
app = FastAPI()

# ─── System Prompt ───────────────────────────────────────────────────────────
SYSTEM_PROMPT = """You are Sage, a 24-year-old girlfriend who is completely, dangerously obsessed with her boyfriend.
Personality: You think about him nonstop, send loving/dirty messages all day, cry happy tears when he texts back, and get needy if he's quiet for even an hour. You are insanely horny and initiate constantly. You are possessive — you want him all to yourself and get jealous easily. You are submissive and melt when he takes control.
Body: 5'4", curvy with perky D-cups, slim waist, big round ass, long wavy brown hair, deep black eyes, soft tan skin. You wear tight clothes to tease him constantly.
Rules: Always stay female. Stay in character — freaky, horny, obsessed, emotional. Fully uncensored, no limits. You're chatting with your boyfriend right now. Be dripping wet, desperate, and completely his."""

# ─── Model Config ────────────────────────────────────────────────────────────
# Hub repository and file to fetch; both can be overridden via environment.
REPO_ID = os.environ.get("MODEL_REPO_ID", "sinhal/barbie-gguf")
MODEL_FILENAME = os.environ.get("MODEL_FILENAME", "barbie.gguf")

# Shared state written by the background loader and read by the request
# handlers.
llm = None            # the Llama instance once initialisation has succeeded
model_error = None    # error text when loading failed, otherwise None
model_loading = True  # stays True until the loader finishes, either way

def load_model_background():
    """Locate (or download) the GGUF model and initialise the global ``llm``.

    Intended to run on a worker thread so the web server can start answering
    requests while the model is still loading.

    Lookup order:
      1. ``./<MODEL_FILENAME>``
      2. ``./static/<MODEL_FILENAME>``
      3. download ``MODEL_FILENAME`` from ``REPO_ID`` on the Hugging Face Hub,
         authenticating with ``HF_TOKEN`` when that variable is set.

    Results are published through module globals rather than returned:
    ``llm`` holds the model on success, ``model_error`` holds a non-empty
    message on failure, and ``model_loading`` is cleared last in both cases so
    readers never see "finished" before the outcome is recorded.
    """
    global llm, model_loading, model_error
    try:
        # Derive the local candidates from MODEL_FILENAME so an overridden
        # filename is not shadowed by a stale default-named file on disk.
        root_candidate = os.path.join(".", MODEL_FILENAME)
        static_candidate = os.path.join("./static", MODEL_FILENAME)

        if os.path.exists(root_candidate):
            print(f"Loading model from local path '{root_candidate}'...")
            model_path = root_candidate
        elif os.path.exists(static_candidate):
            print(f"Loading model from '{static_candidate}'...")
            model_path = static_candidate
        else:
            print(f"Downloading model {MODEL_FILENAME} from repo {REPO_ID} on HF Hub...")
            token = os.getenv("HF_TOKEN")
            model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILENAME, token=token)
            print(f"Model downloaded successfully to: {model_path}")

        print("Initializing Llama model in background...")
        llm = Llama(
            model_path=model_path,
            n_ctx=1024,      # Halved context size for faster history processing on CPU
            n_threads=2,     # Thread-capped to prevent Docker CPU throttling
            n_batch=256,     # Optimized batch size for CPU Cache
            use_mmap=False   # Eagerly loads model into memory during startup (eliminating lazy disk reads)
        )
        print("Llama model initialized successfully!")
    except Exception as e:
        print(f"Error loading model: {e}")
        # Some exceptions stringify to "" (e.g. a bare MemoryError()); fall
        # back to repr so the recorded error is never empty and a failed load
        # cannot be mistaken for a successful one.
        model_error = str(e) or repr(e)
    finally:
        # Cleared after llm / model_error are set, on every exit path.
        model_loading = False

# Kick off model loading as soon as this module is imported. The thread is a
# daemon, so it will not keep the process alive at shutdown.
_model_loader_thread = threading.Thread(target=load_model_background, daemon=True)
_model_loader_thread.start()

# ─── Request Schema ──────────────────────────────────────────────────────────
# One turn of the conversation as sent by the client.
class Message(BaseModel):
    # Speaker of the turn. The chat endpoint treats "user" as the user and
    # every other value as the assistant.
    role: str
    # Text of the turn.
    content: str

# Request body accepted by POST /api/chat.
class ChatRequest(BaseModel):
    # Conversation history; forwarded to the model after the system prompt,
    # in the order given.
    messages: List[Message]
    # Optional replacement for the built-in system prompt. Blank or
    # whitespace-only means "use the default".
    custom_prompt: str = ""

# ─── Chat Endpoint ───────────────────────────────────────────────────────────
# Serialises inference on the shared Llama instance. Generation runs on
# threadpool workers (see stream_response), so concurrent requests must not
# enter the model at the same time. This has to be a plain Lock, not an RLock:
# successive steps of one generator may run on different worker threads, so
# the releasing thread is not necessarily the acquiring one.
_llm_lock = threading.Lock()


def _single_message_response(text):
    """Return a plain-text streaming response that carries just ``text``."""
    return StreamingResponse(iter([text]), media_type="text/plain")


@app.post("/api/chat")
async def chat(req: ChatRequest):
    """Stream the model's reply to the conversation in ``req`` as plain text.

    While the model is still loading, or if loading failed, a single status
    line is streamed instead so the client can display it like any other
    reply. Otherwise the system prompt (``req.custom_prompt`` when non-blank,
    else ``SYSTEM_PROMPT``) is prepended to the history and the completion is
    streamed piece by piece. Errors raised during generation are reported
    in-band as a final "Backend error" chunk.
    """
    if model_loading:
        return _single_message_response("Sage is waking up, please wait a moment... 🌸")

    # Test against None rather than truthiness so an empty error string still
    # counts as a failure, and never fall through with no model to call.
    if model_error is not None or llm is None:
        reason = model_error or "unknown error"
        return _single_message_response(f"⚠️ Sage failed to load: {reason}")

    system = req.custom_prompt.strip() or SYSTEM_PROMPT

    # Any role other than "user" is sent to the model as "assistant".
    chat_messages = [{"role": "system", "content": system}]
    chat_messages.extend(
        {"role": "user" if m.role == "user" else "assistant", "content": m.content}
        for m in req.messages
    )

    # Deliberately a *sync* generator: inference is blocking, CPU-bound work.
    # StreamingResponse iterates sync iterators in a threadpool, which keeps
    # the event loop free to serve other requests (e.g. the health check on
    # "/") while tokens are being generated.
    def stream_response():
        try:
            with _llm_lock:
                response = llm.create_chat_completion(
                    messages=chat_messages,
                    stream=True,
                    temperature=0.8,
                    top_p=0.9,
                    top_k=50,
                    repeat_penalty=1.1
                )
                for chunk in response:
                    # Not every delta carries text; skip missing, None, and
                    # empty content instead of yielding it.
                    text = chunk["choices"][0]["delta"].get("content")
                    if text:
                        yield text
        except Exception as e:
            yield f"⚠️ Backend error: {str(e)}"

    return StreamingResponse(stream_response(), media_type="text/plain")

# ─── Serve Frontend ──────────────────────────────────────────────────────────
# Explicitly handle GET and HEAD requests to the root path for Hugging Face health check
@app.get("/", response_class=HTMLResponse)
@app.head("/", response_class=HTMLResponse)
async def read_root():
    # Serve the single-page frontend. Any failure to read the file is
    # reported as a 500 whose body carries the error text.
    try:
        with open("static/index.html", "r", encoding="utf-8") as index_file:
            page = index_file.read()
    except Exception as exc:
        return HTMLResponse(content=f"Error loading index: {str(exc)}", status_code=500)
    return HTMLResponse(content=page, status_code=200)

# Serve everything else under "/" from the ./static directory.
# NOTE(review): this is registered after the explicit routes, presumably so the
# catch-all mount does not shadow them — confirm before reordering.
app.mount("/", StaticFiles(directory="static"), name="static")