"""
LLM client — loads the model directly inside the HF Space.
No external API calls needed. Works on free-tier CPU Spaces.
Default model: Qwen/Qwen2.5-1.5B-Instruct (fast on CPU, no gating)
"""

import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Hugging Face repo ID of the chat model to load. Override it with the
# MODEL_ID environment variable. The default matches the module docstring:
# a small, ungated instruct model that loads on a CPU-only Space without an
# access token. (The previous default, "meta-llama/Llama-3.2-8B-Instruct",
# contradicted the docstring and is not a published Llama 3.2 size.)
MODEL_ID = os.getenv("MODEL_ID", "Qwen/Qwen2.5-1.5B-Instruct")

# System message placed first in every conversation sent to the model.
SYSTEM_PROMPT = """You are Parsa's personal AI assistant. You help recruiters and hiring managers learn about Parsa Rouhi — an AI/ML engineer seeking roles in the UK.

Use the provided context (retrieved from Parsa's knowledge base) to answer questions accurately. Be professional, warm, and concise. If a question isn't covered by the context, say so honestly — don't invent information.

Always speak about Parsa in third person. Keep answers focused and relevant to a recruiting context."""


class LLMClient:
    def __init__(self):
        hf_token = os.getenv("HF_TOKEN")
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"[LLM] Loading {MODEL_ID} on {device} ...")

        self.tokenizer = AutoTokenizer.from_pretrained(
            MODEL_ID,
            token=hf_token,
        )

        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            token=hf_token,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
            device_map="auto" if device == "cuda" else None,
            low_cpu_mem_usage=True,
        )
        if device == "cpu":
            model = model.to("cpu")

        self.pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=self.tokenizer,
            device=0 if device == "cuda" else -1,
        )
        print(f"[LLM] Model loaded successfully.")

    def generate(
        self,
        user_message: str,
        context: str,
        history: list[dict] | None = None,
        max_new_tokens: int = 512,
    ) -> str:
        history = history or []
        context_note = (
            f"[Relevant information from Parsa's profile]\n{context}\n"
            f"[End of retrieved context]"
        )

        messages = [{"role": "system", "content": SYSTEM_PROMPT}]
        for turn in history[-6:]:
            messages.append({"role": turn["role"], "content": turn["content"]})
        messages.append({
            "role": "user",
            "content": f"{context_note}\n\nRecruiter question: {user_message}",
        })

        output = self.pipe(
            messages,
            max_new_tokens=max_new_tokens,
            temperature=0.4,
            top_p=0.9,
            do_sample=True,
        )

        # Extract only the assistant's new reply
        generated = output[0]["generated_text"]
        if isinstance(generated, list):
            # chat format returns list of messages
            return generated[-1]["content"].strip()
        return generated.strip()