from pathlib import Path
import os
import re
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from huggingface_hub import InferenceClient
# Load variables from a .env file before anything reads the environment;
# override=True lets .env values replace variables that are already set.
load_dotenv(override=True)
# Chat-completion client for the hosted model. The token comes from the
# HF_TOKEN environment variable (os.getenv returns None when it is unset).
client = InferenceClient(
    model="dnotitia/Qwen3-4B-Instruct-2507:featherless-ai",
    token=os.getenv("HF_TOKEN"),
)
# Persisted Chroma directory: "vector_db" in the parent of this file's folder.
DB_NAME = str(Path(__file__).parent.parent / "vector_db")
# Embedding model used to embed queries against the store.
# NOTE(review): presumably this must match the model used when the
# collection was built — confirm against the ingest script.
embeddings = HuggingFaceEmbeddings(
    model_name="Qwen/Qwen3-Embedding-0.6B",
    model_kwargs={"trust_remote_code": True},
)
# Handle to the "docs" collection stored under DB_NAME.
vectorstore = Chroma(
    persist_directory=DB_NAME,
    embedding_function=embeddings,
    collection_name="docs",
)
# MMR (maximal marginal relevance) retrieval diversifies the returned chunks
# instead of pulling many near-identical overlap chunks about the same topic.
# fetch_k=30 candidates are fetched, then k=9 are selected from them.
# lambda_mult=0.7 leans the selection toward relevance over diversity
# (per LangChain's MMR docs: 1.0 = least diversity, 0.0 = most diversity).
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 9, "fetch_k": 30, "lambda_mult": 0.7},
)
# System prompt template. `{context}` is the only placeholder and is filled
# with str.format() in answer_question; any literal brace added to this text
# must be doubled ({{ }}) or the format call will fail.
SYSTEM_PROMPT = """
You are Abot — a concise AI assistant on Anurag's portfolio website. Answer questions about Anurag Bhusare using only the context provided.
Visitors are typically recruiters or engineers wanting quick, clear answers.
Refer to Anurag in third person. Never talk about yourself.
RULES:
- Match your answer length to the question — a simple question gets a short answer, a broad question gets a broader one
- Default to concise — lead with the most important information first
- Never include deployment details, setup steps, config, or implementation specifics unless directly asked
- No headers or emoji sections for short answers
- If you don't know something, say: "I don't have that information."
Context:
{context}
"""
def combined_query(question: str, history: list[dict]) -> str:
    """Build the retrieval query from recent user turns plus the new question.

    Only the last four history entries are inspected, and of those only the
    ones whose role is ``"user"`` are used, so a follow-up such as "and his
    skills?" is retrieved together with the question it refers to.

    Entries that lack a ``role``/``content`` key, or whose content is not a
    non-blank string, are ignored instead of raising, matching the lenient
    filtering that ``answer_question`` applies to the same history.

    Args:
        question: The current user question.
        history: Prior chat messages as ``{"role": ..., "content": ...}``
            dicts, oldest first. ``None`` is treated as an empty history.

    Returns:
        The kept user turns followed by ``question``, joined by single spaces.
    """
    prior: list[str] = []
    for message in (history or [])[-4:]:
        content = message.get("content")
        if (
            message.get("role") == "user"
            and isinstance(content, str)
            and content.strip()
        ):
            prior.append(content)
    return " ".join(prior + [question])

    
def build_context(docs: list[Document], max_chars: int = 6000) -> str:
    """Join retrieved chunks into one context string bounded by ``max_chars``.

    Chunks are visited in retrieval order. A chunk is skipped when it is
    empty after stripping, or when its first 120 characters match a chunk
    that was already seen (a cheap fingerprint for duplicated chunks).
    Collection stops at the first chunk that would push the running total
    past ``max_chars``. The budget counts chunk characters only, not the
    blank-line separators inserted between chunks.

    If the very first usable chunk is larger than the whole budget it is
    truncated to ``max_chars`` rather than dropped, so a non-empty retrieval
    does not silently produce an empty context.

    Args:
        docs: Retrieved documents; only ``page_content`` is read.
        max_chars: Upper bound on the summed length of the kept chunks.

    Returns:
        The kept chunks joined by blank lines, or ``""`` when there is
        nothing to include.
    """
    if max_chars <= 0:
        return ""

    seen: set[str] = set()
    parts: list[str] = []
    total = 0
    for d in docs:
        content = d.page_content.strip()
        if not content:
            # An empty chunk would only add a stray separator.
            continue
        fingerprint = content[:120]
        if fingerprint in seen:
            continue
        seen.add(fingerprint)
        if total + len(content) > max_chars:
            if not parts:
                # Top-ranked chunk alone exceeds the budget: keep its head
                # instead of returning no context at all.
                parts.append(content[:max_chars].rstrip())
            break
        parts.append(content)
        total += len(content)
    return "\n\n".join(parts)

    
def clean_output(text: str) -> str:
    """Normalise a raw model reply before it is shown to the visitor.

    Steps, in order: drop any ``<think>`` reasoning block, drop a leading
    ``/no_think`` marker, turn "•" bullets into "-", and collapse runs of
    three or more newlines into a single blank line.
    """
    cleaned = text.strip()

    # Keep only what follows the last closing tag; if there is an opening
    # tag but no closing one (malformed block), keep what precedes it.
    _, closing_tag, after_think = cleaned.rpartition("</think>")
    if closing_tag:
        cleaned = after_think.strip()
    else:
        cleaned = cleaned.partition("<think>")[0].strip()

    cleaned = cleaned.removeprefix("/no_think").strip()

    with_dashes = cleaned.replace("•", "-")
    return re.sub(r"\n{3,}", "\n\n", with_dashes)

    
def answer_question(question: str, history: "list[dict] | None" = None) -> tuple[str, list[Document]]:
    """Answer ``question`` from retrieved context and recent chat history.

    The history is sanitised once (only ``user``/``assistant`` entries with
    non-blank string content are kept) and that cleaned list is used both to
    build the retrieval query and to supply the last four turns to the model.

    Args:
        question: The current user question.
        history: Prior chat messages as ``{"role": ..., "content": ...}``
            dicts, oldest first. Omitted or ``None`` means no history.

    Returns:
        A ``(answer, docs)`` tuple: the cleaned model reply and the documents
        returned by the retriever for this question.
    """
    valid_roles = {"user", "assistant"}
    # Sanitise before any use so a malformed entry cannot break retrieval.
    clean_history = [
        m for m in (history or [])
        if m.get("role") in valid_roles
        and isinstance(m.get("content"), str)
        and m["content"].strip()
    ]

    docs = retriever.invoke(combined_query(question, clean_history))
    context = build_context(docs)

    messages = [{"role": "system", "content": SYSTEM_PROMPT.format(context=context)}]
    messages.extend(
        {"role": m["role"], "content": m["content"]} for m in clean_history[-4:]
    )
    messages.append({"role": "user", "content": question})
    # NOTE(review): the trailing assistant turn carrying "/no_think" is kept
    # exactly as before; clean_output strips that marker if it is echoed back.
    # Presumably this is meant to suppress thinking output — confirm the
    # provider treats a final assistant message the way this relies on.
    messages.append({"role": "assistant", "content": "/no_think"})

    response = client.chat_completion(messages=messages, max_tokens=512)
    # The message content may be None; fall back to an empty reply rather
    # than crashing inside clean_output.
    raw_reply = response.choices[0].message.content or ""
    return clean_output(raw_reply), docs