"""Retrieval-augmented question answering for the portfolio chatbot.

Builds a Chroma-backed MMR retriever and a Hugging Face inference client at
import time, and exposes ``answer_question`` as the entry point.
"""

import os
import re
from pathlib import Path
from typing import Optional

from dotenv import load_dotenv
from huggingface_hub import InferenceClient
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings

load_dotenv(override=True)

client = InferenceClient(
    model="dnotitia/Qwen3-4B-Instruct-2507:featherless-ai",
    token=os.getenv("HF_TOKEN"),
)

# The vector store lives in "vector_db", two directory levels above this file.
DB_NAME = str(Path(__file__).parent.parent / "vector_db")

embeddings = HuggingFaceEmbeddings(
    model_name="Qwen/Qwen3-Embedding-0.6B",
    model_kwargs={"trust_remote_code": True},
)

vectorstore = Chroma(
    persist_directory=DB_NAME,
    embedding_function=embeddings,
    collection_name="docs",
)

# MMR diversifies retrieved chunks and avoids pulling many near-identical
# overlap chunks about the same topic: fetch_k=30 candidates are fetched,
# then k=9 are picked balancing relevance and diversity (lambda_mult=0.7).
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 9, "fetch_k": 30, "lambda_mult": 0.7},
)

SYSTEM_PROMPT = """
You are Abot — a concise AI assistant on Anurag's portfolio website.
Answer questions about Anurag Bhusare using only the context provided.
Visitors are typically recruiters or engineers wanting quick, clear answers.
Refer to Anurag in third person. Never talk about yourself.

RULES:
- Match your answer length to the question — a simple question gets a short answer, a broad question gets a broader one
- Default to concise — lead with the most important information first
- Never include deployment details, setup steps, config, or implementation specifics unless directly asked
- No headers or emoji sections for short answers
- If you don't know something, say: "I don't have that information."

Context:
{context}
"""

# Only these roles are forwarded to the model from caller-supplied history.
_VALID_ROLES = frozenset({"user", "assistant"})

# NOTE(review): the tag literals were empty strings in the original source
# (which made ``str.split("")`` raise ValueError on every call). They are
# assumed to be Qwen-style thinking tags — confirm against real model output.
_THINK_OPEN = "<think>"
_THINK_CLOSE = "</think>"


def _usable_messages(history: list[dict]) -> list[dict]:
    """Return history entries that have a valid role and non-blank text content."""
    usable = []
    for m in history:
        content = m.get("content")
        if (
            m.get("role") in _VALID_ROLES
            and isinstance(content, str)
            and content.strip()
        ):
            usable.append(m)
    return usable


def combined_query(question: str, history: list[dict]) -> str:
    """Prepend recent user turns so follow-up questions retrieve correctly.

    Only the last four history entries are considered, and of those only
    well-formed ``user`` messages contribute to the retrieval query.
    Malformed entries (missing role/content) are skipped instead of raising.
    """
    prior = [
        m["content"]
        for m in _usable_messages(history[-4:])
        if m["role"] == "user"
    ]
    return " ".join(prior + [question])


def build_context(docs: list[Document], max_chars: int = 6000) -> str:
    """Deduplicate overlap chunks and cap total chars to stay within HF token limits.

    Chunks are considered duplicates when their first 120 characters match.
    Accumulation stops at the first chunk that would exceed ``max_chars``.
    """
    seen: set[str] = set()
    parts: list[str] = []
    total = 0
    for d in docs:
        content = d.page_content.strip()
        fingerprint = content[:120]
        if fingerprint in seen:
            continue
        seen.add(fingerprint)
        if total + len(content) > max_chars:
            break
        parts.append(content)
        total += len(content)
    return "\n\n".join(parts)


def clean_output(text: str) -> str:
    """Normalise raw model output for display.

    Removes thinking blocks, a leading ``/no_think`` marker, converts bullet
    characters to dashes and collapses runs of three or more newlines.
    """
    text = text.strip()
    # Remove model thinking blocks.
    if _THINK_CLOSE in text:
        # Keep only what follows the last closing tag.
        text = text.split(_THINK_CLOSE)[-1].strip()
    elif _THINK_OPEN in text:
        # Malformed block fallback: an opening tag that never closes —
        # keep only what precedes it.
        text = text.split(_THINK_OPEN)[0].strip()
    text = text.removeprefix("/no_think").strip()
    text = text.replace("•", "-")
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text


def answer_question(
    question: str, history: Optional[list[dict]] = None
) -> tuple[str, list[Document]]:
    """Answer ``question`` using retrieved context and recent chat history.

    Args:
        question: The visitor's current question.
        history: Prior chat messages as ``{"role": ..., "content": ...}``
            dicts; ``None`` (the default) means no history.

    Returns:
        A tuple of the cleaned model answer and the retrieved documents.
    """
    # Avoid a shared mutable default argument.
    history = [] if history is None else history

    docs = retriever.invoke(combined_query(question, history))
    context = build_context(docs)

    messages = [{"role": "system", "content": SYSTEM_PROMPT.format(context=context)}]
    for m in _usable_messages(history)[-4:]:
        messages.append({"role": m["role"], "content": m["content"]})
    messages.append({"role": "user", "content": question})
    messages.append({"role": "assistant", "content": "/no_think"})

    response = client.chat_completion(messages=messages, max_tokens=512)
    # The message content may be None; treat that as an empty answer.
    raw = response.choices[0].message.content or ""
    return clean_output(raw), docs