"""Retrieval-only RAG engine for chat responses."""

from __future__ import annotations

import logging
import os
import re
from pathlib import Path

from ingestion_engine.embedding_generator import generate_query
from persistence.vector_store import VectorStore

logger = logging.getLogger(__name__)

K_RETRIEVE = 40  # stage 1: candidate pool size pulled from the vector store
K_FINAL = 8  # stage 2: chunks kept after reranking
ALPHA = 0.05  # weight of the lexical keyword-hit boost in the combined score
MAX_SNIPPET_CHARS = 280  # truncation length for citation snippets
GEN_MODEL = "Qwen/Qwen2.5-7B-Instruct"
MAX_NEW_TOKENS = 400
TEMPERATURE = 0.2
TIMEOUT_SEC = 45  # Hugging Face Inference API timeout (seconds)

PROMPT_FILE = Path(__file__).resolve().parent.parent / "artifacts" / "prompt.poml"


def _parse_poml() -> tuple[str, str]:
    """Parse prompt.poml into (system_message, user_template)."""
    raw = PROMPT_FILE.read_text(encoding="utf-8")

    # System message: <role> followed by the numbered <item> rules
    # (the regexes below match these tags anywhere in the file, not scoped to <system>)
    role_m = re.search(r"<role>(.*?)</role>", raw, re.DOTALL)
    role = role_m.group(1).strip() if role_m else ""
    items = re.findall(r"<item>(.*?)</item>", raw, re.DOTALL)
    rules = "\n".join(f"{i+1}) {it.strip()}" for i, it in enumerate(items))
    system_msg = f"{role}\nRules:\n{rules}" if role else rules

    # User template: content inside <template>
    tmpl_m = re.search(r"<template>(.*?)</template>", raw, re.DOTALL)
    user_template = tmpl_m.group(1).strip() if tmpl_m else "{{question}}\n\n{{context}}"

    return system_msg, user_template
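
# Illustrative only: a minimal prompt.poml shape that _parse_poml would accept.
# The real artifacts/prompt.poml may differ; the only assumptions the regexes
# above rely on are the <role>, <item>, and <template> tags:
#
#   <system>
#     <role>You are a grounded assistant that answers from cited sources.</role>
#     <item>Answer only from the provided context.</item>
#     <item>Cite supporting passages as [S1], [S2], ...</item>
#   </system>
#   <template>
#     Question: {{question}}
#
#     Context:
#     {{context}}
#   </template>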


def _clean_text(text: str) -> str:
    return " ".join((text or "").split())


def _tokenize_keywords(text: str) -> set[str]:
    """Lowercase, split on non-alphanumerics, and keep tokens of length >= 3."""
    tokens = re.split(r"[^a-z0-9]+", (text or "").lower())
    return {t for t in tokens if len(t) >= 3}


def _keyword_hit_count(query_keywords: set[str], chunk_text: str) -> int:
    if not query_keywords:
        return 0
    chunk_tokens = _tokenize_keywords(chunk_text)
    return len(query_keywords.intersection(chunk_tokens))


def _rerank_matches(query: str, matches: list[dict]) -> list[dict]:
    """Stage 2 rerank: pinecone score + ALPHA * lexical keyword hits."""
    query_keywords = _tokenize_keywords(query)
    rescored = []
    for m in matches:
        pinecone_score = float(m.get("score", 0.0) or 0.0)
        hits = _keyword_hit_count(query_keywords, m.get("text", ""))
        combined_score = pinecone_score + ALPHA * hits
        rescored.append(
            {
                **m,
                "keyword_hit_count": hits,
                "combined_score": combined_score,
            }
        )
    rescored.sort(key=lambda x: x.get("combined_score", 0.0), reverse=True)
    deduped = []
    seen = set()
    for item in rescored:
        key = (
            item.get("source_filename", ""),
            int(item.get("chunk_index", 0) or 0),
            _clean_text(item.get("text", "")),
        )
        if key in seen:
            continue
        seen.add(key)
        deduped.append(item)
        if len(deduped) >= K_FINAL:
            break
    return deduped
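
# Worked example of the combined score: a chunk with Pinecone score 0.82 whose
# text shares 3 keywords (length >= 3) with the query scores
# 0.82 + ALPHA * 3 = 0.82 + 0.05 * 3 = 0.97, so a few exact keyword overlaps
# can lift a chunk past a slightly stronger pure-vector match.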


def _build_citations(matches: list[dict]) -> list[dict]:
    """Convert vector matches into the citation format used by pages/chat.py."""
    citations = []
    seen = set()

    for match in matches:
        source = match.get("source_filename", "Unknown source")
        chunk_index = int(match.get("chunk_index", 0) or 0)
        key = (source, chunk_index)
        if key in seen:
            continue
        seen.add(key)

        snippet = _clean_text(match.get("text", ""))
        if len(snippet) > MAX_SNIPPET_CHARS:
            snippet = snippet[:MAX_SNIPPET_CHARS].rstrip() + "..."

        citations.append(
            {
                "source": source,
                "page": chunk_index,
                "text": snippet,
            }
        )

    return citations


def _build_content(matches: list[dict]) -> str:
    """Format reranked matches as a readable retrieval-only answer (used when generation is unavailable)."""
    if not matches:
        return (
            "I couldn't find relevant information in your uploaded sources for that question. "
            "Try rephrasing the question or adding more sources."
        )

    lines = ["Based on your uploaded sources, here are the most relevant passages:", ""]
    for idx, match in enumerate(matches, start=1):
        source = match.get("source_filename", "Unknown source")
        chunk_index = int(match.get("chunk_index", 0) or 0)
        score = float(match.get("score", 0.0) or 0.0)
        combined = float(match.get("combined_score", score) or score)
        hits = int(match.get("keyword_hit_count", 0) or 0)
        snippet = _clean_text(match.get("text", ""))
        if len(snippet) > MAX_SNIPPET_CHARS:
            snippet = snippet[:MAX_SNIPPET_CHARS].rstrip() + "..."

        lines.append(
            f"{idx}. **{source}** (chunk {chunk_index}, pinecone: {score:.3f}, hits: {hits}, combined: {combined:.3f})"
        )
        lines.append(f"   {snippet}")
        lines.append("")

    lines.append("This is a two-stage retrieval-only response (no LLM synthesis yet).")
    return "\n".join(lines)


def _build_context_text(reranked_matches: list[dict]) -> str:
    """Build formatted context from reranked chunks."""
    blocks = []
    for idx, match in enumerate(reranked_matches, start=1):
        source = match.get("source_filename", "Unknown source")
        chunk_index = int(match.get("chunk_index", 0) or 0)
        text = _clean_text(match.get("text", ""))
        blocks.append(f"[S{idx}] source={source} chunk={chunk_index}\n{text}")
    return "\n\n".join(blocks)


def _build_user_message(question: str, reranked_matches: list[dict], history: list[dict], max_history: int = 5) -> str:
    """Build the user message from POML template + chat history + context."""
    _, user_template = _parse_poml()
    context_text = _build_context_text(reranked_matches)

    prompt = user_template.replace("{{question}}", question).replace("{{context}}", context_text)

    if history:
        history_text = ""
        for msg in history[-max_history:]:
            role = msg.get("role", "").capitalize()
            content = msg.get("content", "")
            history_text += f"{role}: {content}\n"
        return f"CHAT HISTORY:\n{history_text}\n{prompt}"

    return prompt
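
# Illustrative shape of the assembled user message when history is present;
# everything after the blank line is whatever the POML template renders with
# {{question}} and {{context}} filled in (example values are hypothetical):
#
#   CHAT HISTORY:
#   User: What changed in Q3?
#   Assistant: Revenue grew 12% ...
#
#   <rendered template>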


def _generate_answer(question: str, context_chunks: list[dict], chat_history: list[dict]) -> str:
    """Generate a grounded response using Hugging Face Inference API + chat history."""
    from huggingface_hub import InferenceClient

    token = os.environ.get("HF_TOKEN")  # optional, but unauthenticated calls may be throttled or rejected
    client = InferenceClient(token=token, timeout=TIMEOUT_SEC)

    system_msg, _ = _parse_poml()
    user_msg = _build_user_message(question, context_chunks, chat_history)

    response = client.chat_completion(
        model=GEN_MODEL,
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
        ],
        max_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
    )
    content = response.choices[0].message.content if response and response.choices else ""
    return (content or "").strip()


def rag_answer(question: str, notebook_id: str, chat_history: list[dict] | None = None) -> dict:
    """Answer a question from the notebook's sources: {"content": str, "citations": list}.

    Tries LLM generation over the reranked chunks (including chat history) and
    falls back to the retrieval-only summary if generation fails or returns nothing.
    """
    q = (question or "").strip()
    if not q:
        return {"content": "Please enter a question.", "citations": []}

    try:
        query_vector = generate_query(q)
        # Stage 1: retrieve candidate pool
        matches = VectorStore().query(query_vector=query_vector, namespace=notebook_id, top_k=K_RETRIEVE)
        candidates = [m for m in matches if m.get("text")]
        if not candidates:
            return {
                "content": (
                    "I couldn't find relevant information in your uploaded sources for that question. "
                    "Try rephrasing the question or adding more sources."
                ),
                "citations": [],
            }

        # Stage 2: rerank and keep top K_FINAL
        final_matches = _rerank_matches(q, candidates)
        citations = _build_citations(final_matches)
        retrieval_only = _build_content(final_matches)

        try:
            generated = _generate_answer(q, final_matches, chat_history or [])
            content = generated or retrieval_only
        except Exception as e:
            logger.warning("Generation failed, falling back to retrieval-only content: %s", e)
            content = retrieval_only

        return {
            "content": content,
            "citations": citations,
        }

    except Exception as e:
        logger.error("RAG retrieval failed: %s", e)
        return {
            "content": f"I ran into an error while retrieving from sources: {e}",
            "citations": [],
        }
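

if __name__ == "__main__":
    # Minimal local smoke test, not part of the app. Assumes VectorStore can
    # reach its index and HF_TOKEN is set; "demo-notebook" is a hypothetical
    # namespace that already has ingested sources.
    import json

    result = rag_answer("What are the key findings?", "demo-notebook")
    print(result["content"])
    print(json.dumps(result["citations"], indent=2))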