Spaces:

LightRT
/

pdf_rag

Sleeping

App Files Files Community

LightRT committed 4 days ago

Commit

9cc7f8d

0 Parent(s):

Initial completely clean deployment

Browse files

Files changed (23) hide show

.dockerignore +15 -0
.gitignore +12 -0
.python-version +1 -0
README.md +0 -0
app.py +92 -0
backend.Dockerfile +20 -0
docker-compose.yaml +26 -0
frontend.Dockerfile +14 -0
main.py +6 -0
pyproject.toml +26 -0
requirements.txt +0 -0
src/__pycache__/embedding.cpython-312.pyc +0 -0
src/__pycache__/graph.cpython-312.pyc +0 -0
src/__pycache__/ingestion.cpython-312.pyc +0 -0
src/__pycache__/main.cpython-312.pyc +0 -0
src/__pycache__/retrieval.cpython-312.pyc +0 -0
src/embedding.py +92 -0
src/fix_db.py +25 -0
src/graph.py +193 -0
src/ingestion.py +46 -0
src/main.py +84 -0
src/retrieval.py +95 -0
uv.lock +0 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,15 @@

+# Virtual Environments
+.venv/
+venv/
+env/
+# Python Cache
+__pycache__/
+*.pyc
+*.pyo
+# Git
+.git/
+# Secrets: both Dockerfiles run `COPY . .`, so keep the local .env out of the image
+# (docker-compose.yaml already injects it at runtime through env_file)
+.env
+# Ignore local database files if Qdrant creates any locally
+qdrant_storage/

.gitignore ADDED Viewed

	@@ -0,0 +1,12 @@

+.env
+.venv/
+venv/
+env/
+__pycache__/
+*.pyc
+data/uploads/
+.DS_Store

.python-version ADDED Viewed

	@@ -0,0 +1 @@


1	+ 3.12

README.md ADDED Viewed

File without changes

app.py ADDED Viewed

	@@ -0,0 +1,92 @@

+import streamlit as st
+import requests
+import uuid
+# 1. PAGE CONFIGURATION
+st.set_page_config(page_title="Enterprise RAG Assistant", page_icon="🤖", layout="centered")
+st.title("📚 Enterprise Document Assistant")
+st.markdown("Upload a PDF to the knowledge base and ask questions about it.")
+# 2. SESSION STATE INITIALIZATION (The Memory Bank)
+if "user_id" not in st.session_state:
+    st.session_state.user_id = str(uuid.uuid4())
+if "thread_id" not in st.session_state:
+    st.session_state.thread_id = str(uuid.uuid4())
+if "messages" not in st.session_state:
+    st.session_state.messages = []
+# 3. SIDEBAR: PDF UPLOAD (The Handoff to FastAPI)
+with st.sidebar:
+    st.header("Document Ingestion")
+    uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
+    if st.button("Process Document"):
+        if uploaded_file:
+            with st.spinner("Transmitting to backend..."):
+                # Package the file as multipart/form-data
+                files = {"file": (uploaded_file.name, uploaded_file.getvalue(), "application/pdf")}
+                payload_data = {"user_id": st.session_state.user_id}
+                # Send the POST request to your local FastAPI server
+                try:
+                    response = requests.post(
+                        "http://backend:8000/upload",
+                        files=files,
+                        data=payload_data
+                    )
+                    if response.status_code == 200:
+                        st.success("File uploaded! The AI is reading it in the background.")
+                    else:
+                        st.error(f"Upload failed: {response.text}")
+                except requests.exceptions.ConnectionError:
+                    st.error("Cannot connect to backend. Is FastAPI running?")
+        else:
+            st.warning("Please select a file first.")
+# 4. CHAT HISTORY RENDERING
+for msg in st.session_state.messages:
+    # This creates a chat bubble. role is either 'user' or 'assistant'
+    with st.chat_message(msg["role"]):
+        st.markdown(msg["content"])
+# 5. CHAT INPUT & BACKEND COMMUNICATION
+if prompt := st.chat_input("Ask a question about your documents..."):
+    # Immediately render the user's new message to the UI
+    st.session_state.messages.append({"role": "user", "content": prompt})
+    with st.chat_message("user"):
+        st.markdown(prompt)
+    # Show a loading indicator while we wait for FastAPI and LangGraph
+    with st.chat_message("assistant"):
+        message_placeholder = st.empty()
+        message_placeholder.markdown("*(Thinking...)*")
+        # Prepare the JSON payload for FastAPI
+        payload = {
+            "message": prompt,
+            "user_id": st.session_state.user_id,
+            "thread_id": st.session_state.thread_id
+        }
+        try:
+            # Send the question to your LangGraph backend
+            chat_response = requests.post("http://backend:8000/chat", json=payload)
+            if chat_response.status_code == 200:
+                # Extract the answer from the JSON response
+                answer = chat_response.json().get("response", "No response found.")
+                # Update the UI placeholder with the actual answer
+                message_placeholder.markdown(answer)
+                # Save the AI's answer to the session state memory
+                st.session_state.messages.append({"role": "assistant", "content": answer})
+            else:
+                message_placeholder.error(f"Error: {chat_response.text}")
+        except requests.exceptions.ConnectionError:
+            message_placeholder.error("Cannot connect to backend. Is FastAPI running?")

backend.Dockerfile ADDED Viewed

	@@ -0,0 +1,20 @@

+# Use an official, lightweight Python image
+# (3.12 matches .python-version and the requires-python constraint in pyproject.toml)
+FROM python:3.12-slim
+# Set the working directory inside the container
+WORKDIR /app
+# Copy the requirements file and install dependencies
+# (We use standard pip inside the container because it's universally stable)
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy all your project files into the container
+COPY . .
+# Port the API listens on inside the container; must match the --port passed to uvicorn below
+EXPOSE 7860
+# Start the FastAPI app (src/main.py) with uvicorn on the exposed port
+CMD ["uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "7860"]

docker-compose.yaml ADDED Viewed

	@@ -0,0 +1,26 @@

+version: '3.8'
+services:
+  backend:
+    build:
+      context: .
+      dockerfile: backend.Dockerfile
+    # backend.Dockerfile starts uvicorn on port 7860, but the port mapping below and the
+    # frontend (app.py calls http://backend:8000) both expect 8000, so override the
+    # start command for compose runs.
+    command: ["uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "8000"]
+    ports:
+      - "8000:8000"
+    env_file:
+      - .env
+    # Restart the container automatically whenever it exits
+    restart: always
+  frontend:
+    build:
+      context: .
+      dockerfile: frontend.Dockerfile
+    ports:
+      - "8501:8501"
+    env_file:
+      - .env
+    # Start the backend container before the frontend (start order only; this does
+    # not wait for the API to be ready to accept requests)
+    depends_on:
+      - backend
+    restart: always

frontend.Dockerfile ADDED Viewed

	@@ -0,0 +1,14 @@

+# 3.12 matches .python-version and the requires-python constraint in pyproject.toml
+FROM python:3.12-slim
+WORKDIR /app
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY . .
+# Expose the port Streamlit runs on
+EXPOSE 8501
+# The command to start the UI
+CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]

main.py ADDED Viewed

	@@ -0,0 +1,6 @@

+def main():
+    """Print the project greeting to stdout."""
+    greeting = "Hello from pdf-qa-chatbot!"
+    print(greeting)
+# Run only when executed as a script, not when imported
+if __name__ == "__main__":
+    main()

pyproject.toml ADDED Viewed

	@@ -0,0 +1,26 @@

+[project]
+name = "pdf-qa-chatbot"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "docling>=2.96.1",
+    "fastapi>=0.136.3",
+    "fastembed>=0.8.0",
+    "langchain>=1.3.2",
+    "langchain-community>=0.4.2",
+    "langchain-core>=1.4.0",
+    "langchain-openai>=1.2.2",
+    "langgraph>=1.2.2",
+    "langgraph-checkpoint-postgres>=3.1.0",
+    "langsmith>=0.8.8",
+    "psycopg[binary]>=3.3.4",
+    "pydantic>=2.13.4",
+    "python-dotenv>=1.2.2",
+    "qdrant-client>=1.18.0",
+    "streamlit>=1.58.0",
+    "tavily>=1.1.0",
+    "transformers>=5.9.0",
+    "uuid>=1.30",
+]

requirements.txt ADDED Viewed

Binary file (560 Bytes). View file

src/__pycache__/embedding.cpython-312.pyc ADDED Viewed

Binary file (4.18 kB). View file

src/__pycache__/graph.cpython-312.pyc ADDED Viewed

Binary file (8.81 kB). View file

src/__pycache__/ingestion.cpython-312.pyc ADDED Viewed

Binary file (1.91 kB). View file

src/__pycache__/main.cpython-312.pyc ADDED Viewed

Binary file (4.25 kB). View file

src/__pycache__/retrieval.cpython-312.pyc ADDED Viewed

Binary file (4.09 kB). View file

src/embedding.py ADDED Viewed

	@@ -0,0 +1,92 @@

+from src.ingestion import ingestion_and_chunking
+from qdrant_client import QdrantClient
+from qdrant_client.models import Distance, VectorParams, SparseVectorParams, PointStruct
+from fastembed import SparseTextEmbedding
+import uuid
+from dotenv import load_dotenv
+import os
+from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
+# Load variables from a local .env file (if any) before reading the settings below
+load_dotenv()
+# Qdrant endpoint and credentials, plus the Hugging Face token used for dense embeddings
+qdrant_url = os.environ.get("QDRANT_URL")
+qdrant_api_key = os.environ.get("QDRANT_API_KEY")
+hf_token = os.environ.get("HF_TOKEN")
+def upload_file(file_path: str, user_id: str, collection_name="pdf_rag_chat"):
+    """Chunk a document, embed every chunk, and upsert the chunks into Qdrant.
+    Args:
+        file_path: Path of the document handed to ingestion_and_chunking.
+        user_id: Owner of the document; stored in each point's payload and indexed.
+        collection_name: Target Qdrant collection, created on first use.
+    Returns:
+        None. Failures during ingestion/embedding/upsert are printed, not raised,
+        because main.py runs this function as a background task.
+    """
+    # NOTE(review): Retriever in src/retrieval.py reads from 'pdf_rag_v3' by default while
+    # this function writes to "pdf_rag_chat" — confirm both sides use the same collection.
+    client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
+    dense_model = HuggingFaceInferenceAPIEmbeddings(
+        api_key=hf_token,
+        model_name="sentence-transformers/all-MiniLM-L6-v2")
+    sparse_model = SparseTextEmbedding(model_name="Qdrant/bm25")
+    # Only the collection creation is conditional; everything below runs on every upload
+    if not client.collection_exists(collection_name):
+        client.create_collection(
+            collection_name=collection_name,
+            vectors_config={
+                "dense": VectorParams(size=384, distance=Distance.COSINE)
+            },
+            sparse_vectors_config={
+                "sparse": SparseVectorParams()
+            }
+        )
+    try:
+        docs = ingestion_and_chunking(file_path)
+        if not docs:
+            # Nothing to embed or upsert; avoid sending empty batches to the APIs
+            print(f"No text chunks were extracted from '{file_path}'; nothing to upload.", flush=True)
+            return
+        texts = [doc.page_content for doc in docs]
+        dense_vectors = dense_model.embed_documents(texts)
+        # Guard against an error payload or a short result coming back instead of one vector per chunk
+        if not isinstance(dense_vectors, list) or len(dense_vectors) != len(docs):
+            raise RuntimeError(f"Dense embedding call returned an unexpected payload: {dense_vectors!r}")
+        sparse_vectors = list(sparse_model.embed(texts))
+        # One file_id per upload (stored in the payload so chunks tie back to one file);
+        # each chunk additionally gets its own unique point id.
+        file_id = str(uuid.uuid4())
+        points = [
+            PointStruct(
+                id=str(uuid.uuid4()),
+                vector={
+                    'dense': dense_vec,
+                    # FastEmbed returns array-like indices/values; convert to plain lists
+                    'sparse': {
+                        "indices": sparse_emb.indices.tolist(),
+                        "values": sparse_emb.values.tolist()
+                    }
+                },
+                payload={
+                    'user_id': user_id,
+                    'file_id': file_id,
+                    'text': doc.page_content,
+                    "source": doc.metadata.get("source"),
+                    "pages": doc.metadata.get("pages"),
+                    "section": doc.metadata.get("section")
+                }
+            )
+            for doc, dense_vec, sparse_emb in zip(docs, dense_vectors, sparse_vectors)
+        ]
+        # Best effort: make sure user_id is indexed for filtered queries. A failure here
+        # must not block the upload, but it should be visible in the logs.
+        try:
+            client.create_payload_index(
+                collection_name=collection_name,
+                field_name="user_id",
+                field_schema="keyword"
+            )
+        except Exception as index_error:
+            print(f"Could not create the user_id payload index: {index_error}", flush=True)
+        # Send to database
+        client.upsert(collection_name=collection_name, points=points)
+    except Exception as e:
+        print("\n" + "!"*60, flush=True)
+        print("❌ UPLOAD FAILED SILENTLY IN BACKGROUND:", flush=True)
+        print(str(e), flush=True)
+        print("!"*60 + "\n", flush=True)

src/fix_db.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import os
+from qdrant_client import QdrantClient
+from dotenv import load_dotenv
+# Pull QDRANT_URL / QDRANT_API_KEY from a local .env file when one is present
+load_dotenv()
+# Collection that receives the payload index.
+# NOTE(review): src/retrieval.py defaults to 'pdf_rag_v3' and src/embedding.py to
+# "pdf_rag_chat" — confirm which collection this one-off script is meant to target.
+COLLECTION_NAME = "pdf_rag"
+qdrant_settings = {
+    "url": os.getenv("QDRANT_URL"),
+    "api_key": os.getenv("QDRANT_API_KEY"),
+}
+client = QdrantClient(**qdrant_settings)
+print(f"Attempting to build index for '{COLLECTION_NAME}'...")
+try:
+    # Keyword index on user_id, the field the retriever filters on
+    client.create_payload_index(
+        collection_name=COLLECTION_NAME,
+        field_name="user_id",
+        field_schema="keyword"
+    )
+except Exception as e:
+    print(f"❌ FAILED: {e}")
+else:
+    print("✅ Index built successfully! Qdrant is ready.")

src/graph.py ADDED Viewed

	@@ -0,0 +1,193 @@

+from typing import TypedDict , Annotated , List
+from langgraph.graph.message import add_messages
+from langchain_core.messages import SystemMessage , HumanMessage
+from langchain_openai import ChatOpenAI
+import os
+from src.retrieval import Retriever
+import os
+from tavily import TavilyClient
+from dotenv import load_dotenv
+from langgraph.graph import StateGraph, START ,END
+from langgraph.checkpoint.postgres import PostgresSaver
+from psycopg_pool import ConnectionPool
+load_dotenv()
+class State(TypedDict) :
+    """State dictionary shared by the nodes of the LangGraph workflow in this module."""
+    # Conversation messages; the add_messages reducer merges node updates into this list
+    messages : Annotated[list , add_messages]
+    # Chunks for the current question: dicts with text/source/pages/section keys,
+    # written by retrieve_node and extended by web_search_node
+    context : List[dict]
+    # Search string produced by rewrite_node
+    rewritten_query : str
+    # Identifier passed to the retriever to restrict results to one user's documents
+    user_id : str
+    # Set by answer_node; read by routing() to choose between web search and END
+    web_search_needed : bool
+    # Incremented by web_search_node; answer_node switches prompts once it is >= 1
+    retry : int
+# Chat model used by both rewrite_node and answer_node. Requests go to the
+# OpenRouter base URL below, authenticated with OPENROUTER_API_KEY.
+llm = ChatOpenAI(
+    model="openai/gpt-4o-mini",
+    openai_api_key=os.getenv("OPENROUTER_API_KEY"),
+    openai_api_base="https://openrouter.ai/api/v1",
+    temperature=0
+)
+# Module-level singletons shared by every graph invocation
+retriever = Retriever()
+tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
+def rewrite_node(state : State) :
+    """Turn the newest user message (plus earlier user messages) into one search query.
+    Returns a partial state update containing only `rewritten_query`.
+    """
+    # Only the human side of the conversation is shown to the query rewriter
+    human_turns = [m for m in state['messages'] if isinstance(m , HumanMessage)]
+    newest_question = human_turns[-1].content
+    earlier_questions = "\n".join(turn.content for turn in human_turns[:-1])
+    # Strict instructions: output search keywords only, resolving pronouns from history
+    instructions = SystemMessage(content="""You are an expert search query generator for a vector database.
+Your ONLY job is to convert the user's latest input into a single, highly optimized search string.
+You will receive a sequence of the user's previous questions, followed by their newest input.
+CRITICAL RULES:
+1. TRACK THE TRAIN OF THOUGHT: If the latest input uses pronouns (it, they, this) or is a fragment (e.g., "What about the budget?"), identify the core noun from the previous questions and substitute it.
+2. NO CONVERSATIONAL FILLER: Do not answer the question. Do not explain your reasoning.
+3. FORMAT: Output only the raw search keywords. No commas, no bullet points.
+Example Input:
+Chat History:
+What is the main objective of Project Chronos?
+Who is the lead engineer?
+Latest User Input: What is his total budget for Q4?
+Example Output: Project Chronos lead engineer budget
+""")
+    # History and newest input travel together in a single human message
+    request = HumanMessage(content=f"Chat History: {earlier_questions}\n\nLatest User Input: {newest_question}\n\nGenerate the concise search query now:")
+    response = llm.invoke([instructions, request])
+    # Debug trace of the rewritten query
+    print("\n" + "="*60, flush=True)
+    print(f"\n ReQuery : \n{response.content} \n", flush=True)
+    print("="*60 + "\n", flush=True)
+    return {'rewritten_query' : response.content}
+def retrieve_node(state : State) :
+    """Fetch the chunks matching the rewritten query for the current user.
+    Returns a partial state update with the fresh `context` and `retry` reset to 0.
+    """
+    user_id = state['user_id']
+    re_query = state['rewritten_query']
+    context = retriever.retrieve(re_query , user_id)
+    # src/main.py compiles this graph with a checkpointer keyed by thread_id, so `retry`
+    # left at 1 by a web search in an earlier turn would otherwise carry over and make
+    # answer_node skip the web-search fallback for the rest of the thread. This node
+    # runs exactly once per turn, before answer_node, so reset the counter here.
+    return {'context' : context , 'retry' : 0}
+def answer_node(state : State) :
+    """Answer the latest question from the current context, or request a web search.
+    First attempt (retry < 1): the model must answer from the chunks or reply with the
+    sentinel FALLBACK_TO_WEB_SEARCH, in which case only {"web_search_needed": True}
+    is returned. Later attempts: the context also holds web results and the model is
+    told not to ask for another search.
+    Returns a partial state update: the AI message plus web_search_needed=False, or
+    web_search_needed=True alone when a web search is requested.
+    """
+    # Local import: only needed for the reply used when the fallback is exhausted
+    from langchain_core.messages import AIMessage
+    messages = state['messages']
+    context = state['context']
+    retry = state.get('retry' , 0)
+    if not context:
+        context_text = "No relevant context found in the database for this specific query."
+    else:
+        context_text = "".join(
+            f"\n--- Document Chunk {number} ---\n"
+            f"Source: {chunk.get('source', 'Unknown')}\n"
+            f"Pages: {chunk.get('pages', 'N/A')}\n"
+            f"Section: {chunk.get('section', 'N/A')}\n"
+            f"Content: {chunk.get('text', '')}\n"
+            for number, chunk in enumerate(context, start=1)
+        )
+    # Debug trace of the context handed to the model
+    print("\n" + "="*60, flush=True)
+    print(f"\n\nCONTEXT TEXT :\n\n{context_text}", flush=True)
+    print("="*60 + "\n", flush=True)
+    if retry<1 :
+        system_prompt = SystemMessage(content=f"""
+        You are an advanced enterprise RAG assistant. Your job is to answer the user's latest question
+        by strictly analyzing the conversation history and the provided document chunks below.
+        CRITICAL RULES:
+        1. Base your answer ONLY on the text snippets provided in the Context section below. Do not assume or extrapolate.
+        2. If the context does not contain the answer, or if the context is irrelevant to the question,
+        you must reply with exactly this phrase and absolutely nothing else: FALLBACK_TO_WEB_SEARCH
+        3. You MUST inline cite your sources whenever you use information from a chunk.
+        Format your citations cleanly at the end of sentences like this: [Source: file.pdf, Page: X].
+        CONTEXT DATA:
+        {context_text}
+        """)
+    else :
+        # Must be a SystemMessage like the branch above: a bare string in the message
+        # list would not be sent with the system role.
+        system_prompt = SystemMessage(content=f"""
+        You are an advanced enterprise RAG assistant. Your job is to answer the user's latest question
+        by strictly analyzing the conversation history and the provided document chunks below.
+        These chunks now include both internal documents and live web search results.
+        CRITICAL RULES:
+        1. Base your answer ONLY on the text snippets provided in the Context section below. Do not assume or extrapolate.
+        2. DO NOT ask for another web search. If the answer is still not found in the provided context, you must politely inform the user that the information is unavailable.
+        3. You MUST inline cite your sources whenever you use information from a chunk.
+           Format your citations cleanly at the end of sentences like this: [Source: file.pdf, Page: X] or [Source: website_url].
+        CONTEXT DATA:
+        {context_text}
+        """)
+    final_msg = [system_prompt] + messages
+    response = llm.invoke(final_msg)
+    wants_web_search = response.content.strip() == "FALLBACK_TO_WEB_SEARCH"
+    if wants_web_search and retry < 1:
+        return {"web_search_needed": True}
+    if wants_web_search:
+        # The web search already ran; requesting it again would loop between this node
+        # and web_search_node until the graph's recursion limit is hit.
+        response = AIMessage(content="I could not find that information in the uploaded documents or on the web.")
+    return {"messages": [response],
+            "web_search_needed": False}
+def routing(state : State) :
+    """Choose the edge to follow after answer_node: the web-search node or "END"."""
+    needs_web_search = state["web_search_needed"]
+    return "web_search_node" if needs_web_search else "END"
+def web_search_node(state : State) :
+    """Append Tavily web results to the current context and bump the retry counter.
+    Returns a partial state update with the combined `context` and `retry` + 1.
+    """
+    query = state['rewritten_query']
+    existing_chunks = state['context']
+    attempts = state.get('retry' , 0)
+    hits = tavily_client.search(query=query , max_results=3)['results']
+    # Shape each web hit like a retrieved document chunk so answer_node can format it
+    web_chunks = [
+        {
+            "text": hit.get("content", ""),
+            "source": hit.get("url", "Live Web Search"),
+            "pages": "N/A",
+            "section": "Internet Result"
+        }
+        for hit in hits
+    ]
+    return {'context' : existing_chunks + web_chunks , 'retry' : attempts + 1}
+# Graph blueprint: rewrite -> retrieve -> answer, with an optional web-search detour
+# that feeds back into answer. Compiled elsewhere (with a checkpointer).
+workflow = StateGraph(State)
+for node_name, node_fn in (
+    ("rewrite_node" , rewrite_node),
+    ("retrieve_node" , retrieve_node),
+    ("answer_node" , answer_node),
+    ("web_search_node" , web_search_node),
+):
+    workflow.add_node(node_name , node_fn)
+for origin, target in (
+    (START , "rewrite_node"),
+    ("rewrite_node" , "retrieve_node"),
+    ("retrieve_node" , "answer_node"),
+    ("web_search_node" , "answer_node"),
+):
+    workflow.add_edge(origin , target)
+# After answering, routing() decides between another pass through web search and finishing
+workflow.add_conditional_edges(
+    "answer_node",
+    routing,
+    {"web_search_node": "web_search_node",
+    "END": END})

src/ingestion.py ADDED Viewed

	@@ -0,0 +1,46 @@

+from docling.document_converter import DocumentConverter
+from docling.chunking import HybridChunker
+from transformers import AutoTokenizer
+from langchain_core.documents import Document
+from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer
+def ingestion_and_chunking(file_path : str) :
+    """Convert a document with Docling and return its chunks as LangChain Documents.
+    Each Document carries the contextualized chunk text plus metadata:
+    source (origin filename), pages (sorted page numbers), section (first heading or None).
+    """
+    parsed = DocumentConverter().convert(file_path)
+    tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
+    # NOTE(review): confirm that HybridChunker honours chunk_size / overlap — verify
+    # against the Docling chunking documentation.
+    chunker = HybridChunker(merge_peers=True ,
+                            chunk_size=800 ,
+                            overlap=200,
+                            tokenizer=tokenizer )
+    chunks = list(chunker.chunk(parsed.document))
+    docs = []
+    for chunk in chunks:
+        # Replace the raw chunk text with the chunker's contextualized version
+        chunk.text = chunker.contextualize(chunk)
+        # Collect every page number referenced by the chunk's items, de-duplicated
+        page_numbers = set()
+        for item in chunk.meta.doc_items:
+            page_numbers.update(prov.page_no for prov in item.prov)
+        headings = chunk.meta.headings
+        metadata = {
+            "source": chunk.meta.origin.filename,
+            "pages": sorted(page_numbers),
+            "section": headings[0] if headings else None,
+        }
+        docs.append(Document(page_content=chunk.text, metadata=metadata))
+    return docs

src/main.py ADDED Viewed

	@@ -0,0 +1,84 @@

+from fastapi import FastAPI , HTTPException , UploadFile, File, BackgroundTasks , Form
+from pydantic import BaseModel , Field
+import os
+from dotenv import load_dotenv
+from src.graph import workflow
+from src.embedding import upload_file
+import shutil
+from langgraph.checkpoint.postgres import PostgresSaver
+from psycopg_pool import ConnectionPool
+# Load variables from a local .env file (if any) before the endpoints read them
+load_dotenv()
+# ASGI application object; backend.Dockerfile serves it as "src.main:app"
+app = FastAPI(
+    title="Enterprise PDF RAG API",
+    description="A production-grade backend powering an intelligent LangGraph agent.",
+    version="1.0.0"
+)
+class ChatRequest(BaseModel):
+    # JSON body accepted by POST /chat; all three fields are required.
+    # user_id is passed into the graph state; thread_id becomes the graph's
+    # configurable thread_id.
+    message: str = Field(..., description="The raw message string from the user.")
+    user_id: str = Field(..., description="The unique identifier for the tenant context.")
+    thread_id: str = Field(..., description="The unique session ID tracking the short-term chat history.")
+def _run_agent(request: "ChatRequest") -> str:
+    # Run the LangGraph workflow synchronously for one chat turn and return the reply text.
+    db_uri = os.getenv("DATABASE_URI")
+    if not db_uri:
+        raise RuntimeError("DATABASE_URI is not configured.")
+    config = {'configurable': {'thread_id': request.thread_id}}
+    initial_state = {
+        "messages": [("user", request.message)],
+        "user_id": request.user_id
+    }
+    # A fresh Postgres connection per request keeps the checkpointer connection alive
+    with PostgresSaver.from_conn_string(db_uri) as checkpointer:
+        # Ensure the checkpoint tables exist
+        checkpointer.setup()
+        # Compile the graph blueprint with this request's checkpointer and run it
+        agent = workflow.compile(checkpointer=checkpointer)
+        result = agent.invoke(initial_state, config=config)
+    output_messages = result.get("messages", [])
+    if not output_messages:
+        raise ValueError("No messages returned from the graph.")
+    return output_messages[-1].content
+@app.post("/chat", summary="Return an answer using the RAG backend to the user query.")
+async def chat_endpoint(request: ChatRequest):
+    # Answers one chat turn. The graph run is blocking (database, retrieval and LLM calls),
+    # so it is pushed to a worker thread instead of being executed on the event loop,
+    # where it would stall every other request. Any failure is reported as HTTP 500.
+    from fastapi.concurrency import run_in_threadpool
+    try:
+        ai_response = await run_in_threadpool(_run_agent, request)
+    except Exception as e:
+        print(f"Backend Error: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Agent Processing Error: {str(e)}") from e
+    return {
+        "status": "success",
+        "thread_id": request.thread_id,
+        "response": ai_response
+    }
+# Directory (relative to the process working directory) where uploaded files are
+# written; created at import time if it does not exist yet
+UPLOAD_DIR = "data/uploads"
+os.makedirs(UPLOAD_DIR, exist_ok=True)
+@app.post("/upload", summary="Upload a PDF and process its embeddings in the background")
+async def upload_pdf(
+    background_tasks: BackgroundTasks,
+    file: UploadFile = File(...),
+    user_id : str = Form(...)
+):
+    # Saves the uploaded file under UPLOAD_DIR and schedules upload_file() as a
+    # background task, so the response returns before ingestion finishes.
+    # The file name comes from the client: keep only its final path component so a
+    # name such as "../../x" cannot write outside UPLOAD_DIR.
+    safe_name = os.path.basename((file.filename or "").replace("\\", "/"))
+    if safe_name in ("", ".", ".."):
+        raise HTTPException(status_code=400, detail="A valid file name is required.")
+    local_file_path = os.path.join(UPLOAD_DIR, safe_name)
+    with open(local_file_path, "wb") as buffer:
+        shutil.copyfileobj(file.file, buffer)
+    background_tasks.add_task(upload_file, local_file_path, user_id)
+    return {
+        "status": "success",
+        "message": f"'{safe_name}' received successfully. Ingestion pipeline started in the background."
+    }

src/retrieval.py ADDED Viewed

	@@ -0,0 +1,95 @@

+import os
+import requests
+from dotenv import load_dotenv
+from qdrant_client import QdrantClient
+from qdrant_client import models
+from fastembed import SparseTextEmbedding
+from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
+# Load variables from a local .env file (if any) before reading the settings below
+load_dotenv()
+# Qdrant endpoint and credentials, plus the Hugging Face token used for the
+# dense embeddings and the reranker request
+qdrant_url = os.environ.get("QDRANT_URL")
+qdrant_api_key = os.environ.get("QDRANT_API_KEY")
+hf_token = os.environ.get("HF_TOKEN")
+class Retriever() :
+    """Hybrid (dense + sparse) Qdrant retriever with a remote rerank step."""
+    def __init__(self , collection_name = 'pdf_rag_v3') :
+        # NOTE(review): upload_file in src/embedding.py writes to "pdf_rag_chat" by
+        # default — confirm both sides are meant to use the same collection.
+        self.collection_name = collection_name
+        self.client = QdrantClient(url=qdrant_url , api_key=qdrant_api_key)
+        # Embedding models are created lazily in retrieve() so construction stays cheap
+        self.dense_model = None
+        self.sparse_model = None
+    def cloud_rerank(self, query, texts):
+        """Score every text against the query via the Hugging Face inference endpoint.
+        Returns one numeric score per text. On any failure (transport error, non-200
+        status, unexpected payload) returns all zeros, which leaves the caller's
+        existing order unchanged after its stable sort.
+        """
+        fallback = [0.0] * len(texts)
+        if not texts:
+            # Nothing to score; skip the network round trip
+            return fallback
+        API_URL = "https://api-inference.huggingface.co/models/cross-encoder/ms-marco-MiniLM-L-6-v2"
+        headers = {"Authorization": f"Bearer {hf_token}"}
+        payload = {
+            "inputs": {
+                "source_sentence": query,
+                "sentences": texts
+            }
+        }
+        try:
+            # Bounded wait so a stalled reranker cannot hang the whole chat request
+            response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
+            if response.status_code != 200:
+                print(f"Cloud reranker returned HTTP {response.status_code}; keeping fusion order.")
+                return fallback
+            scores = response.json()
+        except (requests.exceptions.RequestException, ValueError) as e:
+            print(f"Cloud reranker failed: {e}")
+            return fallback
+        # The caller zips scores with hits and calls float() on each one, so anything
+        # other than one number per text would mis-pair or crash there.
+        usable = (
+            isinstance(scores, list)
+            and len(scores) == len(texts)
+            and all(isinstance(score, (int, float)) for score in scores)
+        )
+        if not usable:
+            print("Cloud reranker returned an unexpected payload; keeping fusion order.")
+            return fallback
+        return scores
+    def retrieve(self , query : str , user_id : str) :
+        """Return up to five reranked chunks for `query`, restricted to `user_id`.
+        Each result is a dict with text, source, pages, section,
+        original_qdrant_score and rerank_score keys.
+        """
+        # Lazy load: the models are created on the first query and then reused
+        if self.dense_model is None:
+            self.dense_model = HuggingFaceInferenceAPIEmbeddings(
+                api_key=hf_token,
+                model_name="sentence-transformers/all-MiniLM-L6-v2"
+            )
+        if self.sparse_model is None:
+            self.sparse_model = SparseTextEmbedding(model_name="Qdrant/bm25")
+        dense_query_vector = self.dense_model.embed_query(query)
+        sparse_query = list(self.sparse_model.embed([query]))[0]
+        # Convert to plain lists, the same way src/embedding.py does when indexing
+        sparse_query_vector = models.SparseVector(indices=sparse_query.indices.tolist(),
+                                                  values=sparse_query.values.tolist())
+        # Restrict both prefetch branches to this user's points
+        user_filter = models.Filter(must=[models.FieldCondition(key="user_id" , match=models.MatchValue(value=user_id))])
+        results = self.client.query_points(collection_name=self.collection_name,
+                                           prefetch=[models.Prefetch(
+                                               query=dense_query_vector,
+                                               limit=20,
+                                               using='dense',
+                                               filter=user_filter
+                                           ),
+                                           models.Prefetch(
+                                               query=sparse_query_vector,
+                                               using='sparse',
+                                               limit=20,
+                                               filter=user_filter
+                                           )],
+                                           query=models.FusionQuery(fusion=models.Fusion.RRF),
+                                           limit=20)
+        if not results.points:
+            # No hits for this user: nothing to rerank
+            return []
+        texts = [point.payload.get('text' , '') for point in results.points]
+        rerank_scores = self.cloud_rerank(query, texts)
+        reranked_results = [
+            {
+                "text": point.payload.get("text"),
+                "source": point.payload.get("source"),
+                "pages": point.payload.get("pages"),
+                "section": point.payload.get("section"),
+                "original_qdrant_score": point.score,
+                "rerank_score": float(score)
+            }
+            for point, score in zip(results.points, rerank_scores)
+        ]
+        # Stable sort: ties (e.g. the all-zero fallback) keep the fusion order
+        reranked_results.sort(key=lambda x: x["rerank_score"], reverse=True)
+        return reranked_results[:5]

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff