LightRT commited on
Commit
9cc7f8d
·
0 Parent(s):

Initial completely clean deployment

Browse files
.dockerignore ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Virtual Environments
2
+ .venv/
3
+ venv/
4
+ env/
5
+
6
+ # Python Cache
7
+ __pycache__/
8
+ *.pyc
9
+ *.pyo
10
+
11
+ # Git
12
+ .git/
13
+
14
+ # Ignore local database files if Qdrant creates any locally
15
+ qdrant_storage/
.gitignore ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .env
2
+
3
+ .venv/
4
+ venv/
5
+ env/
6
+
7
+ __pycache__/
8
+ *.pyc
9
+
10
+ data/uploads/
11
+
12
+ .DS_Store
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.12
README.md ADDED
File without changes
app.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import requests
3
+ import uuid
4
+
5
+ # 1. PAGE CONFIGURATION
6
+ st.set_page_config(page_title="Enterprise RAG Assistant", page_icon="🤖", layout="centered")
7
+ st.title("📚 Enterprise Document Assistant")
8
+ st.markdown("Upload a PDF to the knowledge base and ask questions about it.")
9
+
10
+ # 2. SESSION STATE INITIALIZATION (The Memory Bank)
11
+ if "user_id" not in st.session_state:
12
+ st.session_state.user_id = str(uuid.uuid4())
13
+
14
+ if "thread_id" not in st.session_state:
15
+ st.session_state.thread_id = str(uuid.uuid4())
16
+
17
+ if "messages" not in st.session_state:
18
+ st.session_state.messages = []
19
+
20
+ # 3. SIDEBAR: PDF UPLOAD (The Handoff to FastAPI)
21
+ with st.sidebar:
22
+ st.header("Document Ingestion")
23
+ uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
24
+
25
+ if st.button("Process Document"):
26
+ if uploaded_file:
27
+ with st.spinner("Transmitting to backend..."):
28
+ # Package the file as multipart/form-data
29
+ files = {"file": (uploaded_file.name, uploaded_file.getvalue(), "application/pdf")}
30
+
31
+ payload_data = {"user_id": st.session_state.user_id}
32
+
33
+ # Send the POST request to your local FastAPI server
34
+ try:
35
+ response = requests.post(
36
+ "http://backend:8000/upload",
37
+ files=files,
38
+ data=payload_data
39
+ )
40
+ if response.status_code == 200:
41
+ st.success("File uploaded! The AI is reading it in the background.")
42
+ else:
43
+ st.error(f"Upload failed: {response.text}")
44
+ except requests.exceptions.ConnectionError:
45
+ st.error("Cannot connect to backend. Is FastAPI running?")
46
+ else:
47
+ st.warning("Please select a file first.")
48
+
49
+ # 4. CHAT HISTORY RENDERING
50
+ for msg in st.session_state.messages:
51
+ # This creates a chat bubble. role is either 'user' or 'assistant'
52
+ with st.chat_message(msg["role"]):
53
+ st.markdown(msg["content"])
54
+
55
+ # 5. CHAT INPUT & BACKEND COMMUNICATION
56
+ if prompt := st.chat_input("Ask a question about your documents..."):
57
+
58
+ # Immediately render the user's new message to the UI
59
+ st.session_state.messages.append({"role": "user", "content": prompt})
60
+ with st.chat_message("user"):
61
+ st.markdown(prompt)
62
+
63
+ # Show a loading indicator while we wait for FastAPI and LangGraph
64
+ with st.chat_message("assistant"):
65
+ message_placeholder = st.empty()
66
+ message_placeholder.markdown("*(Thinking...)*")
67
+
68
+ # Prepare the JSON payload for FastAPI
69
+ payload = {
70
+ "message": prompt,
71
+ "user_id": st.session_state.user_id,
72
+ "thread_id": st.session_state.thread_id
73
+ }
74
+
75
+ try:
76
+ # Send the question to your LangGraph backend
77
+ chat_response = requests.post("http://backend:8000/chat", json=payload)
78
+
79
+ if chat_response.status_code == 200:
80
+ # Extract the answer from the JSON response
81
+ answer = chat_response.json().get("response", "No response found.")
82
+
83
+ # Update the UI placeholder with the actual answer
84
+ message_placeholder.markdown(answer)
85
+
86
+ # Save the AI's answer to the session state memory
87
+ st.session_state.messages.append({"role": "assistant", "content": answer})
88
+ else:
89
+ message_placeholder.error(f"Error: {chat_response.text}")
90
+
91
+ except requests.exceptions.ConnectionError:
92
+ message_placeholder.error("Cannot connect to backend. Is FastAPI running?")
backend.Dockerfile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use an official, lightweight Python image
2
+ FROM python:3.11-slim
3
+
4
+ # Set the working directory inside the container
5
+ WORKDIR /app
6
+
7
+ # Copy the requirements file and install dependencies
8
+ # (We use standard pip inside the container because it's universally stable)
9
+ COPY requirements.txt .
10
+ RUN pip install --no-cache-dir -r requirements.txt
11
+
12
+ # Copy all your project files into the container
13
+ COPY . .
14
+
15
+ # Expose the port FastAPI runs on
16
+ # Change this
17
+ EXPOSE 7860
18
+
19
+ # And change this
20
+ CMD ["uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "7860"]
docker-compose.yaml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: '3.8'
2
+
3
+ services:
4
+ backend:
5
+ build:
6
+ context: .
7
+ dockerfile: backend.Dockerfile
8
+ ports:
9
+ - "8000:8000"
10
+ env_file:
11
+ - .env
12
+ # This prevents the container from crashing immediately if it hits a tiny error
13
+ restart: always
14
+
15
+ frontend:
16
+ build:
17
+ context: .
18
+ dockerfile: frontend.Dockerfile
19
+ ports:
20
+ - "8501:8501"
21
+ env_file:
22
+ - .env
23
+ # Tells Docker to start the backend BEFORE it starts the frontend
24
+ depends_on:
25
+ - backend
26
+ restart: always
frontend.Dockerfile ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ WORKDIR /app
4
+
5
+ COPY requirements.txt .
6
+ RUN pip install --no-cache-dir -r requirements.txt
7
+
8
+ COPY . .
9
+
10
+ # Expose the port Streamlit runs on
11
+ EXPOSE 8501
12
+
13
+ # The command to start the UI
14
+ CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
main.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ def main():
2
+ print("Hello from pdf-qa-chatbot!")
3
+
4
+
5
+ if __name__ == "__main__":
6
+ main()
pyproject.toml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "pdf-qa-chatbot"
3
+ version = "0.1.0"
4
+ description = "Add your description here"
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ dependencies = [
8
+ "docling>=2.96.1",
9
+ "fastapi>=0.136.3",
10
+ "fastembed>=0.8.0",
11
+ "langchain>=1.3.2",
12
+ "langchain-community>=0.4.2",
13
+ "langchain-core>=1.4.0",
14
+ "langchain-openai>=1.2.2",
15
+ "langgraph>=1.2.2",
16
+ "langgraph-checkpoint-postgres>=3.1.0",
17
+ "langsmith>=0.8.8",
18
+ "psycopg[binary]>=3.3.4",
19
+ "pydantic>=2.13.4",
20
+ "python-dotenv>=1.2.2",
21
+ "qdrant-client>=1.18.0",
22
+ "streamlit>=1.58.0",
23
+ "tavily>=1.1.0",
24
+ "transformers>=5.9.0",
25
+ "uuid>=1.30",
26
+ ]
requirements.txt ADDED
Binary file (560 Bytes). View file
 
src/__pycache__/embedding.cpython-312.pyc ADDED
Binary file (4.18 kB). View file
 
src/__pycache__/graph.cpython-312.pyc ADDED
Binary file (8.81 kB). View file
 
src/__pycache__/ingestion.cpython-312.pyc ADDED
Binary file (1.91 kB). View file
 
src/__pycache__/main.cpython-312.pyc ADDED
Binary file (4.25 kB). View file
 
src/__pycache__/retrieval.cpython-312.pyc ADDED
Binary file (4.09 kB). View file
 
src/embedding.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.ingestion import ingestion_and_chunking
2
+ from qdrant_client import QdrantClient
3
+ from qdrant_client.models import Distance, VectorParams, SparseVectorParams, PointStruct
4
+ from fastembed import SparseTextEmbedding
5
+ import uuid
6
+ from dotenv import load_dotenv
7
+ import os
8
+ from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
9
+
10
+ load_dotenv()
11
+ qdrant_api_key = os.getenv("QDRANT_API_KEY")
12
+ qdrant_url = os.getenv("QDRANT_URL")
13
+ hf_token = os.getenv("HF_TOKEN")
14
+
15
+ def upload_file(file_path: str, user_id: str, collection_name="pdf_rag_chat"):
16
+
17
+ client = QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
18
+
19
+ dense_model = HuggingFaceInferenceAPIEmbeddings(
20
+ api_key=hf_token,
21
+ model_name="sentence-transformers/all-MiniLM-L6-v2")
22
+ sparse_model = SparseTextEmbedding(model_name="Qdrant/bm25")
23
+
24
+ # 1. ONLY the database creation should be inside this IF block
25
+ if not client.collection_exists(collection_name):
26
+ client.create_collection(
27
+ collection_name=collection_name,
28
+ vectors_config={
29
+ "dense": VectorParams(size=384, distance=Distance.COSINE)
30
+ },
31
+ sparse_vectors_config={
32
+ "sparse": SparseVectorParams()
33
+ }
34
+ )
35
+
36
+ # 2. EVERYTHING ELSE MUST BE UN-INDENTED SO IT RUNS EVERY TIME
37
+ try:
38
+ docs = ingestion_and_chunking(file_path)
39
+ texts = [doc.page_content for doc in docs]
40
+
41
+ dense_vectors = dense_model.embed_documents(texts)
42
+ sparse_vectors = list(sparse_model.embed(texts))
43
+
44
+ points = []
45
+ file_id = str(uuid.uuid4())
46
+
47
+ for i, doc in enumerate(docs):
48
+ # 1. Convert numpy array to standard Python list
49
+ dense_vec = dense_vectors[i]
50
+
51
+ # 2. Extract indices and values from FastEmbed's custom object
52
+ sparse_emb = sparse_vectors[i]
53
+ sparse_vec = {
54
+ "indices": sparse_emb.indices.tolist(),
55
+ "values": sparse_emb.values.tolist()
56
+ }
57
+ chunk_id = str(uuid.uuid4())
58
+
59
+ point = PointStruct(
60
+ id=chunk_id, # Reusing the same file_id so all chunks tie back to one file
61
+ vector={
62
+ 'dense': dense_vec,
63
+ 'sparse': sparse_vec
64
+ },
65
+ payload={
66
+ 'user_id': user_id,
67
+ 'file_id': file_id,
68
+ 'text': doc.page_content,
69
+ "source": doc.metadata.get("source"),
70
+ "pages": doc.metadata.get("pages"),
71
+ "section": doc.metadata.get("section")
72
+ }
73
+ )
74
+ points.append(point)
75
+
76
+ # (Optional but safe) Tell Qdrant to index it just in case
77
+ try:
78
+ client.create_payload_index(
79
+ collection_name=collection_name,
80
+ field_name="user_id",
81
+ field_schema="keyword"
82
+ )
83
+ except Exception:
84
+ pass
85
+
86
+ # Send to database
87
+ client.upsert(collection_name=collection_name, points=points)
88
+ except Exception as e:
89
+ print("\n" + "!"*60, flush=True)
90
+ print(f"❌ UPLOAD FAILED SILENTLY IN BACKGROUND:", flush=True)
91
+ print(f"{str(e)}", flush=True)
92
+ print("!"*60 + "\n", flush=True)
src/fix_db.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from qdrant_client import QdrantClient
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+
7
+ client = QdrantClient(
8
+ url=os.getenv("QDRANT_URL"),
9
+ api_key=os.getenv("QDRANT_API_KEY")
10
+ )
11
+
12
+ # LOOK AT YOUR retrieval.py FILE AND COPY THE EXACT COLLECTION NAME HERE
13
+ COLLECTION_NAME = "pdf_rag"
14
+
15
+ print(f"Attempting to build index for '{COLLECTION_NAME}'...")
16
+
17
+ try:
18
+ client.create_payload_index(
19
+ collection_name=COLLECTION_NAME,
20
+ field_name="user_id",
21
+ field_schema="keyword"
22
+ )
23
+ print("✅ Index built successfully! Qdrant is ready.")
24
+ except Exception as e:
25
+ print(f"❌ FAILED: {e}")
src/graph.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import TypedDict , Annotated , List
2
+ from langgraph.graph.message import add_messages
3
+ from langchain_core.messages import SystemMessage , HumanMessage
4
+ from langchain_openai import ChatOpenAI
5
+ import os
6
+ from src.retrieval import Retriever
7
+ import os
8
+ from tavily import TavilyClient
9
+ from dotenv import load_dotenv
10
+ from langgraph.graph import StateGraph, START ,END
11
+ from langgraph.checkpoint.postgres import PostgresSaver
12
+ from psycopg_pool import ConnectionPool
13
+
14
+ load_dotenv()
15
+
16
+ class State(TypedDict) :
17
+ messages : Annotated[list , add_messages]
18
+ context : List[dict]
19
+ rewritten_query : str
20
+ user_id : str
21
+ web_search_needed : bool
22
+ retry : int
23
+
24
+ llm = ChatOpenAI(
25
+ model="openai/gpt-4o-mini",
26
+ openai_api_key=os.getenv("OPENROUTER_API_KEY"),
27
+ openai_api_base="https://openrouter.ai/api/v1",
28
+ temperature=0
29
+ )
30
+
31
+ retriever = Retriever()
32
+
33
+ tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
34
+
35
+ def rewrite_node(state : State) :
36
+ messages = state['messages']
37
+
38
+ # 1. Filter to only get the human's messages
39
+ user_msg = [msg for msg in messages if isinstance(msg , HumanMessage)]
40
+
41
+ # 2. Extract the actual text
42
+ latest_ques = user_msg[-1].content
43
+ history = "\n".join([msg.content for msg in user_msg[:-1]])
44
+
45
+ # 3. Set the strict system rules
46
+ system_prompt = SystemMessage(content="""You are an expert search query generator for a vector database.
47
+ Your ONLY job is to convert the user's latest input into a single, highly optimized search string.
48
+
49
+ You will receive a sequence of the user's previous questions, followed by their newest input.
50
+
51
+ CRITICAL RULES:
52
+ 1. TRACK THE TRAIN OF THOUGHT: If the latest input uses pronouns (it, they, this) or is a fragment (e.g., "What about the budget?"), identify the core noun from the previous questions and substitute it.
53
+ 2. NO CONVERSATIONAL FILLER: Do not answer the question. Do not explain your reasoning.
54
+ 3. FORMAT: Output only the raw search keywords. No commas, no bullet points.
55
+
56
+ Example Input:
57
+ Chat History:
58
+ What is the main objective of Project Chronos?
59
+ Who is the lead engineer?
60
+ Latest User Input: What is his total budget for Q4?
61
+
62
+ Example Output: Project Chronos lead engineer budget
63
+ """)
64
+
65
+ # 4. FIX: Package the history and question into a proper HumanMessage object
66
+ human_prompt = HumanMessage(content=f"Chat History: {history}\n\nLatest User Input: {latest_ques}\n\nGenerate the concise search query now:")
67
+
68
+ # 5. FIX: Combine them as a valid list of Message objects
69
+ final_msg = [system_prompt, human_prompt]
70
+
71
+ # 6. Invoke the LLM
72
+ response = llm.invoke(final_msg)
73
+
74
+ print("\n" + "="*60, flush=True)
75
+ print(f"\n ReQuery : \n{response.content} \n", flush=True)
76
+ print("="*60 + "\n", flush=True)
77
+
78
+ return {'rewritten_query' : response.content}
79
+
80
+ def retrieve_node(state : State) :
81
+ user_id = state['user_id']
82
+ re_query = state['rewritten_query']
83
+
84
+ context = retriever.retrieve(re_query , user_id)
85
+
86
+ return{'context' : context}
87
+
88
+ def answer_node(state : State) :
89
+ messages = state['messages']
90
+ context = state['context']
91
+ retry = state.get('retry' , 0)
92
+
93
+ context_text = ""
94
+ if not context:
95
+ context_text = "No relevant context found in the database for this specific query."
96
+ else:
97
+ for i, chunk in enumerate(context):
98
+ context_text += f"\n--- Document Chunk {i+1} ---\n"
99
+ context_text += f"Source: {chunk.get('source', 'Unknown')}\n"
100
+ context_text += f"Pages: {chunk.get('pages', 'N/A')}\n"
101
+ context_text += f"Section: {chunk.get('section', 'N/A')}\n"
102
+ context_text += f"Content: {chunk.get('text', '')}\n"
103
+
104
+ print("\n" + "="*60, flush=True)
105
+ print(f"\n\nCONTEXT TEXT :/n/n{context_text}", flush=True)
106
+ print("="*60 + "\n", flush=True)
107
+
108
+ if retry<1 :
109
+ system_prompt = SystemMessage(content=f"""
110
+ You are an advanced enterprise RAG assistant. Your job is to answer the user's latest question
111
+ by strictly analyzing the conversation history and the provided document chunks below.
112
+
113
+ CRITICAL RULES:
114
+ 1. Base your answer ONLY on the text snippets provided in the Context section below. Do not assume or extrapolate.
115
+ 2. If the context does not contain the answer, or if the context is irrelevant to the question,
116
+ you must reply with exactly this phrase and absolutely nothing else: FALLBACK_TO_WEB_SEARCH
117
+ 3. You MUST inline cite your sources whenever you use information from a chunk.
118
+ Format your citations cleanly at the end of sentences like this: [Source: file.pdf, Page: X].
119
+
120
+ CONTEXT DATA:
121
+ {context_text}
122
+ """)
123
+ else :
124
+ system_prompt = f"""
125
+ You are an advanced enterprise RAG assistant. Your job is to answer the user's latest question
126
+ by strictly analyzing the conversation history and the provided document chunks below.
127
+ These chunks now include both internal documents and live web search results.
128
+
129
+ CRITICAL RULES:
130
+ 1. Base your answer ONLY on the text snippets provided in the Context section below. Do not assume or extrapolate.
131
+ 2. DO NOT ask for another web search. If the answer is still not found in the provided context, you must politely inform the user that the information is unavailable.
132
+ 3. You MUST inline cite your sources whenever you use information from a chunk.
133
+ Format your citations cleanly at the end of sentences like this: [Source: file.pdf, Page: X] or [Source: website_url].
134
+
135
+ CONTEXT DATA:
136
+ {context_text}
137
+ """
138
+
139
+ final_msg = [system_prompt] + messages
140
+
141
+ response = llm.invoke(final_msg)
142
+
143
+ if response.content.strip() == "FALLBACK_TO_WEB_SEARCH":
144
+ return {"web_search_needed": True}
145
+ else:
146
+ return {"messages": [response],
147
+ "web_search_needed": False}
148
+
149
+ def routing(state : State) :
150
+ if state["web_search_needed"] :
151
+ return "web_search_node"
152
+ else:
153
+ return "END"
154
+
155
+ def web_search_node(state : State) :
156
+ re_query = state['rewritten_query']
157
+ context = state['context']
158
+ retry = state.get('retry' , 0)
159
+
160
+ response = tavily_client.search(query=re_query , max_results=3)
161
+ results = response['results']
162
+
163
+ web_context = []
164
+
165
+ for res in results :
166
+ web_context.append({
167
+ "text": res.get("content", ""),
168
+ "source": res.get("url", "Live Web Search"),
169
+ "pages": "N/A",
170
+ "section": "Internet Result"
171
+ })
172
+
173
+ combined = context + web_context
174
+
175
+ return {'context' : combined , 'retry' : retry+1}
176
+
177
+ workflow = StateGraph(State)
178
+
179
+ workflow.add_node("rewrite_node" , rewrite_node)
180
+ workflow.add_node("retrieve_node" , retrieve_node)
181
+ workflow.add_node("answer_node" , answer_node)
182
+ workflow.add_node("web_search_node" , web_search_node)
183
+
184
+ workflow.add_edge(START , "rewrite_node")
185
+ workflow.add_edge("rewrite_node" , "retrieve_node")
186
+ workflow.add_edge("retrieve_node" , "answer_node")
187
+ workflow.add_conditional_edges(
188
+ "answer_node",
189
+ routing,
190
+ {"web_search_node": "web_search_node",
191
+ "END": END})
192
+ workflow.add_edge("web_search_node" , "answer_node")
193
+
src/ingestion.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from docling.document_converter import DocumentConverter
2
+ from docling.chunking import HybridChunker
3
+ from transformers import AutoTokenizer
4
+ from langchain_core.documents import Document
5
+ from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer
6
+
7
+
8
+ def ingestion_and_chunking(file_path : str) :
9
+
10
+ converter = DocumentConverter()
11
+ result = converter.convert(file_path)
12
+
13
+ tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
14
+
15
+ chunker = HybridChunker(merge_peers=True ,
16
+ chunk_size=800 ,
17
+ overlap=200,
18
+ tokenizer=tokenizer )
19
+
20
+ chunks = list(chunker.chunk(result.document))
21
+
22
+ for chunk in chunks :
23
+ chunk.text = chunker.contextualize(chunk)
24
+
25
+
26
+ docs = []
27
+
28
+ for chunk in chunks:
29
+ pages = sorted({
30
+ prov.page_no
31
+ for item in chunk.meta.doc_items
32
+ for prov in item.prov
33
+ })
34
+
35
+ docs.append(
36
+ Document(
37
+ page_content=chunk.text,
38
+ metadata={
39
+ "source": chunk.meta.origin.filename,
40
+ "pages": pages,
41
+ "section": chunk.meta.headings[0] if chunk.meta.headings else None,
42
+ }
43
+ )
44
+ )
45
+
46
+ return docs
src/main.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI , HTTPException , UploadFile, File, BackgroundTasks , Form
2
+ from pydantic import BaseModel , Field
3
+ import os
4
+ from dotenv import load_dotenv
5
+ from src.graph import workflow
6
+ from src.embedding import upload_file
7
+ import shutil
8
+ from langgraph.checkpoint.postgres import PostgresSaver
9
+ from psycopg_pool import ConnectionPool
10
+
11
+ load_dotenv()
12
+
13
+ app = FastAPI(
14
+ title="Enterprise PDF RAG API",
15
+ description="A production-grade backend powering an intelligent LangGraph agent.",
16
+ version="1.0.0"
17
+ )
18
+
19
+ class ChatRequest(BaseModel):
20
+ message: str = Field(..., description="The raw message string from the user.")
21
+ user_id: str = Field(..., description="The unique identifier for the tenant context.")
22
+ thread_id: str = Field(..., description="The unique session ID tracking the short-term chat history.")
23
+
24
+ @app.post("/chat", summary="Return an answer using the RAG backend to the user query.")
25
+ async def chat_endpoint(request: ChatRequest):
26
+ try:
27
+ config = {'configurable': {'thread_id': request.thread_id}}
28
+ initial_state = {
29
+ "messages": [("user", request.message)],
30
+ "user_id": request.user_id
31
+ }
32
+
33
+ # 1. Grab the database URL
34
+ db_uri = os.getenv("DATABASE_URI")
35
+
36
+ # 2. Open a fresh, guaranteed-alive connection to Postgres
37
+ with PostgresSaver.from_conn_string(db_uri) as checkpointer:
38
+
39
+ # (Optional) Ensure tables exist
40
+ checkpointer.setup()
41
+
42
+ # 3. Compile the LangGraph blueprint with our fresh memory connection
43
+ agent = workflow.compile(checkpointer=checkpointer)
44
+
45
+ # 4. Run the AI pipeline
46
+ result = agent.invoke(initial_state, config=config)
47
+
48
+ # 5. Extract the AI's final answer
49
+ output_messages = result.get("messages", [])
50
+ if not output_messages:
51
+ raise ValueError("No messages returned from the graph.")
52
+
53
+ ai_response = output_messages[-1].content
54
+
55
+ return {
56
+ "status": "success",
57
+ "thread_id": request.thread_id,
58
+ "response": ai_response
59
+ }
60
+
61
+ except Exception as e:
62
+ print(f"Backend Error: {str(e)}")
63
+ raise HTTPException(status_code=500, detail=f"Agent Processing Error: {str(e)}")
64
+
65
+ UPLOAD_DIR = "data/uploads"
66
+ os.makedirs(UPLOAD_DIR, exist_ok=True)
67
+
68
+ @app.post("/upload", summary="Upload a PDF and process its embeddings in the background")
69
+ async def upload_pdf(
70
+ background_tasks: BackgroundTasks,
71
+ file: UploadFile = File(...),
72
+ user_id : str = Form(...)
73
+ ):
74
+ local_file_path = os.path.join(UPLOAD_DIR, file.filename)
75
+
76
+ with open(local_file_path, "wb") as buffer:
77
+ shutil.copyfileobj(file.file, buffer)
78
+
79
+ background_tasks.add_task(upload_file, local_file_path, user_id)
80
+
81
+ return {
82
+ "status": "success",
83
+ "message": f"'{file.filename}' received successfully. Ingestion pipeline started in the background."
84
+ }
src/retrieval.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ from dotenv import load_dotenv
4
+ from qdrant_client import QdrantClient
5
+ from qdrant_client import models
6
+ from fastembed import SparseTextEmbedding
7
+ from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
8
+
9
+ load_dotenv()
10
+
11
+ qdrant_api_key = os.getenv("QDRANT_API_KEY")
12
+ qdrant_url = os.getenv("QDRANT_URL")
13
+ hf_token = os.getenv("HF_TOKEN")
14
+
15
+ class Retriever() :
16
+ def __init__(self , collection_name = 'pdf_rag_v3') :
17
+ self.collection_name = collection_name
18
+ self.client = QdrantClient(url=qdrant_url , api_key=qdrant_api_key)
19
+
20
+ # 🚨 THE FIX: Do NOT load models here. Let the server boot fast and light.
21
+ self.dense_model = None
22
+ self.sparse_model = None
23
+
24
+ def cloud_rerank(self, query, texts):
25
+ API_URL = "https://api-inference.huggingface.co/models/cross-encoder/ms-marco-MiniLM-L-6-v2"
26
+ headers = {"Authorization": f"Bearer {hf_token}"}
27
+ payload = {
28
+ "inputs": {
29
+ "source_sentence": query,
30
+ "sentences": texts
31
+ }
32
+ }
33
+ try:
34
+ response = requests.post(API_URL, headers=headers, json=payload)
35
+ if response.status_code == 200:
36
+ return response.json()
37
+ except Exception as e:
38
+ print(f"Cloud reranker failed: {e}")
39
+ pass
40
+
41
+ return [0.0] * len(texts)
42
+
43
+
44
+ def retrieve(self , query : str , user_id : str) :
45
+ # 🚨 THE FIX: Lazy Load. Only turn the models on the very first time someone asks a question!
46
+ if self.dense_model is None:
47
+ self.dense_model = HuggingFaceInferenceAPIEmbeddings(
48
+ api_key=hf_token,
49
+ model_name="sentence-transformers/all-MiniLM-L6-v2"
50
+ )
51
+ if self.sparse_model is None:
52
+ self.sparse_model = SparseTextEmbedding(model_name="Qdrant/bm25")
53
+
54
+ dense_query_vector = self.dense_model.embed_query(query)
55
+
56
+ sparse_query = list(self.sparse_model.embed([query]))[0]
57
+ sparse_query_vector = models.SparseVector(indices=sparse_query.indices,
58
+ values=sparse_query.values)
59
+
60
+ user_filter = models.Filter(must=[models.FieldCondition(key="user_id" , match=models.MatchValue(value=user_id))])
61
+
62
+ results = self.client.query_points(collection_name=self.collection_name,
63
+ prefetch=[models.Prefetch(
64
+ query=dense_query_vector,
65
+ limit=20,
66
+ using='dense',
67
+ filter=user_filter
68
+ ),
69
+ models.Prefetch(
70
+ query=sparse_query_vector,
71
+ using='sparse',
72
+ limit=20,
73
+ filter=user_filter
74
+ )],
75
+ query=models.FusionQuery(fusion=models.Fusion.RRF),
76
+ limit=20)
77
+
78
+ texts = [point.payload.get('text' , '') for point in results.points]
79
+
80
+ rerank_scores = self.cloud_rerank(query, texts)
81
+
82
+ reranked_results = []
83
+ for point, score in zip(results.points, rerank_scores):
84
+ reranked_results.append({
85
+ "text": point.payload.get("text"),
86
+ "source": point.payload.get("source"),
87
+ "pages": point.payload.get("pages"),
88
+ "section": point.payload.get("section"),
89
+ "original_qdrant_score": point.score,
90
+ "rerank_score": float(score)
91
+ })
92
+
93
+ reranked_results.sort(key=lambda x: x["rerank_score"], reverse=True)
94
+
95
+ return reranked_results[:5]
uv.lock ADDED
The diff for this file is too large to render. See raw diff