dembasowmr committed on
Commit
414dfd0
·
0 Parent(s):

Add FastAPI app and Dockerfile

Browse files
.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ documents/*.pdf filter=lfs diff=lfs merge=lfs -text
2
+ chroma_db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ .venv
2
+ all-libraries.txt
3
+
4
+ .env.local
5
+ .env*
Dockerfile ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Dockerfile
#
# Debian-based Python image so the Tesseract/Poppler system packages can be
# installed with apt-get.
# NOTE(review): buster is end-of-life and its apt repositories have been
# archived, which breaks `apt-get update`; bullseye is the supported
# python:3.10-slim base.
FROM python:3.10-slim-bullseye

# Working directory inside the container
WORKDIR /app

# System dependencies for the PDF pipeline:
#  - poppler-utils ships the pdftoppm/pdftocairo binaries that pdf2image
#    invokes at runtime (libpoppler-dev alone only provides headers and is
#    not sufficient)
#  - tesseract-ocr plus the English and Turkish language packs for pytesseract
RUN apt-get update && apt-get install -y --no-install-recommends \
    poppler-utils \
    tesseract-ocr \
    tesseract-ocr-eng \
    tesseract-ocr-tur \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies first so this layer is cached across code edits
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application code (app.py, compassia.py, documents/, chroma_db/)
COPY . .

# Where Tesseract's trained language data lives inside the container.
ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata
# NOTE(review): pytesseract does NOT read TESSERACT_CMD automatically; if the
# binary is ever not on PATH, set pytesseract.pytesseract.tesseract_cmd in
# code instead. Kept for documentation/compatibility.
ENV TESSERACT_CMD=/usr/bin/tesseract

# Serve the FastAPI app with Uvicorn.
# 0.0.0.0 makes it reachable from outside the container; 7860 is the port
# Hugging Face Spaces expects for web applications.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
File without changes
app.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# app.py — FastAPI front end for the Compassia PDF RAG backend.

import os
import sys

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import uvicorn

# Make src/ importable in deployment environments that do not put it on the
# path automatically.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), 'src')))

# Backend pieces: the RAG class plus the module-level globals it is built on.
from compassia import DocumentRAG, embedding_model, pdf_document_paths, extract_text_from_pdf, ocr_pdf, chunk_text

# --- RAG system -------------------------------------------------------------
# Constructed once at import time so the embedding model loads and indexing
# runs a single time per process; ChromaDB persists its data under
# ./chroma_db inside the Space.
print("--- FastAPI App Startup: Initializing RAG System ---")
rag_system = DocumentRAG(
    embedding_model=embedding_model,
    persist_directory="./chroma_db",
    collection_name="pdf_documents_collection",
    chunk_size=700,   # matches the backend's chunking configuration
    overlap=100,
)

# --- Index documents at startup ---------------------------------------------
# ChromaDB persistence means documents that were already indexed are skipped.
print("--- FastAPI App Startup: Indexing Documents (ChromaDB persistence) ---")
for pdf_path in pdf_document_paths:
    # Resolve each path relative to this file so it works inside the Space.
    full_pdf_path = os.path.join(os.path.dirname(__file__), pdf_path)
    if os.path.exists(full_pdf_path):
        rag_system.add_document(full_pdf_path)
    else:
        print(f"API Error: PDF file not found at {full_pdf_path}. Ensure it's deployed with your app in the 'documents' folder.")
print("--- FastAPI App Startup: Document indexing complete ---")


# --- FastAPI application ------------------------------------------------------
app = FastAPI(
    title="Compassia AI PDF Chat API",
    description="Backend API for querying PDFs using DeepSeek (via OpenRouter) and BGE-M3 embeddings.",
    version="0.1.0",
)


class QueryRequest(BaseModel):
    # Request body for POST /ask-pdf/.
    question: str


@app.post("/ask-pdf/")
async def ask_pdf_endpoint(request: QueryRequest):
    """
    Answers a question about the indexed PDF documents using RAG.
    """
    try:
        # Documents are already in ChromaDB, so no PDF paths are passed along.
        answer = rag_system.answer_question(request.question, [])
        return {"answer": answer}
    except Exception as e:
        print(f"Error processing /ask-pdf/ request: {e}")
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/")
async def root():
    """Basic health-check endpoint."""
    return {"message": "Compassia AI PDF Chat API is running. Use /ask-pdf/ for queries."}


# Local testing only — deployment launches uvicorn from the command line:
# if __name__ == "__main__":
#     uvicorn.run(app, host="0.0.0.0", port=8000)
chroma_db/70620ac4-f65d-41e9-9ace-b207c7fe8546/header.bin ADDED
Binary file (100 Bytes). View file
 
chroma_db/70620ac4-f65d-41e9-9ace-b207c7fe8546/length.bin ADDED
Binary file (40 kB). View file
 
chroma_db/70620ac4-f65d-41e9-9ace-b207c7fe8546/link_lists.bin ADDED
File without changes
chroma_db/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91452e6331387e73b3f0da0f1370e2824a647883d3970cb2c014200504189419
3
+ size 4104192
compassia.py ADDED
@@ -0,0 +1,404 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# compassia.py — module-level setup: imports, OpenRouter API configuration,
# and the local BGE-M3 embedding model. Importing this module has side
# effects: it requires the DEEPSEEK_R1_V3_API_KEY environment variable and
# loads the (large) embedding model into memory.

import requests
import os
import io
import re
import uuid  # unique IDs for ChromaDB entries
from PIL import Image

# Direct (non-OCR) text extraction from PDFs
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams

# OCR path for image-based PDFs
from pdf2image import convert_from_path
import pytesseract

# Embeddings and vector search
from FlagEmbedding import BGEM3FlagModel
import chromadb

# --- Tesseract / Poppler locations ---
# Uncomment and adjust when the executables are not on PATH (e.g. Windows):
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
# poppler_path = r'C:\path\to\poppler\bin'

# --- OpenRouter DeepSeek API configuration ---
API_KEY = os.getenv("DEEPSEEK_R1_V3_API_KEY")
API_KEY = API_KEY.strip() if API_KEY else API_KEY

if not API_KEY:
    raise ValueError("API key is not set. Please set the DEEPSEEK_R1_V3_API_KEY environment variable with your OpenRouter key.")

API_URL = 'https://openrouter.ai/api/v1/chat/completions'
HEADERS = {
    'Authorization': f'Bearer {API_KEY}',
    'Content-Type': 'application/json'
}

# --- Embedding model (local BGE-M3 via FlagEmbedding) ---
# BGE-M3 is multilingual, which suits the Turkish PDFs. Weights download to
# the Hugging Face cache on first use, so both disk space and RAM matter.
print("Loading FlagEmbedding (BGE-M3) model...")
try:
    embedding_model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)
    print("FlagEmbedding (BGE-M3) model loaded successfully.")
except Exception as e:
    print(f"Error loading FlagEmbedding model: {e}")
    print("Ensure you have resolved disk space issues for model download and have enough memory.")
    print("You might need to adjust 'use_fp16' based on your hardware (e.g., False for CPU/older GPUs).")
    exit(1)  # the rest of the module is unusable without embeddings
57
+
58
+ # --- PDF Processing Functions ---
59
+
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extract text from a PDF, preferring the embedded text layer.

    Falls back to OCR (``ocr_pdf``) when direct extraction raises, or when
    it yields almost no text while the file itself is non-trivially sized —
    a strong hint the PDF is scanned images rather than real text.
    """
    print(f"Attempting direct text extraction from: {pdf_path}")
    buffer = io.StringIO()
    with open(pdf_path, 'rb') as handle:
        try:
            # LAParams enables pdfminer's layout analysis for better output.
            extract_text_to_fp(handle, buffer, laparams=LAParams())
        except Exception as e:
            print(f"Direct PDF text extraction failed ({e}). Attempting OCR...")
            return ocr_pdf(pdf_path)
        extracted = buffer.getvalue()

    # Heuristic: tiny text from a sizeable file suggests an image-only PDF.
    is_sparse = len(extracted.strip()) < 100 and os.path.getsize(pdf_path) > 10000
    if is_sparse:
        print("Direct extraction yielded sparse text. Attempting OCR...")
        return ocr_pdf(pdf_path)
    return extracted
80
+
def ocr_pdf(pdf_path: str) -> str:
    """
    OCR every page of a PDF and return the concatenated text.

    Renders pages to images via pdf2image (requires Poppler) and runs
    Tesseract (English + Turkish) on each page via pytesseract. Returns an
    empty string when either tool fails.
    """
    pages_text = []
    try:
        # 300 DPI trades speed for OCR accuracy. Pass poppler_path=... here
        # if Poppler is not on the system PATH.
        page_images = convert_from_path(pdf_path, dpi=300)

        print(f"  Performing OCR on {len(page_images)} pages...")
        for page_number, page_image in enumerate(page_images, start=1):
            # Optional preprocessing (grayscale, binarization) could be
            # applied to page_image here before OCR.
            page_text = pytesseract.image_to_string(page_image, lang='eng+tur')
            pages_text.append(page_text)
            print(f"  Page {page_number} OCR complete.")

    except Exception as e:
        print(f"OCR process failed: {e}")
        print("Please ensure Tesseract OCR and Poppler are correctly installed and their executables are in your system's PATH.")
        return ""  # best-effort: empty result rather than a crash

    return "\n".join(pages_text)
109
+
def chunk_text(text: str, max_chunk_size: int = 700, overlap: int = 100) -> list[str]:
    """
    Split *text* into chunks of at most roughly *max_chunk_size* characters.

    Paragraphs (blank-line separated) are packed greedily into chunks; a
    single paragraph longer than the limit is split on spaces instead. When
    *overlap* > 0, every chunk after the first is prefixed with the tail of
    the previous chunk so context carries across chunk boundaries.
    """
    if not text:
        return []

    separator_len = len('\n\n')
    chunks: list[str] = []
    pending: list[str] = []   # paragraphs accumulated for the current chunk
    pending_len = 0

    for paragraph in re.split(r'\n\s*\n', text):
        if not paragraph.strip():
            continue

        if pending_len + len(paragraph) + separator_len > max_chunk_size:
            # Paragraph does not fit — flush whatever was accumulated.
            if pending:
                chunks.append("\n\n".join(pending))
                pending = []
                pending_len = 0

            if len(paragraph) > max_chunk_size:
                # Oversized paragraph: fall back to word-level packing.
                piece: list[str] = []
                piece_len = 0
                for word in paragraph.split(' '):
                    if piece_len + len(word) + len(' ') > max_chunk_size:
                        chunks.append(" ".join(piece))
                        piece = [word]
                        piece_len = len(word)
                    else:
                        piece.append(word)
                        piece_len += len(word) + len(' ')
                if piece:  # flush the trailing partial piece
                    chunks.append(" ".join(piece))
            else:
                # Paragraph starts a fresh chunk.
                pending.append(paragraph)
                pending_len += len(paragraph) + separator_len
        else:
            # Paragraph fits into the current chunk.
            pending.append(paragraph)
            pending_len += len(paragraph) + separator_len

    if pending:  # flush any remaining text
        chunks.append("\n\n".join(pending))

    # Simplistic overlap: prepend the tail of the previous chunk. For more
    # robust RAG, consider sentence-window retrieval or a chunking library.
    overlapped = []
    for index, chunk in enumerate(chunks):
        if index > 0 and overlap > 0:
            chunk = chunks[index - 1][-overlap:] + "\n" + chunk
        overlapped.append(chunk)

    return overlapped
175
+
176
+
177
+ # --- RAG Core Functions with ChromaDB ---
178
+
class DocumentRAG:
    """
    Retrieval-augmented question answering over PDF documents.

    PDFs are chunked and embedded (BGE-M3 dense vectors) into a persistent
    ChromaDB collection; questions are answered by retrieving the most
    similar chunks and sending them as context to DeepSeek through the
    OpenRouter chat-completions API (module-level API_URL / HEADERS).
    """

    def __init__(self, embedding_model, persist_directory="./chroma_db", collection_name="pdf_docs", chunk_size=700, overlap=100):
        """
        Args:
            embedding_model: a FlagEmbedding BGEM3FlagModel (or compatible
                object exposing .encode(..., return_dense=True)).
            persist_directory: on-disk location for ChromaDB's data.
            collection_name: name of the ChromaDB collection to use/create.
            chunk_size: max characters per chunk passed to chunk_text().
            overlap: overlap characters between consecutive chunks.
        """
        self.embedding_model = embedding_model
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.persist_directory = persist_directory
        self.collection_name = collection_name

        # Persistent client so indexed data survives process restarts.
        print(f"Initializing ChromaDB at: {self.persist_directory}")
        self.client = chromadb.PersistentClient(path=self.persist_directory)

        # Cosine distance matches how BGE-M3 embeddings are usually compared.
        self.collection = self.client.get_or_create_collection(
            name=self.collection_name,
            metadata={"hnsw:space": "cosine"}
        )
        print(f"ChromaDB collection '{self.collection_name}' ready.")

    def _generate_chunk_id(self, pdf_path: str, chunk_idx: int) -> str:
        """Return a unique ID for a chunk: basename + index + random UUID.

        The UUID suffix guarantees uniqueness even for similar paths or
        re-indexed files.
        """
        return f"{os.path.basename(pdf_path)}_{chunk_idx}_{uuid.uuid4().hex}"

    def add_document(self, pdf_path: str):
        """Extract, chunk, embed and store one PDF; skips already-indexed files.

        The presence check uses the source path in chunk metadata; a file
        whose *content* changed but path did not will NOT be re-indexed.
        """
        print(f"Adding document: {pdf_path}")

        # Skip if any chunk from this path is already in the collection.
        results = self.collection.get(
            where={"source": pdf_path},
            limit=1
        )
        if results and results['ids']:
            print(f"  Document '{pdf_path}' already in ChromaDB. Skipping re-indexing.")
            return

        extracted_text = extract_text_from_pdf(pdf_path)
        if not extracted_text:
            print(f"Warning: No text extracted from {pdf_path}. Skipping.")
            return

        chunks = chunk_text(extracted_text, self.chunk_size, self.overlap)
        if not chunks:
            print(f"Warning: No chunks generated for {pdf_path}. Skipping.")
            return

        print(f"  Generating embeddings for {len(chunks)} chunks and preparing for ChromaDB...")

        # BGE-M3's encode() returns a dict; only the dense vectors are
        # needed for standard vector search in ChromaDB.
        encoded_results = self.embedding_model.encode(
            chunks,
            batch_size=32,  # lower this if you run out of memory
            return_dense=True,
            return_sparse=False,
            return_colbert_vecs=False
        )
        chunk_embeddings = encoded_results["dense_vecs"]
        # No manual normalization needed: ChromaDB's 'cosine' space handles it.

        documents_to_add = []
        metadatas_to_add = []
        ids_to_add = []
        for i, chunk in enumerate(chunks):
            documents_to_add.append(chunk)
            metadatas_to_add.append({"source": pdf_path, "chunk_id": i})
            ids_to_add.append(self._generate_chunk_id(pdf_path, i))

        self.collection.add(
            documents=documents_to_add,
            embeddings=chunk_embeddings.tolist(),  # numpy -> list of lists
            metadatas=metadatas_to_add,
            ids=ids_to_add
        )

        print(f"  {len(documents_to_add)} chunks from '{pdf_path}' added to ChromaDB.")
        print(f"  Total chunks in collection: {self.collection.count()}")

    def retrieve_context(self, query: str, top_k: int = 3) -> list[str]:
        """
        Return the texts of the top_k chunks most similar to *query*.

        Returns an empty list when the collection is empty or nothing
        matches.
        """
        if self.collection.count() == 0:
            print("Error: No documents indexed in ChromaDB. Cannot retrieve context.")
            return []

        print(f"Retrieving context for query: '{query}'")

        query_embedding_result = self.embedding_model.encode(
            [query],
            batch_size=1,
            return_dense=True,
            return_sparse=False,
            return_colbert_vecs=False
        )
        query_embedding = query_embedding_result["dense_vecs"].tolist()

        results = self.collection.query(
            query_embeddings=query_embedding,
            n_results=top_k,
            include=['documents', 'distances', 'metadatas']
        )

        retrieved_chunks_texts = []
        if results and results['documents']:
            for i, doc_text in enumerate(results['documents'][0]):
                source_info = results['metadatas'][0][i].get('source', 'Unknown Source')
                chunk_id_info = results['metadatas'][0][i].get('chunk_id', 'N/A')
                # Smaller distance means more similar under cosine space.
                distance_info = results['distances'][0][i]
                retrieved_chunks_texts.append(doc_text)
                print(f"  Retrieved chunk {i+1} (distance: {distance_info:.4f}) from '{source_info}' (chunk {chunk_id_info}).")
        else:
            print("  No relevant chunks found in ChromaDB.")

        return retrieved_chunks_texts

    def answer_question(self, question: str, pdf_paths: list[str]) -> str:
        """
        Answer *question* using retrieved context and the DeepSeek API.

        Any paths in *pdf_paths* are indexed first (already-indexed files
        are skipped), so callers with pre-indexed data pass an empty list.
        Returns the model's answer, or an error string on API failure.
        """
        for path in pdf_paths:
            self.add_document(path)

        context_chunks = self.retrieve_context(question)
        context = "\n\n".join(context_chunks)

        if not context:
            print("Warning: No relevant context found. Answering based on general knowledge or indicating lack of information.")
            context_prompt = ""
        else:
            context_prompt = f"Using the following context:\n\n{context}\n\n"

        messages = [
            {"role": "system", "content": "You are an AI assistant specialized in answering questions based on provided context. If the answer is not in the context, state that explicitly. If you cannot answer based *solely* on the context, politely indicate that the information is not available in the provided documents."},
            {"role": "user", "content": f"{context_prompt}Question: {question}"}
        ]

        print("\nSending request to DeepSeek API...")
        data = {
            "model": "deepseek/deepseek-chat:free",
            "messages": messages,
            "temperature": 0.5,  # creativity vs. factualness trade-off
            "max_tokens": 500,   # cap response length
        }

        # FIX: a timeout is mandatory — requests.post without one can hang
        # the calling thread forever if the remote endpoint stalls.
        response = requests.post(API_URL, json=data, headers=HEADERS, timeout=120)

        if response.status_code == 200:
            ai_response = response.json()
            answer = ai_response['choices'][0]['message']['content']
            print("\nDeepSeek Response:")
            print(answer)
            return answer
        else:
            error_message = f"Failed to fetch data from DeepSeek API. Status Code: {response.status_code}. Response: {response.text}"
            print(error_message)
            return f"Error: Could not get an answer from the AI. Details: {error_message}"
361
+
362
# --- PDF documents to index --------------------------------------------------
# FIX: defined at module level (not inside the __main__ guard) so that
# `from compassia import pdf_document_paths` in app.py works. Previously this
# list only existed when the script was run directly, so importing the module
# from the FastAPI app raised ImportError.
pdf_document_paths = [
    "documents/heracles_tr.pdf",  # Heracles TR PDF path
    "documents/heracles_en.pdf",  # Heracles EN PDF path
    # Add more PDF paths here if you have them
    "documents/ogrenci_katki_payi_ogrenim_ucretleri.pdf",
    "documents/Ogrenci_Liderligi_Burs_Programi_Sozlesme_Metni_2024-2025.pdf",
    "documents/tmv-bursluluk-yonergesi.pdf",
]

# --- Main execution logic ----------------------------------------------------
if __name__ == "__main__":
    # Build the RAG system on top of the persistent ChromaDB store; the
    # './chroma_db' directory is created in the project root on first run.
    rag_system = DocumentRAG(
        embedding_model=embedding_model,
        persist_directory="./chroma_db",
        collection_name="pdf_documents_collection",
        chunk_size=700,
        overlap=100
    )

    # Index documents; ChromaDB persistence means only new/unindexed
    # documents are actually processed.
    print("\n--- Indexing Documents ---")
    for pdf_path in pdf_document_paths:
        if os.path.exists(pdf_path):
            rag_system.add_document(pdf_path)
        else:
            print(f"Error: PDF file not found at {pdf_path}. Please check the path.")

    # Interactive chat loop. Documents are already in ChromaDB, so an empty
    # path list is passed to answer_question.
    print("\n--- PDF Chat with DeepSeek (Type 'quit' to exit) ---")
    while True:
        user_question = input("\nYour question about the PDF(s): ")
        if user_question.lower() == 'quit':
            print("Exiting chat.")
            break
        rag_system.answer_question(user_question, [])
documents/Ogrenci_Liderligi_Burs_Programi_Sozlesme_Metni_2024-2025.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd1dc09026b3212985f74ff6c9f322f9c40883039911b034c84a879ddb1531bd
3
+ size 177096
documents/heracles_en.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:010a6ea71343b7d41f5b0a907f7487448e72c5dad1d37ab333e0b936d4bf5c4a
3
+ size 581483
documents/heracles_tr.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9879afb2856c8f8c5177ea1470607f5b4a45b49ffa6fd241b24e657a2f8e7a8
3
+ size 564377
documents/ogrenci_katki_payi_ogrenim_ucretleri.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb0d72f118f8cf4c8bf8c3212ced62de48c3808b4325ce473cbd1c4418594e8d
3
+ size 549275
documents/tmv-bursluluk-yonergesi.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa5e97a0f06393484000d42b9258ca68ecf823b777661064c44683dc7602963d
3
+ size 337112
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ requests
2
+ pdf2image
3
+ pytesseract
4
+ FlagEmbedding
5
+ python-dotenv
6
+ pdfminer.six
7
+ Pillow
8
+ faiss-cpu
9
+
10
+ chromadb
11
+ fastapi
12
+ uvicorn # For serving the FastAPI application
13
+
14
+ # System dependencies for Tesseract and Poppler on Linux
15
+ # Hugging Face Spaces uses apt-get for these
16
+ # apt_packages = python3-dev libtesseract-dev libleptonica-dev poppler-utils  (system packages for Tesseract/Poppler — install via apt/Dockerfile; not valid requirements.txt syntax, so kept commented out)