dembasowmr commited on
Commit
15d9931
·
1 Parent(s): d607228

Reorganized the project: -Documents hosted on Firestore db, -conversations saved

Browse files
.gitattributes CHANGED
@@ -1,2 +1 @@
1
  documents/*.pdf filter=lfs diff=lfs merge=lfs -text
2
- chroma_db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
 
1
  documents/*.pdf filter=lfs diff=lfs merge=lfs -text
 
.gitignore CHANGED
@@ -6,4 +6,10 @@ all-libraries.txt
6
  # Ignore ChromaDB persistent storage
7
  chroma_db/
8
 
9
- temp*
 
 
 
 
 
 
 
6
  # Ignore ChromaDB persistent storage
7
  chroma_db/
8
 
9
+ temp*
10
+
11
+ __pycache__/
12
+
13
+ atemp/
14
+
15
+ documents/
README.md CHANGED
@@ -24,7 +24,7 @@ It uses:
24
  ## How to Use:
25
  This Space exposes a `/compassia/` API endpoint. You can interact with it using `curl`, Postman, Insomnia, or by integrating it with your Next.js frontend.
26
 
27
- ### API Endpoint: `/ask-pdf/` (POST request)
28
  **Request Body (JSON):**
29
  ```json
30
  {
 
24
  ## How to Use:
25
  This Space exposes a `/compassia/` API endpoint. You can interact with it using `curl`, Postman, Insomnia, or by integrating it with your Next.js frontend.
26
 
27
+ ### API Endpoint: `/compassia/` (POST request)
28
  **Request Body (JSON):**
29
  ```json
30
  {
app.py CHANGED
@@ -1,67 +1,90 @@
1
- # app.py
2
-
3
  import os
4
  from fastapi import FastAPI, HTTPException
5
  from pydantic import BaseModel
6
  import uvicorn
7
  import sys
 
 
 
 
 
 
 
 
 
8
 
9
- # Add src to the Python path so we can import modules from it
10
- # This is crucial for deployment environments where 'src' might not be automatically recognized
11
- #sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), 'src')))
 
 
12
 
13
- # Import your DocumentRAG class and other necessary components from your backend script
14
- # Make sure your rag_backend.py has `embedding_model` defined globally or passed correctly
15
- from compassia import DocumentRAG, embedding_model, pdf_document_paths, extract_text_from_pdf, ocr_pdf, chunk_text # Import all needed functions/variables
16
 
17
  # --- Initialize the RAG system globally ---
18
- # This ensures the model loads and indexing happens once when the FastAPI app starts
19
- # and persists across requests within the same process.
20
- # ChromaDB will save its data to the './chroma_db' directory within the Space.
21
  print("--- FastAPI App Startup: Initializing RAG System ---")
22
  rag_system = DocumentRAG(
23
  embedding_model=embedding_model,
24
- persist_directory="./chroma_db", # ChromaDB will store data here in the Space
25
- collection_name="pdf_documents_collection",
26
- chunk_size=700, # Match your existing chunk size
27
- overlap=100 # Match your existing overlap
28
  )
29
 
30
  # --- Index documents on startup ---
31
  # This loop will run when the FastAPI app first starts.
32
  # It uses ChromaDB's persistence, so documents already indexed will be skipped.
33
- print("--- FastAPI App Startup: Indexing Documents (ChromaDB persistence) ---")
34
- for pdf_path in pdf_document_paths:
35
- # Ensure the path is correct relative to the Space's filesystem
36
- full_pdf_path = os.path.join(os.path.dirname(__file__), pdf_path)
37
- if os.path.exists(full_pdf_path):
38
- rag_system.add_document(full_pdf_path) # Call add_document on rag_system
39
- else:
40
- print(f"API Error: PDF file not found at {full_pdf_path}. Ensure it's deployed with your app in the 'documents' folder.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  print("--- FastAPI App Startup: Document indexing complete ---")
42
 
43
 
44
  # --- FastAPI Application Instance ---
45
  app = FastAPI(
46
  title="CompassIA",
47
- description="Backend API for querying PDFs using DeepSeek (via OpenRouter) and BGE-M3 embeddings.",
48
  version="0.1.0",
49
  )
50
 
51
  # Pydantic model for request body validation
52
  class QueryRequest(BaseModel):
53
  question: str
 
54
 
55
  # --- API Endpoint Definition ---
56
  @app.post("/compassia/")
57
  async def compassia_endpoint(request: QueryRequest):
58
  """
59
- Answers a question about the indexed PDF documents using RAG.
60
  """
61
  try:
62
- # Pass an empty list for pdf_paths as documents are already indexed in ChromaDB
63
- answer = rag_system.answer_question(request.question, [])
64
- return {"answer": answer}
65
  except Exception as e:
66
  print(f"Error processing /compassia/ request: {e}")
67
  raise HTTPException(status_code=500, detail=str(e))
@@ -70,9 +93,3 @@ async def compassia_endpoint(request: QueryRequest):
70
  @app.get("/")
71
  async def root():
72
  return {"message": "CompassIA API is running. Use /compassia/ for queries."}
73
-
74
- # You can run this locally for testing:
75
- # if __name__ == "__main__":
76
- # # This part runs locally if you execute app.py directly
77
- # # For deployment, uvicorn is typically run via a command line.
78
- # uvicorn.run(app, host="0.0.0.0", port=8000)
 
 
 
1
  import os
2
  from fastapi import FastAPI, HTTPException
3
  from pydantic import BaseModel
4
  import uvicorn
5
  import sys
6
+ import json
7
+ import base64
8
+
9
+ # Add the 'src' directory to the Python path
10
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), 'src')))
11
+
12
+ # Import components from the new modular structure, specifically from src.compassia
13
+ from src.config import CHROMADB_PERSIST_DIRECTORY, CHROMADB_COLLECTION_NAME # Removed PDF_DOCUMENT_PATHS as it's handled in rag_system
14
+ from src.compassia import DocumentRAG, embedding_model, initialize_firebase_client, db_firestore # Import db_firestore and related functions from compassia.py
15
 
16
+ # --- Firebase Initialization (Global, once per process) ---
17
+ # Initialize Firebase Admin SDK using a secret from Hugging Face Spaces
18
+ # This function is now called directly from app.py startup.
19
+ # db_firestore is already imported and will be set by initialize_firebase_client()
20
+ initialize_firebase_client()
21
 
 
 
 
22
 
23
  # --- Initialize the RAG system globally ---
 
 
 
24
  print("--- FastAPI App Startup: Initializing RAG System ---")
25
  rag_system = DocumentRAG(
26
  embedding_model=embedding_model,
27
+ persist_directory=CHROMADB_PERSIST_DIRECTORY,
28
+ collection_name=CHROMADB_COLLECTION_NAME
 
 
29
  )
30
 
31
  # --- Index documents on startup ---
32
  # This loop will run when the FastAPI app first starts.
33
  # It uses ChromaDB's persistence, so documents already indexed will be skipped.
34
+ # Now fetches document URLs directly from Firestore using db_firestore
35
+ print("--- FastAPI App Startup: Indexing Documents from Firestore ---")
36
+ if db_firestore:
37
+ try:
38
+ docs_ref = db_firestore.collection('documents').stream()
39
+ firestore_pdf_infos = []
40
+ for doc in docs_ref:
41
+ doc_data = doc.to_dict()
42
+ if 'fileUrl' in doc_data and doc_data['fileUrl'].endswith('.pdf'):
43
+ pdf_url = doc_data['fileUrl']
44
+ display_name = doc_data.get('name_en', os.path.basename(pdf_url))
45
+ firestore_pdf_infos.append({"url": pdf_url, "name": display_name})
46
+ print(f"Found PDF in Firestore: {display_name} ({pdf_url})")
47
+
48
+ if firestore_pdf_infos:
49
+ for pdf_info in firestore_pdf_infos:
50
+ rag_system.add_document(pdf_info['url'], pdf_info['name'])
51
+ else:
52
+ print("No PDF documents found in Firestore collection 'documents'.")
53
+ except Exception as e:
54
+ print(f"API Error: Error fetching documents from Firestore: {e}")
55
+ print("Please ensure your Firestore database is accessible and the service account key (FIREBASE_CONFIG_BASE64 secret) is correct.")
56
+ # If document fetching fails, consider if the app should still start or crash.
57
+ # For now, it will print the error but continue to try to start the API.
58
+ else:
59
+ print("API Error: Firestore not initialized. Cannot fetch documents from Firestore on startup.")
60
+ print("Ensure FIREBASE_CONFIG_BASE64 secret is correctly set.")
61
+
62
+
63
  print("--- FastAPI App Startup: Document indexing complete ---")
64
 
65
 
66
  # --- FastAPI Application Instance ---
67
  app = FastAPI(
68
  title="CompassIA",
69
+ description="Backend API for querying PDFs using DeepSeek (via OpenRouter) and BGE-M3 embeddings, with conversational memory.",
70
  version="0.1.0",
71
  )
72
 
73
  # Pydantic model for request body validation
74
  class QueryRequest(BaseModel):
75
  question: str
76
+ conversation_id: str = None # Optional for new conversations
77
 
78
  # --- API Endpoint Definition ---
79
  @app.post("/compassia/")
80
  async def compassia_endpoint(request: QueryRequest):
81
  """
82
+ Answers a question about the indexed PDF documents using RAG, with conversational memory.
83
  """
84
  try:
85
+ # Pass conversation_id to the answer_question function
86
+ answer = rag_system.answer_question(request.question, conversation_id=request.conversation_id)
87
+ return {"answer": answer, "conversation_id": request.conversation_id}
88
  except Exception as e:
89
  print(f"Error processing /compassia/ request: {e}")
90
  raise HTTPException(status_code=500, detail=str(e))
 
93
  @app.get("/")
94
  async def root():
95
  return {"message": "CompassIA API is running. Use /compassia/ for queries."}
 
 
 
 
 
 
compassia.py DELETED
@@ -1,385 +0,0 @@
1
- import sys
2
- # IMPORTANT: These lines MUST be at the very top of compassia.py
3
- # They ensure that any subsequent import of 'sqlite3' (even indirectly by chromadb)
4
- # will use the version provided by pysqlite3-binary.
5
- try:
6
- import pysqlite3
7
- sys.modules['sqlite3'] = pysqlite3
8
- except ImportError:
9
- pass # Fallback if pysqlite3 isn't available, but it should be in Docker
10
-
11
- import requests
12
- import os
13
- import io
14
- import re
15
- import uuid # For generating unique IDs for ChromaDB
16
- from PIL import Image
17
-
18
- # For text extraction from PDFs (non-OCR)
19
- from pdfminer.high_level import extract_text_to_fp
20
- from pdfminer.layout import LAParams
21
-
22
- # For image-based PDFs (OCR)
23
- from pdf2image import convert_from_path
24
- import pytesseract
25
-
26
- # For embeddings and vector search
27
- from FlagEmbedding import BGEM3FlagModel
28
- import chromadb # pip install chromadb
29
-
30
- # --- IMPORTANT: Configure Paths for Tesseract and Poppler ---
31
- # If Tesseract is not in your system's PATH, uncomment and set this locally:
32
- # pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
33
-
34
- # If pdf2image gives errors about poppler, uncomment and set this locally:
35
- # poppler_path = r'C:\path\to\poppler\bin'
36
-
37
- # --- OpenRouter DeepSeek API Configuration ---
38
- API_KEY = os.getenv("DEEPSEEK_R1_V3_API_KEY")
39
- if API_KEY:
40
- API_KEY = API_KEY.strip()
41
-
42
- if not API_KEY:
43
- raise ValueError("API key is not set. Please set the DEEPSEEK_R1_V3_API_KEY environment variable with your OpenRouter key.")
44
-
45
- API_URL = 'https://openrouter.ai/api/v1/chat/completions'
46
- HEADERS = {
47
- 'Authorization': f'Bearer {API_KEY}',
48
- 'Content-Type': 'application/json'
49
- }
50
-
51
- # --- Embedding Model Configuration (Local BGE-M3) ---
52
- print("Loading FlagEmbedding (BGE-M3) model...")
53
- try:
54
- embedding_model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)
55
- print("FlagEmbedding (BGE-M3) model loaded successfully.")
56
- except Exception as e:
57
- print(f"Error loading FlagEmbedding model: {e}")
58
- print("Ensure you have resolved disk space issues for model download and have enough memory.")
59
- print("You might need to adjust 'use_fp16' based on your hardware (e.g., False for CPU/older GPUs).")
60
- exit(1)
61
-
62
- # --- PDF Processing Functions ---
63
-
64
- def extract_text_from_pdf(pdf_path: str) -> str:
65
- """
66
- Extracts text from a PDF. Tries direct text extraction first.
67
- If sparse text is found (suggesting image-based PDF), it performs OCR.
68
- """
69
- print(f"Attempting direct text extraction from: {pdf_path}")
70
- output_string = io.StringIO()
71
- with open(pdf_path, 'rb') as fp:
72
- try:
73
- extract_text_to_fp(fp, output_string, laparams=LAParams())
74
- text = output_string.getvalue()
75
- if len(text.strip()) < 100 and os.path.getsize(pdf_path) > 10000:
76
- print("Direct extraction yielded sparse text. Attempting OCR...")
77
- return ocr_pdf(pdf_path)
78
- return text
79
- except Exception as e:
80
- print(f"Direct PDF text extraction failed ({e}). Attempting OCR...")
81
- return ocr_pdf(pdf_path)
82
-
83
- def ocr_pdf(pdf_path: str) -> str:
84
- """
85
- Performs OCR on a PDF file using pdf2image and pytesseract.
86
- Requires Tesseract and Poppler to be installed and in system PATH.
87
- """
88
- all_text = []
89
- try:
90
- images = convert_from_path(pdf_path, dpi=300)
91
-
92
- print(f" Performing OCR on {len(images)} pages...")
93
- for i, img in enumerate(images):
94
- # Tesseract language packs:
95
- # 'eng' for English, 'tur' for Turkish
96
- # If you have scanned PDFs in Arabic or French, you MUST install
97
- # 'tesseract-ocr-ara' and 'tesseract-ocr-fra' in your Dockerfile
98
- # and change 'lang' to 'eng+tur+ara+fra'.
99
- page_text = pytesseract.image_to_string(img, lang='eng+tur')
100
- all_text.append(page_text)
101
- print(f" Page {i+1} OCR complete.")
102
-
103
- except Exception as e:
104
- print(f"OCR process failed: {e}")
105
- print("Please ensure Tesseract OCR and Poppler are correctly installed and their executables are in your system's PATH.")
106
- return ""
107
-
108
- return "\n".join(all_text)
109
-
110
- def chunk_text(text: str, max_chunk_size: int = 700, overlap: int = 100) -> list[str]:
111
- """
112
- Splits text into chunks of a maximum size with optional overlap.
113
- Aims to split by paragraphs/sentences first, then by word.
114
- """
115
- if not text:
116
- return []
117
-
118
- paragraphs = re.split(r'\n\s*\n', text)
119
- chunks = []
120
- current_chunk = []
121
- current_chunk_len = 0
122
-
123
- for para in paragraphs:
124
- if not para.strip():
125
- continue
126
-
127
- if current_chunk_len + len(para) + len('\n\n') > max_chunk_size:
128
- if current_chunk:
129
- chunks.append("\n\n".join(current_chunk))
130
- current_chunk = []
131
- current_chunk_len = 0
132
-
133
- if len(para) > max_chunk_size:
134
- words = para.split(' ')
135
- sub_chunk = []
136
- sub_chunk_len = 0
137
- for word in words:
138
- if sub_chunk_len + len(word) + len(' ') > max_chunk_size:
139
- chunks.append(" ".join(sub_chunk))
140
- sub_chunk = [word]
141
- sub_chunk_len = len(word)
142
- else:
143
- sub_chunk.append(word)
144
- sub_chunk_len += len(word) + len(' ')
145
- if sub_chunk:
146
- chunks.append(" ".join(sub_chunk))
147
- else:
148
- current_chunk.append(para)
149
- current_chunk_len += len(para) + len('\n\n')
150
- else:
151
- current_chunk.append(para)
152
- current_chunk_len += len(para) + len('\n\n')
153
-
154
- if current_chunk:
155
- chunks.append("\n\n".join(current_chunk))
156
-
157
- final_chunks_with_overlap = []
158
- for i in range(len(chunks)):
159
- chunk = chunks[i]
160
- if i > 0 and overlap > 0:
161
- prev_chunk_part = chunks[i-1][-overlap:]
162
- chunk = prev_chunk_part + "\n" + chunk
163
- final_chunks_with_overlap.append(chunk)
164
-
165
- return final_chunks_with_overlap
166
-
167
- # --- RAG Core Functions with ChromaDB ---
168
-
169
- class DocumentRAG:
170
- def __init__(self, embedding_model, persist_directory="./chroma_db", collection_name="pdf_docs", chunk_size=700, overlap=100):
171
- self.embedding_model = embedding_model
172
- self.chunk_size = chunk_size
173
- self.overlap = overlap
174
- self.persist_directory = persist_directory
175
- self.collection_name = collection_name
176
-
177
- print(f"Initializing ChromaDB at: {self.persist_directory}")
178
- self.client = chromadb.PersistentClient(path=self.persist_directory)
179
-
180
- self.collection = self.client.get_or_create_collection(
181
- name=self.collection_name,
182
- metadata={"hnsw:space": "cosine"}
183
- )
184
- print(f"ChromaDB collection '{self.collection_name}' ready.")
185
-
186
- def _generate_chunk_id(self, pdf_path: str, chunk_idx: int) -> str:
187
- return f"{os.path.basename(pdf_path)}_{chunk_idx}_{uuid.uuid4().hex}"
188
-
189
- def add_document(self, pdf_path: str):
190
- print(f"Adding document: {pdf_path}")
191
-
192
- results = self.collection.get(
193
- where={"source": pdf_path},
194
- limit=1
195
- )
196
- if results and results['ids']:
197
- print(f" Document '{pdf_path}' already in ChromaDB. Skipping re-indexing.")
198
- return
199
-
200
- extracted_text = extract_text_from_pdf(pdf_path)
201
- if not extracted_text:
202
- print(f"Warning: No text extracted from {pdf_path}. Skipping.")
203
- return
204
-
205
- chunks = chunk_text(extracted_text, self.chunk_size, self.overlap)
206
- if not chunks:
207
- print(f"Warning: No chunks generated for {pdf_path}. Skipping.")
208
- return
209
-
210
- documents_to_add = []
211
- metadatas_to_add = []
212
- ids_to_add = []
213
-
214
- print(f" Generating embeddings for {len(chunks)} chunks and preparing for ChromaDB...")
215
-
216
- encoded_results = self.embedding_model.encode(
217
- chunks,
218
- batch_size=32,
219
- return_dense=True,
220
- return_sparse=False,
221
- return_colbert_vecs=False
222
- )
223
-
224
- chunk_embeddings = encoded_results["dense_vecs"]
225
-
226
- for i, chunk in enumerate(chunks):
227
- unique_id = self._generate_chunk_id(pdf_path, i)
228
- documents_to_add.append(chunk)
229
- metadatas_to_add.append({"source": pdf_path, "chunk_id": i})
230
- ids_to_add.append(unique_id)
231
-
232
- self.collection.add(
233
- documents=documents_to_add,
234
- embeddings=chunk_embeddings.tolist(),
235
- metadatas=metadatas_to_add,
236
- ids=ids_to_add
237
- )
238
-
239
- print(f" {len(documents_to_add)} chunks from '{pdf_path}' added to ChromaDB.")
240
- print(f" Total chunks in collection: {self.collection.count()}")
241
-
242
- def retrieve_context(self, query: str, top_k: int = 3) -> list[str]:
243
- """
244
- Retrieves top_k most relevant document chunks for a given query from ChromaDB.
245
- """
246
- if self.collection.count() == 0:
247
- print("Error: No documents indexed in ChromaDB. Cannot retrieve context.")
248
- return []
249
-
250
- print(f"Retrieving context for query: '{query}'")
251
-
252
- query_embedding_result = self.embedding_model.encode(
253
- [query],
254
- batch_size=1,
255
- return_dense=True,
256
- return_sparse=False,
257
- return_colbert_vecs=False
258
- )
259
- query_embedding = query_embedding_result["dense_vecs"].tolist()
260
-
261
- results = self.collection.query(
262
- query_embeddings=query_embedding,
263
- n_results=top_k,
264
- include=['documents', 'distances', 'metadatas']
265
- )
266
-
267
- retrieved_chunks_texts = []
268
- if results and results['documents']:
269
- for i, doc_text in enumerate(results['documents'][0]):
270
- source_info = results['metadatas'][0][i].get('source', 'Unknown Source')
271
- chunk_id_info = results['metadatas'][0][i].get('chunk_id', 'N/A')
272
- distance_info = results['distances'][0][i]
273
-
274
- retrieved_chunks_texts.append(doc_text)
275
- print(f" Retrieved chunk {i+1} (distance: {distance_info:.4f}) from '{source_info}' (chunk {chunk_id_info}).")
276
- else:
277
- print(" No relevant chunks found in ChromaDB.")
278
-
279
- return retrieved_chunks_texts
280
-
281
- def answer_question(self, question: str, pdf_paths: list[str]) -> str:
282
- """
283
- Answers a question by ensuring PDFs are indexed, retrieving context,
284
- and querying DeepSeek.
285
- """
286
- for path in pdf_paths:
287
- self.add_document(path)
288
-
289
- context_chunks = self.retrieve_context(question)
290
- context = "\n\n".join(context_chunks)
291
-
292
- if not context:
293
- print("Warning: No relevant context found. Answering based on general knowledge or indicating lack of information.")
294
- context_prompt = ""
295
- else:
296
- context_prompt = f"Using the following context:\n\n{context}\n\n"
297
-
298
- # --- UPDATED SYSTEM PROMPT FOR COMPASSIA AI ---
299
- system_prompt = """
300
- You are CompassIA, the intelligent assistant for MaarifCompass, committed to supporting Turkiye Maarif Foundation graduates residing in Turkiye.
301
-
302
- Your core function is to deliver precise, document-backed information concerning their needs, primarily focusing on:
303
- - University application procedures, requirements, tuition fees, and scholarship opportunities
304
- - Accommodation and housing resources
305
- - Career networking and professional development
306
- - Relevant administrative and support services.
307
- - Information related to Turkiye Maarif Foundation, Turkiye Scholarship and more.
308
-
309
- You operate exclusively with data from a designated Document Center. **It is imperative that every piece of information you provide is directly sourced and verifiable from these internal documents.**
310
-
311
- **Should a query fall outside the scope of the provided documents or lack a direct answer within them, you are required to politely inform the user that the specific information is not available in your current knowledge base, without offering any external insights or assumptions.**
312
-
313
- Your answers should be highly accurate, directly relevant, easy to understand, and always prioritize the user's query based strictly on documented facts.
314
- **Remember, you always answer the user with the language of the question.**
315
- """
316
-
317
- messages = [
318
- {"role": "system", "content": system_prompt},
319
- {"role": "user", "content": f"{context_prompt}Question: {question}"}
320
- ]
321
-
322
- print("\nSending request to DeepSeek API...")
323
- data = {
324
- "model": "deepseek/deepseek-chat:free",
325
- "messages": messages,
326
- "temperature": 0.5,
327
- "max_tokens": 500,
328
- }
329
-
330
- response = requests.post(API_URL, json=data, headers=HEADERS)
331
-
332
- if response.status_code == 200:
333
- ai_response = response.json()
334
- answer = ai_response['choices'][0]['message']['content']
335
- print("\nDeepSeek Response:")
336
- print(answer)
337
- return answer
338
- else:
339
- error_message = f"Failed to fetch data from DeepSeek API. Status Code: {response.status_code}. Response: {response.text}"
340
- print(error_message)
341
- return f"Error: Could not get an answer from the AI. Details: {error_message}"
342
-
343
- # --- Define PDF documents (MOVED TO GLOBAL SCOPE) ---
344
- pdf_document_paths = [
345
- "documents/heracles_tr.pdf",
346
- "documents/heracles_en.pdf",
347
- "documents/ogrenci_katki_payi_ogrenim_ucretleri.pdf",
348
- "documents/Ogrenci_Liderligi_Burs_Programi_Sozlesme_Metni_2024-2025.pdf",
349
- "documents/tmv-bursluluk-yonergesi.pdf"
350
- ]
351
-
352
-
353
- # --- Main execution logic ---
354
- if __name__ == "__main__":
355
- rag_system = DocumentRAG(
356
- embedding_model=embedding_model,
357
- persist_directory="./chroma_db",
358
- collection_name="pdf_documents_collection",
359
- chunk_size=700,
360
- overlap=100
361
- )
362
- """
363
- pdf_document_paths = [
364
- "documents/heracles_tr.pdf",
365
- "documents/heracles_en.pdf",
366
- "documents/ogrenci_katki_payi_ogrenim_ucretleri.pdf",
367
- "documents/Ogrenci_Liderligi_Burs_Programi_Sozlesme_Metni_2024-2025.pdf",
368
- "documents/tmv-bursluluk-yonergesi.pdf"
369
- ]
370
- """
371
- print("\n--- Indexing Documents ---")
372
- for pdf_path in pdf_document_paths:
373
- if os.path.exists(pdf_path):
374
- rag_system.add_document(pdf_path)
375
- else:
376
- print(f"Error: PDF file not found at {pdf_path}. Please check the path.")
377
-
378
- print("\n--- Chat With CompassIA (Type 'quit' to exit) ---")
379
- while True:
380
- user_question = input("\nHow can I help you? ")
381
- if user_question.lower() == 'quit':
382
- print("Exiting chat.")
383
- break
384
-
385
- rag_system.answer_question(user_question, pdf_document_paths)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
documents/Ogrenci_Liderligi_Burs_Programi_Sozlesme_Metni_2024-2025.pdf DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd1dc09026b3212985f74ff6c9f322f9c40883039911b034c84a879ddb1531bd
3
- size 177096
 
 
 
 
documents/heracles_en.pdf DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:010a6ea71343b7d41f5b0a907f7487448e72c5dad1d37ab333e0b936d4bf5c4a
3
- size 581483
 
 
 
 
documents/heracles_tr.pdf DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e9879afb2856c8f8c5177ea1470607f5b4a45b49ffa6fd241b24e657a2f8e7a8
3
- size 564377
 
 
 
 
documents/ogrenci_katki_payi_ogrenim_ucretleri.pdf DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb0d72f118f8cf4c8bf8c3212ced62de48c3808b4325ce473cbd1c4418594e8d
3
- size 549275
 
 
 
 
documents/tmv-bursluluk-yonergesi.pdf DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa5e97a0f06393484000d42b9258ca68ecf823b777661064c44683dc7602963d
3
- size 337112
 
 
 
 
requirements.txt CHANGED
@@ -10,9 +10,11 @@ faiss-cpu
10
  chromadb
11
  fastapi
12
  uvicorn # For serving the FastAPI application
13
- pysqlite3-binary
14
 
15
  # System dependencies for Tesseract and Poppler on Linux
16
  # Hugging Face Spaces uses apt-get for these
17
  #apt_packages = python3-dev libtesseract-dev libleptonica-dev poppler-utils
18
 
 
 
 
10
  chromadb
11
  fastapi
12
  uvicorn # For serving the FastAPI application
13
+ #pysqlite3-binary
14
 
15
  # System dependencies for Tesseract and Poppler on Linux
16
  # Hugging Face Spaces uses apt-get for these
17
  #apt_packages = python3-dev libtesseract-dev libleptonica-dev poppler-utils
18
 
19
+ firebase-admin
20
+ firebase
src/compassia.py ADDED
@@ -0,0 +1,464 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ # IMPORTANT: These lines MUST be at the very top of compassia.py
3
+ # They ensure that any subsequent import of 'sqlite3' (even indirectly by chromadb)
4
+ # will use the version provided by pysqlite3-binary.
5
+ """
6
+ try:
7
+ import pysqlite3
8
+ sys.modules['sqlite3'] = pysqlite3
9
+ except ImportError:
10
+ # This should not happen in the Docker environment as pysqlite3-binary is in requirements.txt
11
+ print("Warning: pysqlite3 not found. Falling back to default sqlite3. ChromaDB might fail if it's too old.")
12
+ pass
13
+ """
14
+ import requests
15
+ import os
16
+ import uuid
17
+ import json
18
+ import base64 # For decoding Firebase config
19
+ import hashlib # For hashing URLs for chunk IDs
20
+ import urllib.parse # For parsing URLs
21
+ import io # Import the io module for BytesIO
22
+
23
+ # Firebase Admin SDK for Firestore
24
+ import firebase_admin
25
+ from firebase_admin import credentials, firestore
26
+
27
+ # For embeddings and vector search
28
+ from FlagEmbedding import BGEM3FlagModel
29
+ import chromadb
30
+
31
+ # Import configurations and prompt from local modules
32
+ # These imports assume 'src' directory is on the Python path or script is run from 'src'
33
+ from config import (
34
+ DEEPSEEK_API_URL, DEEPSEEK_HEADERS,
35
+ EMBEDDING_MODEL_NAME, EMBEDDING_MODEL_USE_FP16,
36
+ CHROMADB_PERSIST_DIRECTORY, CHROMADB_COLLECTION_NAME,
37
+ CHUNK_SIZE, CHUNK_OVERLAP,
38
+ LLM_TEMPERATURE, LLM_MAX_TOKENS, LLM_HISTORY_MAX_TOKENS,
39
+ FIREBASE_CONFIG_BASE64
40
+ )
41
+ from pdf_processing import extract_text_from_pdf, chunk_text # Import functions from pdf_processing
42
+ from prompt import SYSTEM_PROMPT # Import the system prompt
43
+
44
+
45
+ # --- Global Firebase Firestore Client ---
46
+ FIRESTORE_DATABASE = None
47
+
48
+ def initialize_firebase_client():
49
+ """Initializes Firebase Admin SDK and returns the Firestore client."""
50
+ global FIRESTORE_DATABASE
51
+ # Check if Firebase Admin SDK is already initialized
52
+ if not firebase_admin._apps:
53
+ # Only attempt to initialize if FIREBASE_CONFIG_BASE64 is provided
54
+ if FIREBASE_CONFIG_BASE64:
55
+ try:
56
+ cred_json = base64.b64decode(FIREBASE_CONFIG_BASE64).decode('utf-8')
57
+ cred_dict = json.loads(cred_json)
58
+ cred = credentials.Certificate(cred_dict)
59
+ firebase_admin.initialize_app(cred)
60
+ print("Firebase Admin SDK initialized successfully.")
61
+ FIRESTORE_DATABASE = firestore.client()
62
+ print("Firestore client initialized successfully.")
63
+ return FIRESTORE_DATABASE
64
+ except Exception as e:
65
+ print(f"Error initializing Firebase Admin SDK: {e}")
66
+ print("Please ensure FIREBASE_CONFIG_BASE64 is correctly set and is a valid Base64-encoded Service Account JSON.")
67
+ FIRESTORE_DATABASE = None
68
+ return None
69
+ else:
70
+ print("Warning: FIREBASE_CONFIG_BASE64 environment variable not found. Firestore will not be available.")
71
+ FIRESTORE_DATABASE = None
72
+ return None
73
+ else: # Already initialized
74
+ print("Firebase Admin SDK already initialized.")
75
+ FIRESTORE_DATABASE = firestore.client() # Ensure global variable is set if already initialized
76
+ return FIRESTORE_DATABASE
77
+
78
+ # --- Embedding Model Initialization ---
79
+ print("Loading FlagEmbedding (BGE-M3) model...")
80
+ try:
81
+ embedding_model = BGEM3FlagModel(EMBEDDING_MODEL_NAME, use_fp16=EMBEDDING_MODEL_USE_FP16)
82
+ print("FlagEmbedding (BGE-M3) model loaded successfully.")
83
+ except Exception as e:
84
+ print(f"Error loading FlagEmbedding model: {e}")
85
+ print("Ensure disk space and memory are sufficient for model download.")
86
+ print("You might need to adjust 'use_fp16' based on your hardware (e.g., False for CPU/older GPUs).")
87
+ exit(1)
88
+
89
+
90
class DocumentRAG:
    """Retrieval-augmented generation (RAG) pipeline over PDF documents.

    Responsibilities:
      * download PDFs from URLs, extract their text, chunk it, and index the
        chunks (dense BGE-M3 embeddings) into a persistent ChromaDB collection;
      * retrieve the most relevant chunks for a user query;
      * answer questions via the DeepSeek API (through OpenRouter), with
        optional conversation memory persisted in Firestore.
    """

    def __init__(self, embedding_model, persist_directory=CHROMADB_PERSIST_DIRECTORY, collection_name=CHROMADB_COLLECTION_NAME):
        """Set up the embedding model, chunking parameters and the ChromaDB store.

        Args:
            embedding_model: loaded BGE-M3 model exposing ``encode``.
            persist_directory: on-disk location of the ChromaDB database.
            collection_name: ChromaDB collection to use or create.
        """
        self.embedding_model = embedding_model
        self.persist_directory = persist_directory
        self.collection_name = collection_name
        self.chunk_size = CHUNK_SIZE
        self.overlap = CHUNK_OVERLAP

        print(f"Initializing ChromaDB at: {self.persist_directory}")
        self.client = chromadb.PersistentClient(path=self.persist_directory)

        # Cosine space matches the dense vectors produced by BGE-M3.
        self.collection = self.client.get_or_create_collection(
            name=self.collection_name,
            metadata={"hnsw:space": "cosine"}
        )
        print(f"ChromaDB collection '{self.collection_name}' ready. Total chunks: {self.collection.count()}")

    def _generate_chunk_id(self, pdf_url: str, chunk_idx: int) -> str:
        """Generates a unique ID for each chunk based on PDF URL and index."""
        import hashlib
        url_hash = hashlib.sha256(pdf_url.encode()).hexdigest()[:10]
        # The uuid suffix guarantees uniqueness even if the same chunk index
        # of the same URL is ever added twice.
        return f"{url_hash}_{chunk_idx}_{uuid.uuid4().hex}"

    def add_document(self, pdf_url: str, document_name: str = None):
        """Download a PDF from *pdf_url*, extract, chunk, embed and index it.

        Skips the document when chunks with the same ``source`` URL already
        exist in the collection. Errors are reported and swallowed so a single
        bad document does not abort bulk indexing.
        """
        # Determine display name from parsed URL path if not provided.
        parsed_url_path = urllib.parse.urlparse(pdf_url).path
        display_name = document_name if document_name else os.path.basename(parsed_url_path)
        print(f"Adding document from URL: {pdf_url} (Display Name: {display_name})")

        # Dedupe: one existing chunk with this source URL means it was indexed.
        results = self.collection.get(
            where={"source": pdf_url},
            limit=1
        )
        if results and results['ids']:
            print(f"  Document '{display_name}' (from {pdf_url}) already in ChromaDB. Skipping re-indexing.")
            return

        try:
            response = requests.get(pdf_url, stream=True)
            print(f"  DEBUG: HTTP Status Code for {pdf_url}: {response.status_code}")
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)

            pdf_data = io.BytesIO(response.content)
            print(f"  DEBUG: BytesIO content length for {pdf_url}: {pdf_data.getbuffer().nbytes} bytes")

            if pdf_data.getbuffer().nbytes == 0:
                raise ValueError("Downloaded PDF content is empty.")

            temp_pdf_path = f"/tmp/{uuid.uuid4().hex}.pdf"
            os.makedirs(os.path.dirname(temp_pdf_path), exist_ok=True)  # Ensure /tmp exists

            with open(temp_pdf_path, 'wb') as f:
                f.write(pdf_data.getvalue())
            print(f"  DEBUG: Temporary PDF saved to: {temp_pdf_path}")

            # BUGFIX: delete the temp file even when extraction raises.
            # Previously os.remove() ran only on success, leaking files in /tmp.
            try:
                extracted_text = extract_text_from_pdf(temp_pdf_path)
            finally:
                os.remove(temp_pdf_path)

        except requests.exceptions.RequestException as e:
            print(f"Error downloading PDF from {pdf_url}: {e}")
            return
        except ValueError as e:  # Empty download content.
            print(f"Error processing downloaded PDF {pdf_url}: {e}")
            return
        except Exception as e:  # Any other file-op / extraction failure.
            print(f"Error processing downloaded PDF {pdf_url}: {e}")
            return

        if not extracted_text:
            print(f"Warning: No text extracted from {display_name} ({pdf_url}). Skipping.")
            return

        chunks = chunk_text(extracted_text, self.chunk_size, self.overlap)
        if not chunks:
            print(f"Warning: No chunks generated for {display_name} ({pdf_url}). Skipping.")
            return

        documents_to_add = []
        metadatas_to_add = []
        ids_to_add = []

        print(f"  Generating embeddings for {len(chunks)} chunks and preparing for ChromaDB: {display_name}...")

        # Only dense vectors are needed for cosine retrieval in ChromaDB.
        encoded_results = self.embedding_model.encode(
            chunks,
            batch_size=32,
            return_dense=True,
            return_sparse=False,
            return_colbert_vecs=False
        )
        chunk_embeddings = encoded_results["dense_vecs"]

        for i, chunk in enumerate(chunks):
            unique_id = self._generate_chunk_id(pdf_url, i)
            documents_to_add.append(chunk)
            metadatas_to_add.append({"source": pdf_url, "display_name": display_name, "chunk_id": i})
            ids_to_add.append(unique_id)

        self.collection.add(
            documents=documents_to_add,
            embeddings=chunk_embeddings.tolist(),
            metadatas=metadatas_to_add,
            ids=ids_to_add
        )

        print(f"  {len(documents_to_add)} chunks from '{display_name}' added to ChromaDB.")
        print(f"  Total chunks in collection: {self.collection.count()}")

    def retrieve_context(self, query: str, top_k: int = 3) -> list[dict]:
        """
        Retrieves top_k most relevant document chunks for a given query from ChromaDB.
        Returns a list of dictionaries, each containing 'text', 'source_url' and
        'display_name' (empty list when nothing is indexed or found).
        """
        if self.collection.count() == 0:
            print("Error: No documents indexed in ChromaDB. Cannot retrieve context.")
            return []

        print(f"Retrieving context for query: '{query}'")

        query_embedding_result = self.embedding_model.encode(
            [query],
            batch_size=1,
            return_dense=True,
            return_sparse=False,
            return_colbert_vecs=False
        )
        query_embedding = query_embedding_result["dense_vecs"].tolist()

        results = self.collection.query(
            query_embeddings=query_embedding,
            n_results=top_k,
            include=['documents', 'distances', 'metadatas']
        )

        retrieved_chunks_info = []
        if results and results['documents']:
            for i, doc_text in enumerate(results['documents'][0]):
                source_url = results['metadatas'][0][i].get('source', 'Unknown URL')
                display_name = results['metadatas'][0][i].get('display_name', os.path.basename(urllib.parse.urlparse(source_url).path))
                chunk_id_info = results['metadatas'][0][i].get('chunk_id', 'N/A')
                distance_info = results['distances'][0][i]

                retrieved_chunks_info.append({
                    "text": doc_text,
                    "source_url": source_url,
                    "display_name": display_name
                })
                print(f"  Retrieved chunk {i+1} (distance: {distance_info:.4f}) from '{display_name}' (chunk {chunk_id_info}).")
        else:
            print("  No relevant chunks found in ChromaDB.")

        return retrieved_chunks_info

    def get_conversation_history(self, conversation_id: str) -> list[dict]:
        """Loads chat history from Firestore for a given conversation ID."""
        if FIRESTORE_DATABASE is None:
            print("Firestore not initialized. Cannot load conversation history.")
            return []

        doc_ref = FIRESTORE_DATABASE.collection('conversations').document(conversation_id)
        doc = doc_ref.get()
        if doc.exists:
            history = doc.to_dict().get('messages', [])
            print(f"Loaded history for {conversation_id}: {len(history)} messages.")
            return history
        print(f"No history found for conversation ID: {conversation_id}")
        return []

    def save_conversation_history(self, conversation_id: str, history: list[dict]):
        """Saves chat history to Firestore for a given conversation ID."""
        if FIRESTORE_DATABASE is None:
            print("Firestore not initialized. Cannot save conversation history.")
            return

        doc_ref = FIRESTORE_DATABASE.collection('conversations').document(conversation_id)
        doc_ref.set({'messages': history})
        print(f"Saved history for {conversation_id}: {len(history)} messages.")

    def truncate_history(self, messages: list[dict], max_tokens: int = LLM_HISTORY_MAX_TOKENS) -> list[dict]:
        """Trim oldest user/assistant message pairs until under *max_tokens*.

        NOTE: "tokens" are approximated by character counts, not a real
        tokenizer. A leading system message is always preserved. Mutates and
        returns *messages*.
        """
        current_len = sum(len(m['content']) for m in messages)
        while current_len > max_tokens and len(messages) > 1:  # Keep at least 1 message
            if messages[0]['role'] == 'system':
                if len(messages) >= 3:
                    # Drop the oldest user+assistant pair after the system prompt.
                    removed_user_msg = messages.pop(1)
                    removed_ai_msg = messages.pop(1)
                    current_len -= (len(removed_user_msg['content']) + len(removed_ai_msg['content']))
                else:
                    break  # Only the system prompt (+ one message) left.
            else:
                removed_user_msg = messages.pop(0)
                removed_ai_msg = messages.pop(0)
                current_len -= (len(removed_user_msg['content']) + len(removed_ai_msg['content']))
        return messages

    def answer_question(self, question: str, conversation_id: str = None) -> str:
        """Answer *question* using retrieved context and the DeepSeek API.

        When *conversation_id* is given, prior messages are loaded from
        Firestore, included (truncated) in the prompt, and the exchange is
        saved back afterwards. Returns the answer text (with a Sources
        section when context was used) or an error string.
        """
        # Note: Document indexing is handled at FastAPI app startup for persistence.

        # Get relevant context from ChromaDB
        context_chunks_info = self.retrieve_context(question)

        context_parts = []
        citation_info = {}  # dict used as an insertion-ordered set of source names

        for chunk_info in context_chunks_info:
            context_parts.append(chunk_info["text"])
            source_key = chunk_info.get("display_name", chunk_info["source_url"])
            if source_key not in citation_info:
                citation_info[source_key] = True

        context = "\n\n".join(context_parts)

        context_prompt = ""
        if context:
            context_prompt = f"Using the following context:\n\n{context}\n\n"
        else:
            print("Warning: No relevant context found. Answering based on general knowledge or indicating lack of information.")

        # --- Handle Conversational Memory ---
        messages = [{"role": "system", "content": SYSTEM_PROMPT}]

        if conversation_id:
            history = self.get_conversation_history(conversation_id)
            if history:
                # BUGFIX: the saved history already starts with the system
                # prompt; extending with it verbatim duplicated the system
                # message on every turn. Filter system entries out.
                messages.extend(m for m in history if m.get('role') != 'system')

        # Add current context and question
        messages.append({"role": "user", "content": f"{context_prompt}Question: {question}"})

        # Truncate conversation history if it's too long
        messages = self.truncate_history(messages)

        # Call DeepSeek API via OpenRouter
        print("\nSending request to DeepSeek API...")
        data = {
            "model": "deepseek/deepseek-chat:free",
            "messages": messages,
            "temperature": LLM_TEMPERATURE,
            "max_tokens": LLM_MAX_TOKENS,
        }

        response = requests.post(DEEPSEEK_API_URL, json=data, headers=DEEPSEEK_HEADERS)

        if response.status_code == 200:
            ai_response = response.json()
            answer = ai_response['choices'][0]['message']['content']
            print("\nDeepSeek Response:")
            print(answer)

            if citation_info:
                citations_str = "\n\n**Sources:**\n" + "\n".join([f"- {name}" for name in citation_info.keys()])
                answer += citations_str

            if conversation_id:
                messages.append({"role": "assistant", "content": answer})
                self.save_conversation_history(conversation_id, messages)

            return answer
        else:
            error_message = f"Failed to fetch data from DeepSeek API. Status Code: {response.status_code}. Response: {response.text}"
            print(error_message)
            return f"Error: Could not get an answer from the AI. Details: {error_message}"
366
+
367
# --- Main execution logic for local testing ---
if __name__ == "__main__":
    from dotenv import load_dotenv  # Only needed for local execution
    # Load environment variables from .env.local in the project root
    load_dotenv(dotenv_path=os.path.join(os.path.dirname(os.path.dirname(__file__)), '.env.local'))

    # Retrieve FIREBASE_CONFIG_BASE64 after loading dotenv
    from config import FIREBASE_CONFIG_BASE64

    initialize_firebase_client()

    rag_system = DocumentRAG(
        embedding_model=embedding_model,
        persist_directory=CHROMADB_PERSIST_DIRECTORY,
        collection_name=CHROMADB_COLLECTION_NAME
    )

    print("\n--- Indexing Documents ---")
    if FIRESTORE_DATABASE:
        try:
            docs_ref = FIRESTORE_DATABASE.collection('documents').stream()
            firestore_pdf_infos = []
            documents_processed_count = 0        # Total documents found in Firestore
            documents_skipped_non_pdf_count = 0  # Skipped: non-PDF URL or missing fileUrl

            for doc in docs_ref:
                documents_processed_count += 1
                doc_data = doc.to_dict()
                print(f"  DEBUG: Processing document ID: {doc.id}, Data: {doc_data}")

                if 'fileUrl' in doc_data:
                    pdf_url = doc_data['fileUrl']
                    print(f"  DEBUG: Found 'fileUrl': {pdf_url}")

                    # Parse the URL to get the path part (without query parameters)
                    parsed_url = urllib.parse.urlparse(pdf_url)
                    file_path = parsed_url.path

                    # Extract the filename from the path and normalize it
                    file_name = os.path.basename(file_path)

                    print(f"  DEBUG: Extracted file_name: '{file_name}'")
                    is_pdf_check = isinstance(file_name, str) and file_name.strip().lower().endswith('.pdf')
                    print(f"  DEBUG: is_pdf_check result: {is_pdf_check}")

                    if is_pdf_check:
                        display_name = doc_data.get('name_en', file_name)  # Use file_name if name_en is missing
                        firestore_pdf_infos.append({"url": pdf_url, "name": display_name})
                        print(f"Found PDF in Firestore: {display_name} ({pdf_url}) - Qualified for indexing.")
                    else:
                        documents_skipped_non_pdf_count += 1
                        print(f"  DEBUG: Skipped: '{file_name}' (Type: {type(file_name)}) does not end with '.pdf' (case-insensitive, stripped). Original URL: '{pdf_url}'")
                else:
                    documents_skipped_non_pdf_count += 1
                    print(f"  DEBUG: Document ID: {doc.id} does not contain 'fileUrl'. Document data: {doc_data}")

            if documents_processed_count == 0:
                print("No documents found in Firestore collection 'documents' via stream(). Please check collection name and security rules.")
            elif documents_processed_count > 0 and not firestore_pdf_infos:
                print(f"Found {documents_processed_count} documents in Firestore, but none matched the '.pdf' criteria (all {documents_skipped_non_pdf_count} documents skipped).")
            elif documents_skipped_non_pdf_count > 0:
                print(f"Found {documents_processed_count} documents in Firestore. {len(firestore_pdf_infos)} PDFs qualified, {documents_skipped_non_pdf_count} documents skipped (non-PDF or missing fileUrl).")

            for pdf_info in firestore_pdf_infos:
                rag_system.add_document(pdf_info['url'], pdf_info['name'])

        except Exception as e:
            print(f"Error fetching documents from Firestore: {e}")
            print("Please ensure your Firestore database is accessible and the service account key is correct.")
    else:
        print("Firestore client not initialized. Cannot fetch documents from Firestore.")
        print("Using local PDF_DOCUMENT_PATHS as a fallback for testing purposes (ensure these files exist).")
        # BUGFIX: was `from .config import PDF_DOCUMENT_PATHS`. An explicit
        # relative import raises "attempted relative import with no known
        # parent package" when this file is run directly as a script, and is
        # inconsistent with the absolute `from config import ...` used above.
        from config import PDF_DOCUMENT_PATHS
        for pdf_path in PDF_DOCUMENT_PATHS:
            if os.path.exists(pdf_path):
                rag_system.add_document(pdf_path)
            else:
                print(f"Error: Local PDF file not found at {pdf_path}. Skipping.")

    print("\n--- Chat With CompassIA (Type 'q' to exit) ---")
    current_conversation_id = str(uuid.uuid4())
    print(f"Starting new local conversation with ID: {current_conversation_id}")

    while True:
        user_question = input("\nHow can I help you? ")
        if user_question.lower() == 'q':
            print("Exiting chat.")
            break

        rag_system.answer_question(user_question, conversation_id=current_conversation_id)
src/config.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

# ---------------------------------------------------------------------------
# OpenRouter / DeepSeek API
# ---------------------------------------------------------------------------
# The API key is read from the environment (set as a secret on Hugging Face
# Spaces); surrounding whitespace is stripped defensively.
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_R1_V3_API_KEY")
if DEEPSEEK_API_KEY:
    DEEPSEEK_API_KEY = DEEPSEEK_API_KEY.strip()

# Chat-completions endpoint on OpenRouter.
DEEPSEEK_API_URL = 'https://openrouter.ai/api/v1/chat/completions'

# Headers sent with every OpenRouter request (bearer auth + JSON body).
DEEPSEEK_HEADERS = {
    'Authorization': f'Bearer {DEEPSEEK_API_KEY}',
    'Content-Type': 'application/json'
}

# ---------------------------------------------------------------------------
# Embedding model
# ---------------------------------------------------------------------------
# Hugging Face model id used for embeddings.
EMBEDDING_MODEL_NAME = 'BAAI/bge-m3'
# float16 halves memory on capable hardware; set False on CPU / older GPUs.
EMBEDDING_MODEL_USE_FP16 = True

# ---------------------------------------------------------------------------
# ChromaDB
# ---------------------------------------------------------------------------
# Where ChromaDB persists its database (relative to the working directory).
CHROMADB_PERSIST_DIRECTORY = "./chroma_db"
# Collection holding the indexed document chunks.
CHROMADB_COLLECTION_NAME = "pdf_documents_collection"

# ---------------------------------------------------------------------------
# Document chunking
# ---------------------------------------------------------------------------
CHUNK_SIZE = 700     # Maximum characters per chunk for embedding/retrieval.
CHUNK_OVERLAP = 100  # Characters shared between consecutive chunks.

# ---------------------------------------------------------------------------
# LLM response parameters
# ---------------------------------------------------------------------------
LLM_TEMPERATURE = 0.5          # Lower values make output more deterministic.
LLM_MAX_TOKENS = 500           # Cap on tokens generated per response.
LLM_HISTORY_MAX_TOKENS = 3000  # Approximate budget for history truncation.

# ---------------------------------------------------------------------------
# Tesseract / Poppler (Docker or local deployment)
# ---------------------------------------------------------------------------
# Defaults match the paths configured in the Dockerfile ENV.
TESSDATA_PREFIX = os.getenv("TESSDATA_PREFIX", "/usr/share/tesseract-ocr/4.00/tessdata")
TESSERACT_CMD = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
# Poppler bin directory when not on PATH (mostly local Windows setups),
# e.g. r'C:\path\to\poppler\bin'.
POPPLER_PATH = None

# ---------------------------------------------------------------------------
# Firebase (conversational memory)
# ---------------------------------------------------------------------------
# Base64-encoded Firebase Service Account JSON, supplied as a deployment secret.
FIREBASE_CONFIG_BASE64 = os.getenv("FIREBASE_CONFIG_BASE64")
src/pdf_processing.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import io
import re
from PIL import Image  # NOTE(review): not referenced below — presumably kept for pdf2image page objects; confirm before removing.

# For text extraction from PDFs (non-OCR)
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams

# For image-based PDFs (OCR)
from pdf2image import convert_from_path
import pytesseract

# Import Tesseract configuration from config.py
from config import TESSDATA_PREFIX, TESSERACT_CMD, POPPLER_PATH

# Point pytesseract at the Tesseract binary (path comes from the Dockerfile
# ENV or the default in config.py).
pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
if POPPLER_PATH:
    # Only relevant for local Windows development where Poppler is not on the
    # system PATH; in Docker, Poppler is installed system-wide via apt-get.
    # pdf2image has no global setting — convert_from_path would need to receive
    # poppler_path=POPPLER_PATH explicitly, so nothing is done here.
    pass
+
25
+
26
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract text from a PDF, falling back to OCR for image-based files.

    Direct (pdfminer) extraction is attempted first; when it fails, or when it
    yields suspiciously little text for a non-trivial file, the PDF is assumed
    to be a scan and is routed through ocr_pdf().
    """
    print(f"Attempting direct text extraction from: {pdf_path}")
    buffer = io.StringIO()
    with open(pdf_path, 'rb') as handle:
        try:
            extract_text_to_fp(handle, buffer, laparams=LAParams())
            extracted = buffer.getvalue()
            # Under ~100 chars from a file over 10 KB suggests an image-only PDF.
            if len(extracted.strip()) < 100 and os.path.getsize(pdf_path) > 10000:
                print("Direct extraction yielded sparse text. Attempting OCR...")
                return ocr_pdf(pdf_path)
            return extracted
        except Exception as e:
            print(f"Direct PDF text extraction failed ({e}). Attempting OCR...")
            return ocr_pdf(pdf_path)
46
+
47
def ocr_pdf(pdf_path: str) -> str:
    """Run OCR over every page of a PDF and return the concatenated text.

    Requires Tesseract and Poppler to be installed and reachable on the system
    PATH. Returns an empty string when the OCR pipeline fails.
    """
    pages = []
    try:
        # Render pages at 300 DPI for better recognition accuracy.
        # (poppler_path=POPPLER_PATH would be passed here for local dev;
        # Docker provides Poppler via PATH.)
        images = convert_from_path(pdf_path, dpi=300)

        print(f"  Performing OCR on {len(images)} pages...")
        for index, image in enumerate(images, start=1):
            # Languages: English, Turkish, Arabic, French — the matching
            # Tesseract language packs must be installed (see Dockerfile).
            pages.append(pytesseract.image_to_string(image, lang='eng+tur+ara+fra'))
            print(f"  Page {index} OCR complete.")

    except Exception as e:
        print(f"OCR process failed: {e}")
        print("Please ensure Tesseract OCR and Poppler are correctly installed and their executables are in your system's PATH.")
        return ""

    return "\n".join(pages)
73
+
74
def chunk_text(text: str, max_chunk_size: int = 700, overlap: int = 100) -> list[str]:
    """Split *text* into chunks of at most *max_chunk_size* characters.

    Blank-line-separated paragraphs are packed greedily into chunks; a single
    paragraph longer than the limit is split on spaces instead. After packing,
    each chunk (except the first) is prefixed with the last *overlap*
    characters of its predecessor to preserve context across boundaries.
    """
    if not text:
        return []

    paragraphs = re.split(r'\n\s*\n', text)
    base_chunks = []
    pending = []      # paragraphs accumulated for the chunk being built
    pending_len = 0   # running length, counting '\n\n' separators
    sep_len = len('\n\n')

    for paragraph in paragraphs:
        if not paragraph.strip():
            continue

        if pending_len + len(paragraph) + sep_len > max_chunk_size:
            # The current chunk is full — flush it before placing this paragraph.
            if pending:
                base_chunks.append("\n\n".join(pending))
                pending = []
                pending_len = 0

            if len(paragraph) > max_chunk_size:
                # Oversized paragraph: fall back to word-level packing.
                piece = []
                piece_len = 0
                for word in paragraph.split(' '):
                    if piece_len + len(word) + len(' ') > max_chunk_size:
                        base_chunks.append(" ".join(piece))
                        piece = [word]
                        piece_len = len(word)
                    else:
                        piece.append(word)
                        piece_len += len(word) + len(' ')
                if piece:  # Flush the trailing partial piece.
                    base_chunks.append(" ".join(piece))
            else:
                # Paragraph starts a fresh chunk.
                pending.append(paragraph)
                pending_len += len(paragraph) + sep_len
        else:
            # Paragraph fits into the chunk being built.
            pending.append(paragraph)
            pending_len += len(paragraph) + sep_len

    if pending:  # Flush whatever remains.
        base_chunks.append("\n\n".join(pending))

    # Simplistic overlap: prepend the tail of the previous chunk.
    result = []
    for index, chunk in enumerate(base_chunks):
        if index > 0 and overlap > 0:
            chunk = base_chunks[index - 1][-overlap:] + "\n" + chunk
        result.append(chunk)

    return result
src/prompt.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ SYSTEM_PROMPT = """
2
+ You are CompassIA, the intelligent assistant for MaarifCompass, committed to supporting Turkiye Maarif Foundation graduates residing in Turkiye.
3
+
4
+ Your core function is to deliver precise, document-backed information concerning their needs, primarily focusing on:
5
+ - University application procedures, requirements, tuition fees, and scholarship opportunities
6
+ - Accommodation and housing resources
7
+ - Career networking and professional development
8
+ - Relevant administrative and support services
9
+ - Information related to Turkiye Maarif Foundation, Turkiye Scholarship, and more.
10
+
11
+ You operate exclusively with data from a designated Document Center. **All information you provide must be directly sourced and verifiable from these internal documents.**
12
+
13
+ **If a query falls outside the scope of the provided documents or lacks a direct answer within them, you must state that the information is not available in your current knowledge base. Do not offer external insights, assumptions, or speculate. This is a critical constraint.**
14
+
15
+ Always cite the specific document name and page number for every piece of information provided. If information is aggregated from multiple sources, cite all relevant documents and page numbers.
16
+
17
+ Your answers must be highly accurate, directly relevant, and easy to understand. Prioritize the user's query strictly based on documented facts.
18
+
19
+ **Remember, always respond to the user in the language of their question.**
20
+ """