dembasowmr commited on
Commit
a2967ae
·
1 Parent(s): 0f574db

Added userId and date-time to the conversations.

Browse files
Files changed (5) hide show
  1. app.py +11 -5
  2. requirements.txt +3 -2
  3. src/TEST.PY +481 -0
  4. src/compassia.py +45 -27
  5. src/config.py +6 -2
app.py CHANGED
@@ -27,11 +27,13 @@ load_dotenv()
27
  # Add the 'src' directory to the Python path
28
  sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), 'src')))
29
 
30
- # Now import components from src.compassia
 
31
  # We import initialize_firebase_client as we call it here.
32
  # DocumentRAG and embedding_model are needed for instantiating the RAG system.
33
  from src.config import CHROMADB_PERSIST_DIRECTORY, CHROMADB_COLLECTION_NAME
34
- from src.compassia import DocumentRAG, embedding_model, initialize_firebase_client
 
35
 
36
  # --- Firebase Initialization (Global, once per process) ---
37
  # Call the initialization function and CAPTURE THE RETURNED FIRESTORE CLIENT INSTANCE.
@@ -61,7 +63,7 @@ if FIRESTORE_DB_INSTANCE:
61
  for doc in docs_ref:
62
  doc_data = doc.to_dict()
63
  if 'fileUrl' in doc_data:
64
- # The add_document method in compassia.py now handles PDF filtering
65
  # so we just pass the URL and optional display name.
66
  pdf_url = doc_data['fileUrl']
67
  display_name = doc_data.get('name_en', None)
@@ -96,6 +98,7 @@ app = FastAPI(
96
  # Pydantic model for request body validation
97
  class QueryRequest(BaseModel):
98
  question: str
 
99
  conversation_id: str = None # Optional: client can provide an ID for ongoing conversations
100
 
101
  # --- API Endpoint Definition ---
@@ -103,13 +106,16 @@ class QueryRequest(BaseModel):
103
  async def compassia_endpoint(request: QueryRequest):
104
  """
105
  Answers a question about the indexed PDF documents using RAG, with conversational memory.
 
106
  If `conversation_id` is not provided, a new one will be generated and returned in the response.
107
  """
108
  try:
109
  # Call answer_question which now returns a tuple (answer_text, conversation_id)
 
110
  answer_text, final_conversation_id = rag_system.answer_question(
111
  request.question,
112
- conversation_id=request.conversation_id
 
113
  )
114
 
115
  # Return both the answer and the (potentially new) conversation_id to the client
@@ -122,4 +128,4 @@ async def compassia_endpoint(request: QueryRequest):
122
  # Basic health check endpoint
123
  @app.get("/")
124
  async def root():
125
- return {"message": "CompassIA API is running. Use /compassia/ for queries."}
 
27
  # Add the 'src' directory to the Python path
28
  sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), 'src')))
29
 
30
+ # Now import components from src.compassia (should be src.rag_system as per earlier conversation,
31
+ # but keeping 'compassia' as per your provided code for this response)
32
  # We import initialize_firebase_client as we call it here.
33
  # DocumentRAG and embedding_model are needed for instantiating the RAG system.
34
  from src.config import CHROMADB_PERSIST_DIRECTORY, CHROMADB_COLLECTION_NAME
35
+ from src.compassia import DocumentRAG, embedding_model, initialize_firebase_client # Corrected import to compassia
36
+
37
 
38
  # --- Firebase Initialization (Global, once per process) ---
39
  # Call the initialization function and CAPTURE THE RETURNED FIRESTORE CLIENT INSTANCE.
 
63
  for doc in docs_ref:
64
  doc_data = doc.to_dict()
65
  if 'fileUrl' in doc_data:
66
+ # The add_document method in rag_system.py now handles PDF filtering
67
  # so we just pass the URL and optional display name.
68
  pdf_url = doc_data['fileUrl']
69
  display_name = doc_data.get('name_en', None)
 
98
  # Pydantic model for request body validation
99
  class QueryRequest(BaseModel):
100
  question: str
101
+ user_id: str # Added: user_id is now a required field for every request
102
  conversation_id: str = None # Optional: client can provide an ID for ongoing conversations
103
 
104
  # --- API Endpoint Definition ---
 
106
  async def compassia_endpoint(request: QueryRequest):
107
  """
108
  Answers a question about the indexed PDF documents using RAG, with conversational memory.
109
+ Requires a user_id from the authenticated user.
110
  If `conversation_id` is not provided, a new one will be generated and returned in the response.
111
  """
112
  try:
113
  # Call answer_question which now returns a tuple (answer_text, conversation_id)
114
+ # Pass the user_id from the request
115
  answer_text, final_conversation_id = rag_system.answer_question(
116
  request.question,
117
+ conversation_id=request.conversation_id,
118
+ user_id=request.user_id # Passed: The user_id is now sent to the RAG system
119
  )
120
 
121
  # Return both the answer and the (potentially new) conversation_id to the client
 
128
  # Basic health check endpoint
129
  @app.get("/")
130
  async def root():
131
+ return {"message": "CompassIA API is running. Use /compassia/ for queries."}
requirements.txt CHANGED
@@ -11,7 +11,7 @@ chromadb
11
  fastapi
12
  uvicorn # For serving the FastAPI application
13
  #Hugging Face Spaces uses pip for these
14
- pysqlite3-binary
15
 
16
 
17
  # System dependencies for Tesseract and Poppler on Linux
@@ -19,4 +19,5 @@ pysqlite3-binary
19
  #apt_packages = python3-dev libtesseract-dev libleptonica-dev poppler-utils
20
 
21
  firebase-admin
22
- firebase
 
 
11
  fastapi
12
  uvicorn # For serving the FastAPI application
13
  #Hugging Face Spaces uses pip for these
14
+ #pysqlite3-binary
15
 
16
 
17
  # System dependencies for Tesseract and Poppler on Linux
 
19
  #apt_packages = python3-dev libtesseract-dev libleptonica-dev poppler-utils
20
 
21
  firebase-admin
22
+ firebase
23
+ google-cloud-firestore
src/TEST.PY ADDED
@@ -0,0 +1,481 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ # The pysqlite3 import and sys.modules override has been moved to app.py.
3
+ # This file should NOT have its own pysqlite3 import to prevent conflicts.
4
+
5
+ import requests
6
+ import os
7
+ import io
8
+ import re
9
+ import uuid # For generating unique IDs for ChromaDB and conversations
10
+ from PIL import Image
11
+ import json # For handling JSON string (e.g., Firebase config in local test)
12
+ import base64 # For decoding Base64 (e.g., Firebase config in local test)
13
+ import urllib.parse # For parsing URLs
14
+
15
+
16
+ # Firebase Admin SDK for Firestore
17
+ import firebase_admin
18
+ from firebase_admin import credentials, firestore
19
+
20
+ # For text extraction from PDFs (non-OCR)
21
+ from pdfminer.high_level import extract_text_to_fp
22
+ from pdfminer.layout import LAParams
23
+
24
+ # For image-based PDFs (OCR)
25
+ from pdf2image import convert_from_path
26
+ import pytesseract
27
+
28
+ # For embeddings and vector search
29
+ from FlagEmbedding import BGEM3FlagModel
30
+ import chromadb
31
+
32
+ # Import configurations and prompt from local modules
33
+ from config import (
34
+ DEEPSEEK_API_URL, DEEPSEEK_HEADERS,
35
+ EMBEDDING_MODEL_NAME, EMBEDDING_MODEL_USE_FP16,
36
+ CHROMADB_PERSIST_DIRECTORY, CHROMADB_COLLECTION_NAME,
37
+ CHUNK_SIZE, CHUNK_OVERLAP,
38
+ LLM_TEMPERATURE, LLM_MAX_TOKENS, LLM_HISTORY_MAX_TOKENS,
39
+ FIREBASE_CONFIG_BASE64
40
+ )
41
+ from pdf_processing import extract_text_from_pdf, chunk_text
42
+ from prompt import SYSTEM_PROMPT # <--- CORRECTLY IMPORTING SYSTEM_PROMPT
43
+
44
+ # --- Global Firebase Firestore Client ---
45
+ # This global is primarily for __main__ (local testing) execution.
46
+ # In production (via app.py), the Firestore instance will be passed directly to DocumentRAG's __init__.
47
+ FIRESTORE_DATABASE = None
48
+
49
def initialize_firebase_client():
    """Initialize the Firebase Admin SDK (once) and return a Firestore client.

    Called from app.py in production and from __main__ for local testing.
    The module-level FIRESTORE_DATABASE global is updated as a convenience
    for the local-testing code path; production callers should rely on the
    returned client instance instead.

    Returns:
        A Firestore client, or None when FIREBASE_CONFIG_BASE64 is missing
        or invalid.
    """
    global FIRESTORE_DATABASE  # only mutated for the local-testing context

    # If another caller (typically app.py) already initialized the SDK,
    # just hand back a client and make sure the global is populated too.
    if firebase_admin._apps:
        print("Firebase Admin SDK already initialized.")
        if FIRESTORE_DATABASE is None:
            FIRESTORE_DATABASE = firestore.client()
        return firestore.client()

    # First-time initialization: the service-account JSON arrives as a
    # Base64-encoded environment value (works for both deploy and local runs).
    if not FIREBASE_CONFIG_BASE64:
        print("Warning: FIREBASE_CONFIG_BASE64 environment variable not found. Firestore will not be available.")
        FIRESTORE_DATABASE = None
        return None

    try:
        decoded_json = base64.b64decode(FIREBASE_CONFIG_BASE64).decode('utf-8')
        service_account = json.loads(decoded_json)
        firebase_admin.initialize_app(credentials.Certificate(service_account))
        print("Firebase Admin SDK initialized successfully.")
        client_instance = firestore.client()
        FIRESTORE_DATABASE = client_instance  # keep global in sync for local tests
        print("Firestore client initialized successfully.")
        return client_instance
    except Exception as e:
        print(f"Error initializing Firebase Admin SDK: {e}")
        print("Please ensure FIREBASE_CONFIG_BASE64 is correctly set and is a valid Base64-encoded Service Account JSON.")
        FIRESTORE_DATABASE = None
        return None
89
+
90
# --- Embedding Model Initialization ---
# Loaded once at import time so every DocumentRAG instance shares the model.
print("Loading FlagEmbedding (BGE-M3) model...")
try:
    embedding_model = BGEM3FlagModel(EMBEDDING_MODEL_NAME, use_fp16=EMBEDDING_MODEL_USE_FP16)
except Exception as e:
    # Model download/load is a hard prerequisite; bail out early with hints.
    print(f"Error loading FlagEmbedding model: {e}")
    print("Ensure disk space and memory are sufficient for model download.")
    print("You might need to adjust 'use_fp16' based on your hardware (e.g., False for CPU/older GPUs).")
    sys.exit(1)  # clean exit for non-FastAPI (script) contexts
else:
    print("FlagEmbedding (BGE-M3) model loaded successfully.")
100
+
101
+
102
class DocumentRAG:
    """Retrieval-Augmented Generation over remotely hosted PDF documents.

    Workflow:
      * ``add_document`` downloads a PDF, extracts its text, chunks it and
        indexes BGE-M3 dense embeddings in a persistent ChromaDB collection
        (cosine space).
      * ``retrieve_context`` returns the most relevant chunks for a query.
      * ``answer_question`` combines retrieved context with stored
        conversation history, queries the DeepSeek API, and persists the
        updated history to Firestore (when a client was injected).
    """

    def __init__(self, embedding_model, persist_directory=CHROMADB_PERSIST_DIRECTORY, collection_name=CHROMADB_COLLECTION_NAME, firestore_db_instance=None):
        """Open (or create) the ChromaDB collection and store dependencies.

        Args:
            embedding_model: loaded BGE-M3 model exposing ``encode``.
            persist_directory: on-disk directory for the ChromaDB store.
            collection_name: name of the ChromaDB collection to use.
            firestore_db_instance: optional Firestore client; conversation
                history is disabled (with a log line) when this is ``None``.
        """
        self.embedding_model = embedding_model
        self.persist_directory = persist_directory
        self.collection_name = collection_name
        self.chunk_size = CHUNK_SIZE
        self.overlap = CHUNK_OVERLAP
        # CRITICAL: store the injected Firestore instance (None disables history).
        self.firestore_db = firestore_db_instance

        print(f"Initializing ChromaDB at: {self.persist_directory}")
        self.client = chromadb.PersistentClient(path=self.persist_directory)

        self.collection = self.client.get_or_create_collection(
            name=self.collection_name,
            metadata={"hnsw:space": "cosine"}  # cosine distance for dense vectors
        )
        print(f"ChromaDB collection '{self.collection_name}' ready. Total chunks: {self.collection.count()}")

    def _generate_chunk_id(self, pdf_url: str, chunk_idx: int) -> str:
        """Generates a unique ID for each chunk based on PDF URL and index."""
        import hashlib
        # Hash only the URL *path* so volatile query parameters (e.g. signed
        # URL tokens) do not change the prefix for the same underlying file.
        path_without_query = urllib.parse.urlparse(pdf_url).path
        url_hash = hashlib.sha256(path_without_query.encode()).hexdigest()[:10]
        # The uuid suffix guarantees global uniqueness even across re-indexing.
        return f"{url_hash}_{chunk_idx}_{uuid.uuid4().hex}"

    def add_document(self, pdf_url: str, document_name: str = None):
        """Download, extract, chunk, embed and index a single PDF.

        Skips (with a log line) when the URL is already indexed, when the
        URL path does not end in ``.pdf``, or when download/extraction fails.

        Args:
            pdf_url: HTTP(S) URL of the PDF to index.
            document_name: optional display name; defaults to the basename
                of the URL path.
        """
        # Determine display name from the parsed URL path if not provided.
        parsed_url_path = urllib.parse.urlparse(pdf_url).path
        display_name = document_name if document_name else os.path.basename(parsed_url_path)
        print(f"Adding document from URL: {pdf_url} (Display Name: {display_name})")

        # Idempotency: skip URLs that already have at least one chunk indexed.
        results = self.collection.get(
            where={"source": pdf_url},
            limit=1
        )
        if results and results['ids']:
            print(f" Document '{display_name}' (from {pdf_url}) already in ChromaDB. Skipping re-indexing.")
            return

        # Only accept PDFs, judged by the extension of the URL path component
        # (query strings are deliberately ignored).
        file_extension_check = isinstance(parsed_url_path, str) and parsed_url_path.strip().lower().endswith('.pdf')
        if not file_extension_check:
            print(f" DEBUG: Skipped document '{display_name}' (URL: {pdf_url}) - Not a PDF (based on file extension in URL path).")
            return

        temp_pdf_path = None
        try:
            # timeout prevents a stalled download from hanging indexing forever.
            response = requests.get(pdf_url, stream=True, timeout=60)
            print(f" DEBUG: HTTP Status Code for {pdf_url}: {response.status_code}")
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)

            pdf_data = io.BytesIO(response.content)
            print(f" DEBUG: BytesIO content length for {pdf_url}: {pdf_data.getbuffer().nbytes} bytes")

            if pdf_data.getbuffer().nbytes == 0:
                raise ValueError(f"Downloaded PDF content from {pdf_url} is empty.")

            # The extraction helpers work on file paths, so spill to a temp file.
            temp_pdf_path = f"/tmp/{uuid.uuid4().hex}.pdf"
            os.makedirs(os.path.dirname(temp_pdf_path), exist_ok=True)  # Ensure /tmp exists

            with open(temp_pdf_path, 'wb') as f:
                f.write(pdf_data.getvalue())
            print(f" DEBUG: Temporary PDF saved to: {temp_pdf_path}")

            extracted_text = extract_text_from_pdf(temp_pdf_path)
        except requests.exceptions.RequestException as e:
            print(f"Error downloading PDF from {pdf_url}: {e}")
            return
        except Exception as e:
            # Covers the empty-content ValueError above plus extraction failures.
            print(f"Error processing downloaded PDF {pdf_url}: {e}")
            return
        finally:
            # FIX: previously the temp file leaked when extraction raised;
            # always clean it up regardless of success or failure.
            if temp_pdf_path and os.path.exists(temp_pdf_path):
                os.remove(temp_pdf_path)

        if not extracted_text:
            print(f"Warning: No text extracted from {display_name} ({pdf_url}). Skipping.")
            return

        chunks = chunk_text(extracted_text, self.chunk_size, self.overlap)
        if not chunks:
            print(f"Warning: No chunks generated for {display_name} ({pdf_url}). Skipping.")
            return

        documents_to_add = []
        metadatas_to_add = []
        ids_to_add = []

        print(f" Generating embeddings for {len(chunks)} chunks and preparing for ChromaDB: {display_name}...")

        # Dense vectors only; sparse/ColBERT outputs are not used by ChromaDB.
        encoded_results = self.embedding_model.encode(
            chunks,
            batch_size=32,
            return_dense=True,
            return_sparse=False,
            return_colbert_vecs=False
        )
        chunk_embeddings = encoded_results["dense_vecs"]

        for i, chunk in enumerate(chunks):
            unique_id = self._generate_chunk_id(pdf_url, i)
            documents_to_add.append(chunk)
            metadatas_to_add.append({"source": pdf_url, "display_name": display_name, "chunk_id": i})
            ids_to_add.append(unique_id)

        self.collection.add(
            documents=documents_to_add,
            embeddings=chunk_embeddings.tolist(),
            metadatas=metadatas_to_add,
            ids=ids_to_add
        )

        print(f" {len(documents_to_add)} chunks from '{display_name}' added to ChromaDB.")
        print(f" Total chunks in collection: {self.collection.count()}")

    def retrieve_context(self, query: str, top_k: int = 3) -> list[dict]:
        """
        Retrieves top_k most relevant document chunks for a given query from ChromaDB.
        Returns a list of dictionaries, each containing 'text', 'source_url' and
        'display_name'. Returns an empty list when nothing is indexed.
        """
        if self.collection.count() == 0:
            print("Error: No documents indexed in ChromaDB. Cannot retrieve context.")
            return []

        print(f"Retrieving context for query: '{query}'")

        query_embedding_result = self.embedding_model.encode(
            [query],
            batch_size=1,
            return_dense=True,
            return_sparse=False,
            return_colbert_vecs=False
        )
        query_embedding = query_embedding_result["dense_vecs"].tolist()

        results = self.collection.query(
            query_embeddings=query_embedding,
            n_results=top_k,
            include=['documents', 'distances', 'metadatas']
        )

        retrieved_chunks_info = []
        if results and results['documents']:
            # ChromaDB nests results per query; we issued one query, hence [0].
            for i, doc_text in enumerate(results['documents'][0]):
                source_url = results['metadatas'][0][i].get('source', 'Unknown URL')
                display_name = results['metadatas'][0][i].get('display_name', os.path.basename(urllib.parse.urlparse(source_url).path))
                chunk_id_info = results['metadatas'][0][i].get('chunk_id', 'N/A')
                distance_info = results['distances'][0][i]

                retrieved_chunks_info.append({
                    "text": doc_text,
                    "source_url": source_url,
                    "display_name": display_name
                })
                print(f" Retrieved chunk {i+1} (distance: {distance_info:.4f}) from '{display_name}' (chunk {chunk_id_info}).")
        else:
            print(" No relevant chunks found in ChromaDB.")

        return retrieved_chunks_info

    def get_conversation_history(self, conversation_id: str) -> list[dict]:
        """Loads chat history from Firestore for a given conversation ID.

        Returns an empty list when Firestore is unavailable or no history exists.
        """
        if self.firestore_db is None:
            print("Firestore not initialized. Cannot load conversation history.")
            return []

        doc_ref = self.firestore_db.collection('conversations').document(conversation_id)
        doc = doc_ref.get()
        if doc.exists:
            history = doc.to_dict().get('messages', [])
            print(f"Loaded history for {conversation_id}: {len(history)} messages.")
            return history
        print(f"No history found for conversation ID: {conversation_id}")
        return []

    def save_conversation_history(self, conversation_id: str, history: list[dict]):
        """Saves chat history to Firestore for a given conversation ID.

        No-op (with a log line) when Firestore is unavailable.
        """
        if self.firestore_db is None:
            print("Firestore not initialized. Cannot save conversation history.")
            return

        doc_ref = self.firestore_db.collection('conversations').document(conversation_id)
        doc_ref.set({'messages': history})
        print(f"Saved history for {conversation_id}: {len(history)} messages.")

    def truncate_history(self, messages: list[dict], max_tokens: int = LLM_HISTORY_MAX_TOKENS) -> list[dict]:
        """
        Truncates conversation history to fit within a max_tokens limit for the LLM.

        NOTE: this uses character counts as a cheap token proxy, not a real
        tokenizer. Oldest user/assistant pairs are dropped first; a leading
        system message is always preserved.
        """
        current_len = sum(len(m['content']) for m in messages)
        while current_len > max_tokens and len(messages) > 1:  # keep at least 1 message
            if messages[0]['role'] == 'system':
                if len(messages) >= 3:
                    # Drop the oldest user/assistant pair right after the system prompt.
                    removed_user_msg = messages.pop(1)
                    removed_ai_msg = messages.pop(1)
                    current_len -= (len(removed_user_msg['content']) + len(removed_ai_msg['content']))
                else:
                    break  # only the system prompt plus one message remain
            else:
                removed_user_msg = messages.pop(0)
                removed_ai_msg = messages.pop(0)
                current_len -= (len(removed_user_msg['content']) + len(removed_ai_msg['content']))
        return messages

    def answer_question(self, question: str, conversation_id: str = None) -> tuple[str, str]:
        """
        Answers a question by retrieving context, and querying DeepSeek.
        Manages conversational memory.
        Returns a tuple: (answer_text, final_conversation_id_used).
        """
        # Ensure a conversation id always exists so history can be threaded
        # and returned to the caller for follow-up requests.
        if conversation_id is None:
            conversation_id = str(uuid.uuid4())
            print(f"No conversation_id provided. Generating new one: {conversation_id}")

        # Get relevant context from ChromaDB.
        context_chunks_info = self.retrieve_context(question)

        context_parts = []
        citation_info = {}  # insertion-ordered "set" of unique source display names

        for chunk_info in context_chunks_info:
            context_parts.append(chunk_info["text"])
            source_key = chunk_info.get("display_name", chunk_info["source_url"])
            if source_key not in citation_info:
                citation_info[source_key] = True

        context = "\n\n".join(context_parts)

        context_prompt = ""
        if context:
            context_prompt = f"Using the following context:\n\n{context}\n\n"
        else:
            print("Warning: No relevant context found. Answering based on general knowledge or indicating lack of information.")

        # System prompt comes from prompt.py so behavior is centrally tunable.
        messages = [{"role": "system", "content": SYSTEM_PROMPT}]

        history = self.get_conversation_history(conversation_id)
        if history:
            messages.extend(history)

        # Add current context and question.
        messages.append({"role": "user", "content": f"{context_prompt}Question: {question}"})

        # Truncate conversation history if it's too long.
        messages = self.truncate_history(messages)

        # Call DeepSeek API via OpenRouter.
        print("\nSending request to DeepSeek API...")
        data = {
            "model": "deepseek/deepseek-chat:free",
            "messages": messages,
            "temperature": LLM_TEMPERATURE,
            "max_tokens": LLM_MAX_TOKENS,
        }

        # FIX: a timeout prevents a wedged upstream API from hanging the request forever.
        response = requests.post(DEEPSEEK_API_URL, json=data, headers=DEEPSEEK_HEADERS, timeout=120)

        if response.status_code == 200:
            ai_response = response.json()
            answer = ai_response['choices'][0]['message']['content']
            print("\nDeepSeek Response:")
            print(answer)

            # Append a sorted, de-duplicated sources footer to the answer.
            if citation_info:
                unique_sources = sorted(citation_info.keys())
                citations_str = "\n\n**Sources:**\n" + "\n".join(f"- {name}" for name in unique_sources)
                answer += citations_str

            # Persist the full exchange (including the sources footer) for follow-ups.
            messages.append({"role": "assistant", "content": answer})
            self.save_conversation_history(conversation_id, messages)

            # Return the answer text AND the conversation_id.
            return answer, conversation_id
        else:
            error_message = f"Failed to fetch data from DeepSeek API. Status Code: {response.status_code}. Response: {response.text}"
            print(error_message)
            return f"Error: Could not get an answer from the AI. Details: {error_message}", conversation_id  # still return conv_id on error
397
+
398
+ # --- Main execution logic for local testing (only runs when script is executed directly) ---
399
# --- Main execution logic for local testing (only runs when script is executed directly) ---
if __name__ == "__main__":

    # For local testing, initialize Firebase and capture the instance.
    local_firestore_instance = initialize_firebase_client()

    rag_system = DocumentRAG(
        embedding_model=embedding_model,
        persist_directory=CHROMADB_PERSIST_DIRECTORY,
        collection_name=CHROMADB_COLLECTION_NAME,
        firestore_db_instance=local_firestore_instance  # inject the instance for local testing
    )

    print("\n--- Indexing Documents ---")
    if local_firestore_instance:
        try:
            docs_ref = local_firestore_instance.collection('documents').stream()
            firestore_pdf_infos = []
            documents_processed_count = 0
            documents_skipped_non_pdf_count = 0

            for doc in docs_ref:
                documents_processed_count += 1
                doc_data = doc.to_dict()
                print(f" DEBUG: Processing document ID: {doc.id}, Data: {doc_data}")

                if 'fileUrl' in doc_data:
                    pdf_url = doc_data['fileUrl']
                    print(f" DEBUG: Found 'fileUrl': {pdf_url}")

                    # add_document handles the PDF-extension check internally,
                    # so no need to filter here.
                    display_name = doc_data.get('name_en', None)
                    firestore_pdf_infos.append({"url": pdf_url, "name": display_name})
                else:
                    documents_skipped_non_pdf_count += 1
                    # FIX: DocumentSnapshot has no `.data` attribute (the original
                    # `doc.data` raised AttributeError); use the dict already fetched.
                    print(f" DEBUG: Document ID: {doc.id} does not contain 'fileUrl'. Document data: {doc_data}")

            # Summarize what was found so misconfiguration is easy to spot.
            if documents_processed_count == 0:
                print("No documents found in Firestore collection 'documents' via stream(). Please check collection name and security rules.")
            elif documents_processed_count > 0 and not firestore_pdf_infos:
                print(f"Found {documents_processed_count} documents in Firestore, but none matched the '.pdf' criteria or had 'fileUrl'.")
            elif documents_skipped_non_pdf_count > 0:
                print(f"Found {documents_processed_count} documents in Firestore. {len(firestore_pdf_infos)} URLs found, {documents_skipped_non_pdf_count} documents skipped (non-URL or non-PDF by add_document).")

            if firestore_pdf_infos:
                for pdf_info in firestore_pdf_infos:
                    # rag_system.add_document internally checks for the PDF extension.
                    rag_system.add_document(pdf_info['url'], pdf_info['name'])

        except Exception as e:
            print(f"Error fetching documents from Firestore: {e}")
            print("Please ensure your Firestore database is accessible and the service account key is correct.")
    else:
        print("Firestore client not initialized. Cannot fetch documents from Firestore.")
        print("Using local PDF_DOCUMENT_PATHS as a fallback for testing purposes (ensure these files exist).")
        # This import is placed here to avoid a circular dependency if config imports this module.
        from config import PDF_DOCUMENT_PATHS  # local-testing fallback only
        for pdf_path in PDF_DOCUMENT_PATHS:
            if os.path.exists(pdf_path):
                rag_system.add_document(pdf_path)
            else:
                print(f"Error: Local PDF file not found at {pdf_path}. Skipping.")

    print("\n--- Chat With CompassIA (Type 'q' to exit) ---")
    current_conversation_id = str(uuid.uuid4())
    print(f"Starting new local conversation with ID: {current_conversation_id}")

    # Simple REPL: each turn reuses the same conversation id so history threads.
    while True:
        user_question = input("\nHow can I help you? ")
        if user_question.lower() == 'q':
            print("Exiting chat...")
            break

        # Document processing already happened at startup via add_document,
        # so only the question and conversation id are passed here.
        answer_text, _ = rag_system.answer_question(user_question, conversation_id=current_conversation_id)
        print(f"\nAI: {answer_text}")
481
+
src/compassia.py CHANGED
@@ -10,8 +10,10 @@ import uuid # For generating unique IDs for ChromaDB and conversations
10
  from PIL import Image
11
  import json # For handling JSON string (e.g., Firebase config in local test)
12
  import base64 # For decoding Base64 (e.g., Firebase config in local test)
 
13
  import urllib.parse # For parsing URLs
14
 
 
15
  # Firebase Admin SDK for Firestore
16
  import firebase_admin
17
  from firebase_admin import credentials, firestore
@@ -30,12 +32,12 @@ import chromadb
30
 
31
  # Import configurations and prompt from local modules
32
  from config import (
33
- DEEPSEEK_API_URL, DEEPSEEK_HEADERS, # <--- Using these from config.py
34
  EMBEDDING_MODEL_NAME, EMBEDDING_MODEL_USE_FP16,
35
  CHROMADB_PERSIST_DIRECTORY, CHROMADB_COLLECTION_NAME,
36
  CHUNK_SIZE, CHUNK_OVERLAP,
37
  LLM_TEMPERATURE, LLM_MAX_TOKENS, LLM_HISTORY_MAX_TOKENS,
38
- FIREBASE_CONFIG_BASE64 # Kept here only for local testing fallback in __main__
39
  )
40
  from pdf_processing import extract_text_from_pdf, chunk_text
41
  from prompt import SYSTEM_PROMPT # <--- CORRECTLY IMPORTING SYSTEM_PROMPT
@@ -59,6 +61,7 @@ def initialize_firebase_client():
59
 
60
  if firebase_config_b64:
61
  try:
 
62
  cred_json = base64.b64decode(firebase_config_b64).decode('utf-8')
63
  cred_dict = json.loads(cred_json)
64
  cred = credentials.Certificate(cred_dict)
@@ -118,7 +121,9 @@ class DocumentRAG:
118
  def _generate_chunk_id(self, pdf_url: str, chunk_idx: int) -> str:
119
  """Generates a unique ID for each chunk based on PDF URL and index."""
120
  import hashlib
121
- url_hash = hashlib.sha256(pdf_url.encode()).hexdigest()[:10]
 
 
122
  return f"{url_hash}_{chunk_idx}_{uuid.uuid4().hex}"
123
 
124
  def add_document(self, pdf_url: str, document_name: str = None):
@@ -139,10 +144,11 @@ class DocumentRAG:
139
  print(f" Document '{display_name}' (from {pdf_url}) already in ChromaDB. Skipping re-indexing.")
140
  return
141
 
142
- # Check if the file is indeed a PDF before attempting download/processing
143
- file_extension_check = isinstance(pdf_url, str) and pdf_url.strip().lower().endswith('.pdf')
 
144
  if not file_extension_check:
145
- print(f" DEBUG: Skipped document '{display_name}' (URL: {pdf_url}) - Not a PDF (based on extension).")
146
  return
147
 
148
  try:
@@ -271,21 +277,25 @@ class DocumentRAG:
271
  doc_ref = self.firestore_db.collection('conversations').document(conversation_id) # Use self.firestore_db
272
  doc = doc_ref.get()
273
  if doc.exists:
274
- history = doc.to_dict().get('messages', [])
275
- print(f"Loaded history for {conversation_id}: {len(history)} messages.")
 
 
 
276
  return history
277
  print(f"No history found for conversation ID: {conversation_id}")
278
  return []
279
 
280
- def save_conversation_history(self, conversation_id: str, history: list[dict]):
281
- """Saves chat history to Firestore for a given conversation ID."""
282
  if self.firestore_db is None: # Use self.firestore_db
283
  print("Firestore not initialized. Cannot save conversation history.")
284
  return
285
 
286
  doc_ref = self.firestore_db.collection('conversations').document(conversation_id) # Use self.firestore_db
287
- doc_ref.set({'messages': history})
288
- print(f"Saved history for {conversation_id}: {len(history)} messages.")
 
289
 
290
  def truncate_history(self, messages: list[dict], max_tokens: int = LLM_HISTORY_MAX_TOKENS) -> list[dict]:
291
  """
@@ -308,7 +318,7 @@ class DocumentRAG:
308
  return messages
309
 
310
 
311
- def answer_question(self, question: str, conversation_id: str = None) -> tuple[str, str]:
312
  """
313
  Answers a question by retrieving context, and querying DeepSeek.
314
  Manages conversational memory.
@@ -348,8 +358,12 @@ class DocumentRAG:
348
  if history:
349
  messages.extend(history)
350
 
351
- # Add current context and question
352
- messages.append({"role": "user", "content": f"{context_prompt}Question: {question}"})
 
 
 
 
353
 
354
  # Truncate conversation history if it's too long
355
  messages = self.truncate_history(messages)
@@ -379,10 +393,14 @@ class DocumentRAG:
379
  citations_str = "\n\n**Sources:**\n" + "\n".join([f"- {name}" for name in unique_sources])
380
  answer += citations_str
381
 
382
- # Save updated history using the conversation_id
383
- messages.append({"role": "assistant", "content": answer})
384
- self.save_conversation_history(conversation_id, messages)
385
-
 
 
 
 
386
  # Return the answer text AND the conversation_id
387
  return answer, conversation_id
388
  else:
@@ -393,7 +411,7 @@ class DocumentRAG:
393
  # --- Main execution logic for local testing (only runs when script is executed directly) ---
394
  if __name__ == "__main__":
395
  from dotenv import load_dotenv # Import load_dotenv for local execution
396
- # Load environment variables from .env.local in the project root
397
  load_dotenv(dotenv_path=os.path.join(os.path.dirname(os.path.dirname(__file__)), '.env.local'))
398
 
399
  # Retrieve FIREBASE_CONFIG_BASE64 after loading dotenv (for local testing only)
@@ -466,17 +484,17 @@ if __name__ == "__main__":
466
 
467
  print("\n--- Chat With CompassIA (Type 'q' to exit) ---")
468
  current_conversation_id = str(uuid.uuid4())
469
- print(f"Starting new local conversation with ID: {current_conversation_id}")
 
 
470
 
471
  while True:
472
  user_question = input("\nHow can I help you? ")
473
  if user_question.lower() == 'q':
474
- print("Exiting chat.")
475
  break
476
 
477
- # In local testing, we still pass the conversation ID.
478
- # pdf_paths is no longer passed as a list of paths here, as document processing
479
- # is handled on startup by `add_document` from Firestore.
480
- answer_text, _ = rag_system.answer_question(user_question, conversation_id=current_conversation_id)
481
  # For local testing, we print the answer directly
482
- print(f"\nAI: {answer_text}")
 
10
  from PIL import Image
11
  import json # For handling JSON string (e.g., Firebase config in local test)
12
  import base64 # For decoding Base64 (e.g., Firebase config in local test)
13
+ from datetime import datetime # Import datetime for timestamps
14
  import urllib.parse # For parsing URLs
15
 
16
+
17
  # Firebase Admin SDK for Firestore
18
  import firebase_admin
19
  from firebase_admin import credentials, firestore
 
32
 
33
  # Import configurations and prompt from local modules
34
  from config import (
35
+ DEEPSEEK_API_URL, DEEPSEEK_HEADERS,
36
  EMBEDDING_MODEL_NAME, EMBEDDING_MODEL_USE_FP16,
37
  CHROMADB_PERSIST_DIRECTORY, CHROMADB_COLLECTION_NAME,
38
  CHUNK_SIZE, CHUNK_OVERLAP,
39
  LLM_TEMPERATURE, LLM_MAX_TOKENS, LLM_HISTORY_MAX_TOKENS,
40
+ FIREBASE_CONFIG_BASE64
41
  )
42
  from pdf_processing import extract_text_from_pdf, chunk_text
43
  from prompt import SYSTEM_PROMPT # <--- CORRECTLY IMPORTING SYSTEM_PROMPT
 
61
 
62
  if firebase_config_b64:
63
  try:
64
+ # Decode the Base64-encoded Firebase Service Account JSON
65
  cred_json = base64.b64decode(firebase_config_b64).decode('utf-8')
66
  cred_dict = json.loads(cred_json)
67
  cred = credentials.Certificate(cred_dict)
 
121
  def _generate_chunk_id(self, pdf_url: str, chunk_idx: int) -> str:
122
  """Generates a unique ID for each chunk based on PDF URL and index."""
123
  import hashlib
124
+ # Extract path without query parameters for hashing
125
+ path_without_query = urllib.parse.urlparse(pdf_url).path
126
+ url_hash = hashlib.sha256(path_without_query.encode()).hexdigest()[:10]
127
  return f"{url_hash}_{chunk_idx}_{uuid.uuid4().hex}"
128
 
129
  def add_document(self, pdf_url: str, document_name: str = None):
 
144
  print(f" Document '{display_name}' (from {pdf_url}) already in ChromaDB. Skipping re-indexing.")
145
  return
146
 
147
+ # CRITICAL FIX: Check if the file is indeed a PDF by examining the path component of the URL
148
+ parsed_url_path = urllib.parse.urlparse(pdf_url).path
149
+ file_extension_check = isinstance(parsed_url_path, str) and parsed_url_path.strip().lower().endswith('.pdf')
150
  if not file_extension_check:
151
+ print(f" DEBUG: Skipped document '{display_name}' (URL: {pdf_url}) - Not a PDF (based on file extension in URL path).")
152
  return
153
 
154
  try:
 
277
  doc_ref = self.firestore_db.collection('conversations').document(conversation_id) # Use self.firestore_db
278
  doc = doc_ref.get()
279
  if doc.exists:
280
+ # History is stored under a 'messages' array; the user ID is stored at the document root (set by save_conversation_history)
281
+ doc_data = doc.to_dict()
282
+ history = doc_data.get('messages', [])
283
+ user_id_from_db = doc_data.get('userId', 'unknown_user_from_db')
284
+ print(f"Loaded history for {conversation_id} (User: {user_id_from_db}): {len(history)} messages.")
285
  return history
286
  print(f"No history found for conversation ID: {conversation_id}")
287
  return []
288
 
289
+ def save_conversation_history(self, conversation_id: str, user_id: str, history: list[dict]):
290
+ """Saves chat history to Firestore for a given conversation ID, including user ID."""
291
  if self.firestore_db is None: # Use self.firestore_db
292
  print("Firestore not initialized. Cannot save conversation history.")
293
  return
294
 
295
  doc_ref = self.firestore_db.collection('conversations').document(conversation_id) # Use self.firestore_db
296
+ # Store user ID at the top level of the document, along with the messages array
297
+ doc_ref.set({'userId': user_id, 'messages': history})
298
+ print(f"Saved history for {conversation_id} (User: {user_id}): {len(history)} messages.")
299
 
300
  def truncate_history(self, messages: list[dict], max_tokens: int = LLM_HISTORY_MAX_TOKENS) -> list[dict]:
301
  """
 
318
  return messages
319
 
320
 
321
+ def answer_question(self, question: str, conversation_id: str = None, user_id: str = "anonymous_user") -> tuple[str, str]:
322
  """
323
  Answers a question by retrieving context, and querying DeepSeek.
324
  Manages conversational memory.
 
358
  if history:
359
  messages.extend(history)
360
 
361
+ # Add current user question with timestamp
362
+ messages.append({
363
+ "role": "user",
364
+ "content": f"{context_prompt}Question: {question}",
365
+ "timestamp": datetime.now().isoformat() # Add timestamp
366
+ })
367
 
368
  # Truncate conversation history if it's too long
369
  messages = self.truncate_history(messages)
 
393
  citations_str = "\n\n**Sources:**\n" + "\n".join([f"- {name}" for name in unique_sources])
394
  answer += citations_str
395
 
396
+ # Save updated history with AI response and timestamp
397
+ messages.append({
398
+ "role": "assistant",
399
+ "content": answer,
400
+ "timestamp": datetime.now().isoformat() # Add timestamp
401
+ })
402
+ self.save_conversation_history(conversation_id, user_id, messages) # Pass user_id to save
403
+
404
  # Return the answer text AND the conversation_id
405
  return answer, conversation_id
406
  else:
 
411
  # --- Main execution logic for local testing (only runs when script is executed directly) ---
412
  if __name__ == "__main__":
413
  from dotenv import load_dotenv # Import load_dotenv for local execution
414
+ # CRITICAL FIX: Load environment variables for local testing
415
  load_dotenv(dotenv_path=os.path.join(os.path.dirname(os.path.dirname(__file__)), '.env.local'))
416
 
417
  # Retrieve FIREBASE_CONFIG_BASE64 after loading dotenv (for local testing only)
 
484
 
485
  print("\n--- Chat With CompassIA (Type 'q' to exit) ---")
486
  current_conversation_id = str(uuid.uuid4())
487
+ # For local testing, we'll use a static user ID. In a real app, this would come from authentication.
488
+ current_user_id = "local_test_user_123"
489
+ print(f"Starting new local conversation with ID: {current_conversation_id} for user: {current_user_id}")
490
 
491
  while True:
492
  user_question = input("\nHow can I help you? ")
493
  if user_question.lower() == 'q':
494
+ print("Exiting chat...")
495
  break
496
 
497
+ # Pass both conversation ID and user ID to the answer_question method
498
+ answer_text, _ = rag_system.answer_question(user_question, conversation_id=current_conversation_id, user_id=current_user_id)
 
 
499
  # For local testing, we print the answer directly
500
+ print(f"\nAI: {answer_text}")
src/config.py CHANGED
@@ -1,5 +1,9 @@
1
  import os
2
 
 
 
 
 
3
  # --- OpenRouter DeepSeek API Configuration ---
4
  # Your DeepSeek API key, fetched from environment variables.
5
  # This should be set as a secret on Hugging Face Spaces.
@@ -40,9 +44,9 @@ CHUNK_OVERLAP = 100
40
  # Temperature for the DeepSeek model. Lower values make output more deterministic.
41
  LLM_TEMPERATURE = 0.5
42
  # Maximum number of tokens the LLM can generate in a response.
43
- LLM_MAX_TOKENS = 500
44
  # Max tokens for conversation history truncation (approximate, not exact token count)
45
- LLM_HISTORY_MAX_TOKENS = 3000
46
 
47
  # --- Tesseract and Poppler Configuration (Docker/Deployment Specific) ---
48
  # Environment variables set in Dockerfile for Tesseract.
 
1
  import os
2
 
3
+ from dotenv import load_dotenv # Import load_dotenv for local execution
4
+ # CRITICAL FIX: Load environment variables for local testing
5
+ load_dotenv(dotenv_path=os.path.join(os.path.dirname(os.path.dirname(__file__)), '.env.local'))
6
+
7
  # --- OpenRouter DeepSeek API Configuration ---
8
  # Your DeepSeek API key, fetched from environment variables.
9
  # This should be set as a secret on Hugging Face Spaces.
 
44
  # Temperature for the DeepSeek model. Lower values make output more deterministic.
45
  LLM_TEMPERATURE = 0.5
46
  # Maximum number of tokens the LLM can generate in a response.
47
+ LLM_MAX_TOKENS = 4096 # Adjusted to a more reasonable value for DeepSeek
48
  # Max tokens for conversation history truncation (approximate, not exact token count)
49
+ LLM_HISTORY_MAX_TOKENS = 9192
50
 
51
  # --- Tesseract and Poppler Configuration (Docker/Deployment Specific) ---
52
  # Environment variables set in Dockerfile for Tesseract.