dembasowmr committed on
Commit
fd8578e
·
1 Parent(s): a2967ae

Small cleanup

Browse files
Files changed (2) hide show
  1. .gitignore +1 -1
  2. src/TEST.PY +0 -481
.gitignore CHANGED
@@ -12,4 +12,4 @@ __pycache__/
12
 
13
  atemp/
14
 
15
- documents/
 
12
 
13
  atemp/
14
 
15
+ documents/
src/TEST.PY DELETED
@@ -1,481 +0,0 @@
1
- import sys
2
- # The pysqlite3 import and sys.modules override has been moved to app.py.
3
- # This file should NOT have its own pysqlite3 import to prevent conflicts.
4
-
5
- import requests
6
- import os
7
- import io
8
- import re
9
- import uuid # For generating unique IDs for ChromaDB and conversations
10
- from PIL import Image
11
- import json # For handling JSON string (e.g., Firebase config in local test)
12
- import base64 # For decoding Base64 (e.g., Firebase config in local test)
13
- import urllib.parse # For parsing URLs
14
-
15
-
16
- # Firebase Admin SDK for Firestore
17
- import firebase_admin
18
- from firebase_admin import credentials, firestore
19
-
20
- # For text extraction from PDFs (non-OCR)
21
- from pdfminer.high_level import extract_text_to_fp
22
- from pdfminer.layout import LAParams
23
-
24
- # For image-based PDFs (OCR)
25
- from pdf2image import convert_from_path
26
- import pytesseract
27
-
28
- # For embeddings and vector search
29
- from FlagEmbedding import BGEM3FlagModel
30
- import chromadb
31
-
32
- # Import configurations and prompt from local modules
33
- from config import (
34
- DEEPSEEK_API_URL, DEEPSEEK_HEADERS,
35
- EMBEDDING_MODEL_NAME, EMBEDDING_MODEL_USE_FP16,
36
- CHROMADB_PERSIST_DIRECTORY, CHROMADB_COLLECTION_NAME,
37
- CHUNK_SIZE, CHUNK_OVERLAP,
38
- LLM_TEMPERATURE, LLM_MAX_TOKENS, LLM_HISTORY_MAX_TOKENS,
39
- FIREBASE_CONFIG_BASE64
40
- )
41
- from pdf_processing import extract_text_from_pdf, chunk_text
42
- from prompt import SYSTEM_PROMPT # <--- CORRECTLY IMPORTING SYSTEM_PROMPT
43
-
44
# --- Global Firebase Firestore Client ---
# This global is primarily for __main__ (local testing) execution.
# In production (via app.py), the Firestore instance is passed directly
# to DocumentRAG's __init__.
FIRESTORE_DATABASE = None


def initialize_firebase_client():
    """Initialize the Firebase Admin SDK and return a Firestore client.

    Called by app.py at startup and by __main__ for local testing. The
    service-account credentials come from the Base64-encoded JSON in
    FIREBASE_CONFIG_BASE64 (imported from config).

    Returns:
        The Firestore client instance, or None when credentials are
        missing or invalid.
    """
    global FIRESTORE_DATABASE  # modified for the local-testing context

    if firebase_admin._apps:
        # Already initialized (e.g. by app.py's first call). The original
        # code called firestore.client() twice here and could return an
        # instance different from the one stored in the global; reuse one.
        print("Firebase Admin SDK already initialized.")
        if FIRESTORE_DATABASE is None:
            FIRESTORE_DATABASE = firestore.client()
        return FIRESTORE_DATABASE

    if not FIREBASE_CONFIG_BASE64:
        print("Warning: FIREBASE_CONFIG_BASE64 environment variable not found. Firestore will not be available.")
        FIRESTORE_DATABASE = None
        return None

    try:
        # Decode the Base64-encoded Firebase Service Account JSON.
        cred_json = base64.b64decode(FIREBASE_CONFIG_BASE64).decode('utf-8')
        cred_dict = json.loads(cred_json)
        cred = credentials.Certificate(cred_dict)
        firebase_admin.initialize_app(cred)
        print("Firebase Admin SDK initialized successfully.")
        FIRESTORE_DATABASE = firestore.client()
        print("Firestore client initialized successfully.")
        return FIRESTORE_DATABASE
    except Exception as e:
        print(f"Error initializing Firebase Admin SDK: {e}")
        print("Please ensure FIREBASE_CONFIG_BASE64 is correctly set and is a valid Base64-encoded Service Account JSON.")
        FIRESTORE_DATABASE = None
        return None
89
-
90
# --- Embedding Model Initialization ---
# The BGE-M3 model is loaded once at import time so every DocumentRAG
# instance can share a single in-memory copy.
print("Loading FlagEmbedding (BGE-M3) model...")
try:
    embedding_model = BGEM3FlagModel(EMBEDDING_MODEL_NAME, use_fp16=EMBEDDING_MODEL_USE_FP16)
except Exception as e:
    print(f"Error loading FlagEmbedding model: {e}")
    print("Ensure disk space and memory are sufficient for model download.")
    print("You might need to adjust 'use_fp16' based on your hardware (e.g., False for CPU/older GPUs).")
    sys.exit(1)  # clean exit in non-FastAPI contexts
else:
    print("FlagEmbedding (BGE-M3) model loaded successfully.")
100
-
101
-
102
class DocumentRAG:
    """Retrieval-augmented generation over PDF documents.

    PDFs are downloaded from URLs, text-extracted, chunked, embedded with
    the shared BGE-M3 model, and indexed into a persistent ChromaDB
    collection. Questions are answered by the DeepSeek API with retrieved
    chunks as context; conversation history is persisted in Firestore.
    """

    def __init__(self, embedding_model, persist_directory=CHROMADB_PERSIST_DIRECTORY, collection_name=CHROMADB_COLLECTION_NAME, firestore_db_instance=None):
        """Set up the embedding model, ChromaDB collection, and Firestore handle.

        Args:
            embedding_model: a loaded BGEM3FlagModel used for all encoding.
            persist_directory: on-disk location of the ChromaDB store.
            collection_name: name of the ChromaDB collection to use/create.
            firestore_db_instance: injected Firestore client (may be None;
                history load/save become no-ops in that case).
        """
        self.embedding_model = embedding_model
        self.persist_directory = persist_directory
        self.collection_name = collection_name
        self.chunk_size = CHUNK_SIZE
        self.overlap = CHUNK_OVERLAP
        # CRITICAL: store the injected Firestore instance.
        self.firestore_db = firestore_db_instance

        print(f"Initializing ChromaDB at: {self.persist_directory}")
        self.client = chromadb.PersistentClient(path=self.persist_directory)
        self.collection = self.client.get_or_create_collection(
            name=self.collection_name,
            metadata={"hnsw:space": "cosine"}
        )
        print(f"ChromaDB collection '{self.collection_name}' ready. Total chunks: {self.collection.count()}")

    def _generate_chunk_id(self, pdf_url: str, chunk_idx: int) -> str:
        """Generate a unique ID for a chunk from the PDF URL and chunk index.

        The URL's path (query string stripped) is hashed so the same file
        served with different signed-URL parameters hashes identically;
        a uuid4 suffix guarantees global uniqueness.
        """
        import hashlib
        path_without_query = urllib.parse.urlparse(pdf_url).path
        url_hash = hashlib.sha256(path_without_query.encode()).hexdigest()[:10]
        return f"{url_hash}_{chunk_idx}_{uuid.uuid4().hex}"

    def add_document(self, pdf_url: str, document_name: str = None):
        """Download a PDF from *pdf_url* and index its chunks into ChromaDB.

        Skips URLs already indexed (matched on the `source` metadata) and
        URLs whose path does not end in '.pdf'. Errors are reported to
        stdout and swallowed so a bad document never aborts a batch run.

        Args:
            pdf_url: HTTP(S) URL of the PDF to ingest.
            document_name: optional display name; defaults to the URL's
                basename.
        """
        parsed_url_path = urllib.parse.urlparse(pdf_url).path
        display_name = document_name if document_name else os.path.basename(parsed_url_path)
        print(f"Adding document from URL: {pdf_url} (Display Name: {display_name})")

        # Skip documents whose chunks are already present for this URL.
        results = self.collection.get(
            where={"source": pdf_url},
            limit=1
        )
        if results and results['ids']:
            print(f" Document '{display_name}' (from {pdf_url}) already in ChromaDB. Skipping re-indexing.")
            return

        # Only URLs whose path component ends in '.pdf' are accepted.
        # (urlparse().path is always a str, so the original isinstance
        # check was redundant.)
        if not parsed_url_path.strip().lower().endswith('.pdf'):
            print(f" DEBUG: Skipped document '{display_name}' (URL: {pdf_url}) - Not a PDF (based on file extension in URL path).")
            return

        temp_pdf_path = None
        try:
            # timeout added so a stalled server cannot hang ingestion forever
            response = requests.get(pdf_url, stream=True, timeout=60)
            print(f" DEBUG: HTTP Status Code for {pdf_url}: {response.status_code}")
            response.raise_for_status()  # HTTPError on 4xx/5xx

            pdf_data = io.BytesIO(response.content)
            print(f" DEBUG: BytesIO content length for {pdf_url}: {pdf_data.getbuffer().nbytes} bytes")
            if pdf_data.getbuffer().nbytes == 0:
                raise ValueError(f"Downloaded PDF content from {pdf_url} is empty.")

            # The extractors operate on file paths, so spill to a temp file.
            temp_pdf_path = f"/tmp/{uuid.uuid4().hex}.pdf"
            os.makedirs(os.path.dirname(temp_pdf_path), exist_ok=True)
            with open(temp_pdf_path, 'wb') as f:
                f.write(pdf_data.getvalue())
            print(f" DEBUG: Temporary PDF saved to: {temp_pdf_path}")

            extracted_text = extract_text_from_pdf(temp_pdf_path)
        except requests.exceptions.RequestException as e:
            print(f"Error downloading PDF from {pdf_url}: {e}")
            return
        except Exception as e:
            # The original had separate ValueError/Exception handlers with
            # identical bodies; merged.
            print(f"Error processing downloaded PDF {pdf_url}: {e}")
            return
        finally:
            # BUGFIX: the original removed the temp file only on the success
            # path, leaking it whenever extraction raised.
            if temp_pdf_path and os.path.exists(temp_pdf_path):
                os.remove(temp_pdf_path)

        if not extracted_text:
            print(f"Warning: No text extracted from {display_name} ({pdf_url}). Skipping.")
            return

        chunks = chunk_text(extracted_text, self.chunk_size, self.overlap)
        if not chunks:
            print(f"Warning: No chunks generated for {display_name} ({pdf_url}). Skipping.")
            return

        documents_to_add = []
        metadatas_to_add = []
        ids_to_add = []

        print(f" Generating embeddings for {len(chunks)} chunks and preparing for ChromaDB: {display_name}...")
        encoded_results = self.embedding_model.encode(
            chunks,
            batch_size=32,
            return_dense=True,
            return_sparse=False,
            return_colbert_vecs=False
        )
        chunk_embeddings = encoded_results["dense_vecs"]

        for i, chunk in enumerate(chunks):
            documents_to_add.append(chunk)
            metadatas_to_add.append({"source": pdf_url, "display_name": display_name, "chunk_id": i})
            ids_to_add.append(self._generate_chunk_id(pdf_url, i))

        self.collection.add(
            documents=documents_to_add,
            embeddings=chunk_embeddings.tolist(),
            metadatas=metadatas_to_add,
            ids=ids_to_add
        )

        print(f" {len(documents_to_add)} chunks from '{display_name}' added to ChromaDB.")
        print(f" Total chunks in collection: {self.collection.count()}")

    def retrieve_context(self, query: str, top_k: int = 3) -> list[dict]:
        """Retrieve the top_k most relevant chunks for *query* from ChromaDB.

        Returns:
            A list of dicts with keys 'text', 'source_url', and
            'display_name' (empty when nothing is indexed or no match).
        """
        if self.collection.count() == 0:
            print("Error: No documents indexed in ChromaDB. Cannot retrieve context.")
            return []

        print(f"Retrieving context for query: '{query}'")

        query_embedding_result = self.embedding_model.encode(
            [query],
            batch_size=1,
            return_dense=True,
            return_sparse=False,
            return_colbert_vecs=False
        )
        query_embedding = query_embedding_result["dense_vecs"].tolist()

        results = self.collection.query(
            query_embeddings=query_embedding,
            n_results=top_k,
            include=['documents', 'distances', 'metadatas']
        )

        retrieved_chunks_info = []
        if results and results['documents']:
            for i, doc_text in enumerate(results['documents'][0]):
                metadata = results['metadatas'][0][i]
                source_url = metadata.get('source', 'Unknown URL')
                display_name = metadata.get('display_name', os.path.basename(urllib.parse.urlparse(source_url).path))
                chunk_id_info = metadata.get('chunk_id', 'N/A')
                distance_info = results['distances'][0][i]

                retrieved_chunks_info.append({
                    "text": doc_text,
                    "source_url": source_url,
                    "display_name": display_name
                })
                print(f" Retrieved chunk {i+1} (distance: {distance_info:.4f}) from '{display_name}' (chunk {chunk_id_info}).")
        else:
            print(" No relevant chunks found in ChromaDB.")

        return retrieved_chunks_info

    def get_conversation_history(self, conversation_id: str) -> list[dict]:
        """Load chat history from Firestore for *conversation_id* ([] if none)."""
        if self.firestore_db is None:
            print("Firestore not initialized. Cannot load conversation history.")
            return []

        doc_ref = self.firestore_db.collection('conversations').document(conversation_id)
        doc = doc_ref.get()
        if doc.exists:
            history = doc.to_dict().get('messages', [])
            print(f"Loaded history for {conversation_id}: {len(history)} messages.")
            return history
        print(f"No history found for conversation ID: {conversation_id}")
        return []

    def save_conversation_history(self, conversation_id: str, history: list[dict]):
        """Save chat history to Firestore for *conversation_id* (no-op without Firestore)."""
        if self.firestore_db is None:
            print("Firestore not initialized. Cannot save conversation history.")
            return

        doc_ref = self.firestore_db.collection('conversations').document(conversation_id)
        doc_ref.set({'messages': history})
        print(f"Saved history for {conversation_id}: {len(history)} messages.")

    def truncate_history(self, messages: list[dict], max_tokens: int = LLM_HISTORY_MAX_TOKENS) -> list[dict]:
        """Truncate *messages* in place to roughly fit the LLM budget.

        Uses character counts as a cheap token proxy (no tokenizer). The
        oldest user/assistant pair is dropped first; a leading system
        message is always preserved. At least one message is kept.
        """
        current_len = sum(len(m['content']) for m in messages)
        while current_len > max_tokens and len(messages) > 1:
            if messages[0]['role'] == 'system':
                if len(messages) >= 3:
                    removed_user_msg = messages.pop(1)
                    removed_ai_msg = messages.pop(1)
                    current_len -= (len(removed_user_msg['content']) + len(removed_ai_msg['content']))
                else:
                    break
            elif len(messages) >= 3:
                removed_user_msg = messages.pop(0)
                removed_ai_msg = messages.pop(0)
                current_len -= (len(removed_user_msg['content']) + len(removed_ai_msg['content']))
            else:
                # BUGFIX: with exactly 2 non-system messages the original
                # popped a pair and emptied the history, contradicting the
                # "keep at least 1 message" intent. Drop only the older one.
                removed_msg = messages.pop(0)
                current_len -= len(removed_msg['content'])
        return messages

    def answer_question(self, question: str, conversation_id: str = None) -> tuple[str, str]:
        """Answer *question* using retrieved context and the DeepSeek API.

        Manages conversational memory: history is loaded from and saved
        back to Firestore under *conversation_id* (a fresh uuid4 is
        generated when None is supplied).

        Returns:
            (answer_text, conversation_id) — the id is returned even on
            API failure so the caller can keep the conversation going.
        """
        if conversation_id is None:
            conversation_id = str(uuid.uuid4())
            print(f"No conversation_id provided. Generating new one: {conversation_id}")

        # Gather relevant context from ChromaDB.
        context_chunks_info = self.retrieve_context(question)

        context_parts = []
        citation_info = {}  # dict used as an ordered set of unique sources
        for chunk_info in context_chunks_info:
            context_parts.append(chunk_info["text"])
            source_key = chunk_info.get("display_name", chunk_info["source_url"])
            if source_key not in citation_info:
                citation_info[source_key] = True

        context = "\n\n".join(context_parts)

        context_prompt = ""
        if context:
            context_prompt = f"Using the following context:\n\n{context}\n\n"
        else:
            print("Warning: No relevant context found. Answering based on general knowledge or indicating lack of information.")

        # Assemble the chat transcript: system prompt, prior history,
        # then the context-augmented question; trim to the token budget.
        messages = [{"role": "system", "content": SYSTEM_PROMPT}]
        history = self.get_conversation_history(conversation_id)
        if history:
            messages.extend(history)
        messages.append({"role": "user", "content": f"{context_prompt}Question: {question}"})
        messages = self.truncate_history(messages)

        print("\nSending request to DeepSeek API...")
        data = {
            "model": "deepseek/deepseek-chat:free",
            "messages": messages,
            "temperature": LLM_TEMPERATURE,
            "max_tokens": LLM_MAX_TOKENS,
        }
        # timeout added so a stalled upstream cannot hang the caller forever
        response = requests.post(DEEPSEEK_API_URL, json=data, headers=DEEPSEEK_HEADERS, timeout=120)

        if response.status_code != 200:
            error_message = f"Failed to fetch data from DeepSeek API. Status Code: {response.status_code}. Response: {response.text}"
            print(error_message)
            return f"Error: Could not get an answer from the AI. Details: {error_message}", conversation_id

        ai_response = response.json()
        answer = ai_response['choices'][0]['message']['content']
        print("\nDeepSeek Response:")
        print(answer)

        # Append a sorted, de-duplicated source list to the answer.
        if citation_info:
            unique_sources = sorted(citation_info.keys())
            citations_str = "\n\n**Sources:**\n" + "\n".join(f"- {name}" for name in unique_sources)
            answer += citations_str

        messages.append({"role": "assistant", "content": answer})
        self.save_conversation_history(conversation_id, messages)

        return answer, conversation_id
397
-
398
# --- Main execution logic for local testing (only runs when script is executed directly) ---
if __name__ == "__main__":

    # For local testing, initialize Firebase and capture the instance.
    local_firestore_instance = initialize_firebase_client()

    rag_system = DocumentRAG(
        embedding_model=embedding_model,
        persist_directory=CHROMADB_PERSIST_DIRECTORY,
        collection_name=CHROMADB_COLLECTION_NAME,
        firestore_db_instance=local_firestore_instance  # inject for local testing
    )

    print("\n--- Indexing Documents ---")
    if local_firestore_instance:
        try:
            docs_ref = local_firestore_instance.collection('documents').stream()
            firestore_pdf_infos = []
            documents_processed_count = 0
            documents_skipped_non_pdf_count = 0

            for doc in docs_ref:
                documents_processed_count += 1
                doc_data = doc.to_dict()
                print(f" DEBUG: Processing document ID: {doc.id}, Data: {doc_data}")

                if 'fileUrl' in doc_data:
                    pdf_url = doc_data['fileUrl']
                    print(f" DEBUG: Found 'fileUrl': {pdf_url}")

                    # add_document validates the .pdf extension internally.
                    display_name = doc_data.get('name_en', None)
                    firestore_pdf_infos.append({"url": pdf_url, "name": display_name})
                else:
                    documents_skipped_non_pdf_count += 1
                    # BUGFIX: Firestore's DocumentSnapshot has no `.data`
                    # attribute in the Python SDK; the original `doc.data`
                    # raised AttributeError here. Use the dict fetched above.
                    print(f" DEBUG: Document ID: {doc.id} does not contain 'fileUrl'. Document data: {doc_data}")

            if documents_processed_count == 0:
                print("No documents found in Firestore collection 'documents' via stream(). Please check collection name and security rules.")
            elif documents_processed_count > 0 and not firestore_pdf_infos:
                print(f"Found {documents_processed_count} documents in Firestore, but none matched the '.pdf' criteria or had 'fileUrl'.")
            elif documents_skipped_non_pdf_count > 0:
                print(f"Found {documents_processed_count} documents in Firestore. {len(firestore_pdf_infos)} URLs found, {documents_skipped_non_pdf_count} documents skipped (non-URL or non-PDF by add_document).")

            # (The original wrapped this loop in an `if/else: pass`.)
            for pdf_info in firestore_pdf_infos:
                rag_system.add_document(pdf_info['url'], pdf_info['name'])

        except Exception as e:
            print(f"Error fetching documents from Firestore: {e}")
            print("Please ensure your Firestore database is accessible and the service account key is correct.")
    else:
        print("Firestore client not initialized. Cannot fetch documents from Firestore.")
        print("Using local PDF_DOCUMENT_PATHS as a fallback for testing purposes (ensure these files exist).")
        # Imported here to avoid a circular dependency if config imports this module.
        from config import PDF_DOCUMENT_PATHS  # local-testing fallback only
        for pdf_path in PDF_DOCUMENT_PATHS:
            if os.path.exists(pdf_path):
                rag_system.add_document(pdf_path)
            else:
                print(f"Error: Local PDF file not found at {pdf_path}. Skipping.")

    print("\n--- Chat With CompassIA (Type 'q' to exit) ---")
    current_conversation_id = str(uuid.uuid4())
    print(f"Starting new local conversation with ID: {current_conversation_id}")

    while True:
        user_question = input("\nHow can I help you? ")
        if user_question.lower() == 'q':
            print("Exiting chat...")
            break

        # Documents are indexed on startup above; only the question and the
        # sticky conversation id are passed per turn.
        answer_text, _ = rag_system.answer_question(user_question, conversation_id=current_conversation_id)
        print(f"\nAI: {answer_text}")
481
-