File size: 23,754 Bytes
15d9931
f91e3a6
 
 
15d9931
 
f91e3a6
 
 
 
 
 
a2967ae
15d9931
 
a2967ae
15d9931
 
 
 
f91e3a6
 
 
 
 
 
 
 
15d9931
 
 
 
9ac318d
 
 
 
 
 
 
15d9931
 
a2967ae
15d9931
 
 
 
a2967ae
15d9931
f91e3a6
0f574db
15d9931
 
f91e3a6
 
 
15d9931
 
f91e3a6
 
 
 
 
 
 
 
 
 
9ac318d
15d9931
a2967ae
9ac318d
15d9931
 
 
 
f91e3a6
 
15d9931
f91e3a6
15d9931
 
 
 
 
 
 
 
 
f91e3a6
15d9931
f91e3a6
 
 
 
 
15d9931
 
 
 
 
 
 
 
 
 
f91e3a6
15d9931
 
 
f91e3a6
15d9931
 
 
 
f91e3a6
 
 
15d9931
 
 
 
 
 
 
 
 
 
 
 
a2967ae
 
 
15d9931
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a2967ae
 
 
f91e3a6
a2967ae
f91e3a6
 
15d9931
 
f91e3a6
15d9931
 
 
f91e3a6
15d9931
 
f91e3a6
15d9931
f91e3a6
15d9931
f91e3a6
15d9931
 
 
f91e3a6
15d9931
 
 
 
 
 
 
f91e3a6
15d9931
 
f91e3a6
15d9931
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f91e3a6
15d9931
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f91e3a6
15d9931
 
 
f91e3a6
15d9931
 
a2967ae
 
 
 
 
15d9931
 
 
 
a2967ae
 
f91e3a6
15d9931
 
 
f91e3a6
a2967ae
 
 
15d9931
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a2967ae
15d9931
 
f91e3a6
 
15d9931
f91e3a6
 
 
 
 
15d9931
 
f91e3a6
15d9931
 
f91e3a6
15d9931
 
 
 
 
f91e3a6
15d9931
 
 
 
 
 
 
 
 
0f574db
 
15d9931
f91e3a6
 
 
 
15d9931
a2967ae
 
 
 
 
 
15d9931
 
 
 
 
 
 
 
 
 
 
 
 
8bed3b8
 
15d9931
 
 
 
 
 
 
f91e3a6
15d9931
f91e3a6
 
 
15d9931
 
a2967ae
 
 
 
 
 
 
 
f91e3a6
 
15d9931
 
 
f91e3a6
15d9931
f91e3a6
15d9931
 
f91e3a6
 
15d9931
 
 
 
f91e3a6
 
15d9931
 
 
f91e3a6
15d9931
f91e3a6
15d9931
f91e3a6
 
15d9931
 
 
 
 
 
 
 
 
 
f91e3a6
 
 
15d9931
 
f91e3a6
15d9931
 
 
 
f91e3a6
15d9931
f91e3a6
15d9931
 
 
 
f91e3a6
 
15d9931
f91e3a6
15d9931
 
 
 
 
 
 
 
f91e3a6
15d9931
 
 
 
 
 
 
 
f91e3a6
a2967ae
 
 
15d9931
 
 
 
a2967ae
15d9931
 
a2967ae
 
f91e3a6
a2967ae
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
import sys
# The pysqlite3 import and sys.modules override has been moved to app.py.
# This file should NOT have its own pysqlite3 import to prevent conflicts.

import requests
import os
import io
import re
import uuid # For generating unique IDs for ChromaDB and conversations
from PIL import Image
import json # For handling JSON string (e.g., Firebase config in local test)
import base64 # For decoding Base64 (e.g., Firebase config in local test)
from datetime import datetime # Import datetime for timestamps
import urllib.parse # For parsing URLs


# Firebase Admin SDK for Firestore
import firebase_admin
from firebase_admin import credentials, firestore

# For text extraction from PDFs (non-OCR)
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams

# For image-based PDFs (OCR)
from pdf2image import convert_from_path
import pytesseract

# For embeddings and vector search
from FlagEmbedding import BGEM3FlagModel
import chromadb

from dotenv import load_dotenv # Import load_dotenv for local execution
# Load environment variables from the `.env.local` file that sits one directory
# above this module's directory. This must run before `config` is imported below.
load_dotenv(dotenv_path=os.path.join(os.path.dirname(os.path.dirname(__file__)), '.env.local'))

# NOTE(review): FIREBASE_CONFIG_BASE64 is imported from `config` below; presumably
# config.py reads it from the environment populated above — verify against config.py.

# Import configurations and prompt from local modules
from config import (
    DEEPSEEK_API_URL, DEEPSEEK_HEADERS,
    EMBEDDING_MODEL_NAME, EMBEDDING_MODEL_USE_FP16,
    CHROMADB_PERSIST_DIRECTORY, CHROMADB_COLLECTION_NAME,
    CHUNK_SIZE, CHUNK_OVERLAP,
    LLM_TEMPERATURE, LLM_MAX_TOKENS, LLM_HISTORY_MAX_TOKENS,
    FIREBASE_CONFIG_BASE64
)
from pdf_processing import extract_text_from_pdf, chunk_text
from prompt import SYSTEM_PROMPT # <--- CORRECTLY IMPORTING SYSTEM_PROMPT

# --- Global Firebase Firestore Client ---
# Module-level handle to the Firestore client. It starts as None and is assigned
# by initialize_firebase_client(). DocumentRAG does not read this global; it uses
# the Firestore instance passed to its constructor.
FIRESTORE_DATABASE = None

def initialize_firebase_client():
    """
    Initialize the Firebase Admin SDK (at most once) and return a Firestore client.

    Credentials come from ``FIREBASE_CONFIG_BASE64``, which must hold a
    Base64-encoded service-account JSON document. When the default Firebase app
    already exists, it is reused instead of being initialized again.

    Side effects:
        Assigns the module-level ``FIRESTORE_DATABASE``: the client on success,
        ``None`` when the configuration is missing or initialization fails.

    Returns:
        The Firestore client, or ``None`` if Firestore is unavailable.
    """
    global FIRESTORE_DATABASE

    # Probe for the default app through the public API rather than the private
    # `firebase_admin._apps` registry; get_app() raises ValueError when the
    # default app has not been initialized yet.
    try:
        firebase_admin.get_app()
    except ValueError:
        already_initialized = False
    else:
        already_initialized = True

    if already_initialized:
        print("Firebase Admin SDK already initialized.")
        client = firestore.client()
        # Populate the global only if nobody set it yet (e.g. the app was
        # initialized elsewhere before this function ran).
        if FIRESTORE_DATABASE is None:
            FIRESTORE_DATABASE = client
        return client

    if not FIREBASE_CONFIG_BASE64:
        print("Warning: FIREBASE_CONFIG_BASE64 environment variable not found. Firestore will not be available.")
        FIRESTORE_DATABASE = None
        return None

    try:
        # Decode the Base64-encoded Firebase Service Account JSON
        cred_json = base64.b64decode(FIREBASE_CONFIG_BASE64).decode('utf-8')
        cred = credentials.Certificate(json.loads(cred_json))
        firebase_admin.initialize_app(cred)
        print("Firebase Admin SDK initialized successfully.")
        firestore_instance = firestore.client()
        FIRESTORE_DATABASE = firestore_instance
        print("Firestore client initialized successfully.")
        return firestore_instance
    except Exception as e:
        # Deliberately broad: any failure here (bad Base64, bad JSON, invalid
        # credentials, client creation) means "run without Firestore".
        print(f"Error initializing Firebase Admin SDK: {e}")
        print("Please ensure FIREBASE_CONFIG_BASE64 is correctly set and is a valid Base64-encoded Service Account JSON.")
        FIRESTORE_DATABASE = None
        return None

# --- Embedding Model Initialization ---
# The model is loaded once, at import time. A load failure terminates the
# process with exit status 1 after printing troubleshooting hints.
print("Loading FlagEmbedding (BGE-M3) model...")
try:
    embedding_model = BGEM3FlagModel(
        EMBEDDING_MODEL_NAME,
        use_fp16=EMBEDDING_MODEL_USE_FP16,
    )
except Exception as load_error:
    for hint in (
        f"Error loading FlagEmbedding model: {load_error}",
        "Ensure disk space and memory are sufficient for model download.",
        "You might need to adjust 'use_fp16' based on your hardware (e.g., False for CPU/older GPUs).",
    ):
        print(hint)
    sys.exit(1)  # Use sys.exit for clean exit in non-FastAPI contexts
else:
    print("FlagEmbedding (BGE-M3) model loaded successfully.")


class DocumentRAG:
    """
    Retrieval-augmented question answering over PDF documents.

    PDF text is chunked, embedded with the supplied embedding model and stored
    in a persistent ChromaDB collection. A question is answered by retrieving
    the closest chunks and sending them, together with the conversation
    history, to the chat endpoint at ``DEEPSEEK_API_URL``. Conversation history
    is persisted in Firestore when a Firestore client was injected.
    """

    # Upper bounds (seconds) for outbound HTTP calls, so that a stalled server
    # cannot block the caller forever. Override on a subclass/instance if needed.
    PDF_DOWNLOAD_TIMEOUT = 60
    LLM_REQUEST_TIMEOUT = 120

    def __init__(self, embedding_model, persist_directory=CHROMADB_PERSIST_DIRECTORY, collection_name=CHROMADB_COLLECTION_NAME, firestore_db_instance=None):
        """
        Open (or create) the ChromaDB collection and store the collaborators.

        Args:
            embedding_model: Object exposing ``encode(...)`` that returns a
                mapping with a ``"dense_vecs"`` entry.
            persist_directory: Directory of the persistent ChromaDB store.
            collection_name: Name of the ChromaDB collection to use.
            firestore_db_instance: Firestore client used for conversation
                history, or ``None`` to run without persisted history.
        """
        self.embedding_model = embedding_model
        self.persist_directory = persist_directory
        self.collection_name = collection_name
        self.chunk_size = CHUNK_SIZE
        self.overlap = CHUNK_OVERLAP
        self.firestore_db = firestore_db_instance  # Injected Firestore client (may be None)

        print(f"Initializing ChromaDB at: {self.persist_directory}")
        self.client = chromadb.PersistentClient(path=self.persist_directory)

        self.collection = self.client.get_or_create_collection(
            name=self.collection_name,
            metadata={"hnsw:space": "cosine"}
        )
        print(f"ChromaDB collection '{self.collection_name}' ready. Total chunks: {self.collection.count()}")

    def _generate_chunk_id(self, pdf_url: str, chunk_idx: int) -> str:
        """
        Build a unique ChromaDB ID for one chunk.

        The ID combines a short hash of the URL path (query string excluded),
        the chunk index and a random UUID, so it differs on every call.
        """
        import hashlib
        # Extract path without query parameters for hashing
        path_without_query = urllib.parse.urlparse(pdf_url).path
        url_hash = hashlib.sha256(path_without_query.encode()).hexdigest()[:10]
        return f"{url_hash}_{chunk_idx}_{uuid.uuid4().hex}"

    def _load_pdf_text(self, pdf_url: str) -> str:
        """
        Return the extracted text of the PDF at ``pdf_url``.

        ``pdf_url`` may be a path to an existing local file (read directly) or
        a URL (downloaded to a temporary file that is always removed again).

        Raises:
            requests.exceptions.RequestException: The download failed.
            ValueError: The downloaded body is empty.
        """
        import tempfile

        # Local file (used by the local-testing fallback): no download needed.
        if os.path.isfile(pdf_url):
            return extract_text_from_pdf(pdf_url)

        response = requests.get(pdf_url, timeout=self.PDF_DOWNLOAD_TIMEOUT)
        print(f"  DEBUG: HTTP Status Code for {pdf_url}: {response.status_code}")
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)

        pdf_bytes = response.content
        print(f"  DEBUG: BytesIO content length for {pdf_url}: {len(pdf_bytes)} bytes")
        if not pdf_bytes:
            raise ValueError(f"Downloaded PDF content from {pdf_url} is empty.")

        # extract_text_from_pdf works on a file path, so spill the bytes to a
        # temp file in the platform's temp directory.
        fd, temp_pdf_path = tempfile.mkstemp(suffix=".pdf")
        try:
            with os.fdopen(fd, 'wb') as temp_file:
                temp_file.write(pdf_bytes)
            print(f"  DEBUG: Temporary PDF saved to: {temp_pdf_path}")
            return extract_text_from_pdf(temp_pdf_path)
        finally:
            # Clean up even when writing or extraction raises.
            os.remove(temp_pdf_path)

    def add_document(self, pdf_url: str, document_name: str = None):
        """
        Index one PDF in ChromaDB.

        The document is skipped when chunks with the same ``source`` are already
        stored, when the URL path does not end in ``.pdf``, or when no text or
        chunks can be produced. Download/processing errors are printed and
        swallowed so that one bad document does not abort a batch.

        Args:
            pdf_url: URL of the PDF, or a path to a local PDF file.
            document_name: Display name stored with each chunk; defaults to
                the file name taken from the URL path.
        """
        url_path = urllib.parse.urlparse(pdf_url).path
        display_name = document_name if document_name else os.path.basename(url_path)
        print(f"Adding document from URL: {pdf_url} (Display Name: {display_name})")

        existing = self.collection.get(
            where={"source": pdf_url},
            limit=1
        )
        if existing and existing['ids']:
            print(f"  Document '{display_name}' (from {pdf_url}) already in ChromaDB. Skipping re-indexing.")
            return

        # Only the path component is inspected so query strings (e.g. signed
        # URL tokens) do not hide the extension.
        if not url_path.strip().lower().endswith('.pdf'):
            print(f"  DEBUG: Skipped document '{display_name}' (URL: {pdf_url}) - Not a PDF (based on file extension in URL path).")
            return

        try:
            extracted_text = self._load_pdf_text(pdf_url)
        except requests.exceptions.RequestException as e:
            print(f"Error downloading PDF from {pdf_url}: {e}")
            return
        except Exception as e:
            print(f"Error processing downloaded PDF {pdf_url}: {e}")
            return

        if not extracted_text:
            print(f"Warning: No text extracted from {display_name} ({pdf_url}). Skipping.")
            return

        chunks = chunk_text(extracted_text, self.chunk_size, self.overlap)
        if not chunks:
            print(f"Warning: No chunks generated for {display_name} ({pdf_url}). Skipping.")
            return

        print(f"  Generating embeddings for {len(chunks)} chunks and preparing for ChromaDB: {display_name}...")

        encoded_results = self.embedding_model.encode(
            chunks,
            batch_size=32,
            return_dense=True,
            return_sparse=False,
            return_colbert_vecs=False
        )
        chunk_embeddings = encoded_results["dense_vecs"]

        metadatas_to_add = [
            {"source": pdf_url, "display_name": display_name, "chunk_id": i}
            for i in range(len(chunks))
        ]
        ids_to_add = [self._generate_chunk_id(pdf_url, i) for i in range(len(chunks))]

        self.collection.add(
            documents=list(chunks),
            embeddings=chunk_embeddings.tolist(),
            metadatas=metadatas_to_add,
            ids=ids_to_add
        )

        print(f"  {len(chunks)} chunks from '{display_name}' added to ChromaDB.")
        print(f"  Total chunks in collection: {self.collection.count()}")

    def retrieve_context(self, query: str, top_k: int = 3) -> list[dict]:
        """
        Retrieve the ``top_k`` chunks closest to ``query`` from ChromaDB.

        Returns:
            A list of dicts with the keys ``"text"``, ``"source_url"`` and
            ``"display_name"``; empty when nothing is indexed or nothing matches.
        """
        total_chunks = self.collection.count()
        if total_chunks == 0:
            print("Error: No documents indexed in ChromaDB. Cannot retrieve context.")
            return []

        print(f"Retrieving context for query: '{query}'")

        query_embedding_result = self.embedding_model.encode(
            [query],
            batch_size=1,
            return_dense=True,
            return_sparse=False,
            return_colbert_vecs=False
        )
        query_embedding = query_embedding_result["dense_vecs"].tolist()

        results = self.collection.query(
            query_embeddings=query_embedding,
            # Never ask for more neighbours than there are stored chunks.
            n_results=min(top_k, total_chunks),
            include=['documents', 'distances', 'metadatas']
        )

        # Results are nested per query; only one query was sent, hence [0].
        if not (results and results['documents'] and results['documents'][0]):
            print("  No relevant chunks found in ChromaDB.")
            return []

        retrieved_chunks_info = []
        rows = zip(results['documents'][0], results['metadatas'][0], results['distances'][0])
        for rank, (doc_text, metadata, distance_info) in enumerate(rows, start=1):
            metadata = metadata or {}
            source_url = metadata.get('source', 'Unknown URL')
            display_name = metadata.get('display_name', os.path.basename(urllib.parse.urlparse(source_url).path))
            chunk_id_info = metadata.get('chunk_id', 'N/A')

            retrieved_chunks_info.append({
                "text": doc_text,
                "source_url": source_url,
                "display_name": display_name
            })
            print(f"  Retrieved chunk {rank} (distance: {distance_info:.4f}) from '{display_name}' (chunk {chunk_id_info}).")

        return retrieved_chunks_info

    def get_conversation_history(self, conversation_id: str) -> list[dict]:
        """
        Load the stored message list of a conversation from Firestore.

        Returns:
            The ``messages`` array of the ``conversations/<conversation_id>``
            document, or ``[]`` when Firestore is unavailable or the document
            does not exist.
        """
        if self.firestore_db is None:
            print("Firestore not initialized. Cannot load conversation history.")
            return []

        doc_ref = self.firestore_db.collection('conversations').document(conversation_id)
        doc = doc_ref.get()
        if not doc.exists:
            print(f"No history found for conversation ID: {conversation_id}")
            return []

        # The user ID is stored at the document root next to the messages array.
        doc_data = doc.to_dict()
        history = doc_data.get('messages', [])
        user_id_from_db = doc_data.get('userId', 'unknown_user_from_db')
        print(f"Loaded history for {conversation_id} (User: {user_id_from_db}): {len(history)} messages.")
        return history

    def save_conversation_history(self, conversation_id: str, user_id: str, history: list[dict]):
        """
        Overwrite the Firestore document of a conversation.

        The document holds ``userId`` and the full ``messages`` array. Does
        nothing (besides printing a notice) when Firestore is unavailable.
        """
        if self.firestore_db is None:
            print("Firestore not initialized. Cannot save conversation history.")
            return

        doc_ref = self.firestore_db.collection('conversations').document(conversation_id)
        doc_ref.set({'userId': user_id, 'messages': history})
        print(f"Saved history for {conversation_id} (User: {user_id}): {len(history)} messages.")

    def truncate_history(self, messages: list[dict], max_tokens: int = LLM_HISTORY_MAX_TOKENS) -> list[dict]:
        """
        Drop the oldest turns until the history fits the size budget.

        The budget is approximated by character counts of ``content`` (no
        tokenizer is used). A leading system message and the final message
        (the current question) are never removed. ``messages`` is modified in
        place and also returned.

        Args:
            messages: Chat messages, oldest first.
            max_tokens: Size budget, compared against summed content lengths.
        """
        def content_len(message: dict) -> int:
            # Tolerate messages without (or with null) content.
            return len(message.get('content') or '')

        if not messages:
            return messages

        # Index of the oldest message that is allowed to be dropped.
        oldest = 1 if messages[0].get('role') == 'system' else 0
        current_len = sum(content_len(m) for m in messages)

        # `len(messages) - oldest > 1` keeps the final message in place.
        while current_len > max_tokens and len(messages) - oldest > 1:
            current_len -= content_len(messages.pop(oldest))
            # Also drop the assistant reply belonging to the removed turn so
            # the remaining history does not start with an orphaned answer.
            if len(messages) - oldest > 1 and messages[oldest].get('role') == 'assistant':
                current_len -= content_len(messages.pop(oldest))
        return messages

    def answer_question(self, question: str, conversation_id: str = None, user_id: str = "anonymous_user") -> tuple[str, str]:
        """
        Answer ``question`` using retrieved context and the conversation history.

        On success the answer (with a "Sources" list appended when context was
        used) is stored in the conversation history. On any API failure an
        error string is returned instead and nothing is saved.

        Args:
            question: The user's question.
            conversation_id: Existing conversation to continue; a new UUID is
                generated when omitted.
            user_id: Stored alongside the conversation history.

        Returns:
            ``(answer_text, conversation_id)``; the ID is returned on errors too.
        """
        if conversation_id is None:
            conversation_id = str(uuid.uuid4())  # Generate new ID if not provided
            print(f"No conversation_id provided. Generating new one: {conversation_id}")

        # Get relevant context from ChromaDB
        context_chunks_info = self.retrieve_context(question)
        context = "\n\n".join(chunk["text"] for chunk in context_chunks_info)
        cited_sources = {
            chunk.get("display_name", chunk["source_url"]) for chunk in context_chunks_info
        }

        context_prompt = ""
        if context:
            context_prompt = f"Using the following context:\n\n{context}\n\n"
        else:
            print("Warning: No relevant context found. Answering based on general knowledge or indicating lack of information.")

        messages = [{"role": "system", "content": SYSTEM_PROMPT}]

        # The saved history contains the system prompt of earlier turns; skip
        # it so the prompt is sent (and stored) exactly once.
        history = self.get_conversation_history(conversation_id)
        messages.extend(m for m in history if m.get("role") != "system")

        # Add current user question with timestamp
        messages.append({
            "role": "user",
            "content": f"{context_prompt}Question: {question}",
            "timestamp": datetime.now().isoformat()
        })

        # Truncate conversation history if it's too long
        messages = self.truncate_history(messages)

        # Timestamps are bookkeeping for Firestore only; the chat API gets
        # plain role/content pairs.
        api_messages = [
            {"role": m.get("role", "user"), "content": m.get("content") or ""}
            for m in messages
        ]

        # Call DeepSeek API via OpenRouter
        print("\nSending request to DeepSeek API...")
        data = {
            "model": "deepseek/deepseek-chat:free",
            "messages": api_messages,
            "temperature": LLM_TEMPERATURE,
            "max_tokens": LLM_MAX_TOKENS,
        }

        try:
            response = requests.post(
                DEEPSEEK_API_URL,
                json=data,
                headers=DEEPSEEK_HEADERS,
                timeout=self.LLM_REQUEST_TIMEOUT,
            )
        except requests.exceptions.RequestException as e:
            error_message = f"Failed to reach DeepSeek API: {e}"
            print(error_message)
            return f"Error: Could not get an answer from the AI. Details: {error_message}", conversation_id

        if response.status_code != 200:
            error_message = f"Failed to fetch data from DeepSeek API. Status Code: {response.status_code}. Response: {response.text}"
            print(error_message)
            return f"Error: Could not get an answer from the AI. Details: {error_message}", conversation_id

        try:
            answer = response.json()['choices'][0]['message']['content']
        except (ValueError, KeyError, IndexError, TypeError):
            # A 200 response can still carry an error payload or invalid JSON.
            error_message = f"Unexpected response format from DeepSeek API. Response: {response.text}"
            print(error_message)
            return f"Error: Could not get an answer from the AI. Details: {error_message}", conversation_id

        print("\nDeepSeek Response:")
        print(answer)

        # Append the unique sources, sorted for consistent output
        if cited_sources:
            answer += "\n\n**Sources:**\n" + "\n".join(f"- {name}" for name in sorted(cited_sources))

        # Save updated history with AI response and timestamp
        messages.append({
            "role": "assistant",
            "content": answer,
            "timestamp": datetime.now().isoformat()
        })
        self.save_conversation_history(conversation_id, user_id, messages)

        return answer, conversation_id

# --- Main execution logic for local testing (only runs when script is executed directly) ---
# Flow: initialize Firestore, index every document that has a 'fileUrl' (or the
# local fallback PDFs when Firestore is unavailable), then run an interactive chat loop.
if __name__ == "__main__":

    # For local testing, initialize Firebase and capture the instance
    local_firestore_instance = initialize_firebase_client()

    rag_system = DocumentRAG(
        embedding_model=embedding_model,
        persist_directory=CHROMADB_PERSIST_DIRECTORY,
        collection_name=CHROMADB_COLLECTION_NAME,
        firestore_db_instance=local_firestore_instance # Pass the instance here for local testing
    )

    print("\n--- Indexing Documents ---")
    if local_firestore_instance:
        try:
            docs_ref = local_firestore_instance.collection('documents').stream()
            firestore_pdf_infos = []
            documents_processed_count = 0
            documents_skipped_non_pdf_count = 0

            for doc in docs_ref:
                documents_processed_count += 1
                doc_data = doc.to_dict()
                print(f"  DEBUG: Processing document ID: {doc.id}, Data: {doc_data}")

                if 'fileUrl' in doc_data:
                    pdf_url = doc_data['fileUrl']
                    print(f"  DEBUG: Found 'fileUrl': {pdf_url}")

                    # add_document performs the PDF check itself, so none is needed here
                    display_name = doc_data.get('name_en', None)
                    firestore_pdf_infos.append({"url": pdf_url, "name": display_name})
                else:
                    documents_skipped_non_pdf_count += 1
                    # Log the already-fetched dict: a DocumentSnapshot has no `.data`
                    # attribute, and the resulting AttributeError would abort indexing.
                    print(f"  DEBUG: Document ID: {doc.id} does not contain 'fileUrl'. Document data: {doc_data}")

            if documents_processed_count == 0:
                print("No documents found in Firestore collection 'documents' via stream(). Please check collection name and security rules.")
            elif not firestore_pdf_infos:
                print(f"Found {documents_processed_count} documents in Firestore, but none matched the '.pdf' criteria or had 'fileUrl'.")
            elif documents_skipped_non_pdf_count > 0:
                print(f"Found {documents_processed_count} documents in Firestore. {len(firestore_pdf_infos)} URLs found, {documents_skipped_non_pdf_count} documents skipped (non-URL or non-PDF by add_document).")

            for pdf_info in firestore_pdf_infos:
                # rag_system.add_document will internally check for PDF extension
                rag_system.add_document(pdf_info['url'], pdf_info['name'])

        except Exception as e:
            print(f"Error fetching documents from Firestore: {e}")
            print("Please ensure your Firestore database is accessible and the service account key is correct.")
    else:
        print("Firestore client not initialized. Cannot fetch documents from Firestore.")
        print("Using local PDF_DOCUMENT_PATHS as a fallback for testing purposes (ensure these files exist).")
        # This import is moved here to avoid circular dependency if config imports rag_system
        from config import PDF_DOCUMENT_PATHS # This path is for local testing only
        for pdf_path in PDF_DOCUMENT_PATHS:
            if os.path.exists(pdf_path):
                rag_system.add_document(pdf_path)
            else:
                print(f"Error: Local PDF file not found at {pdf_path}. Skipping.")


    print("\n--- Chat With CompassIA (Type 'q' to exit) ---")
    current_conversation_id = str(uuid.uuid4())
    # For local testing, we'll use a static user ID. In a real app, this would come from authentication.
    current_user_id = "local_test_user_123"
    print(f"Starting new local conversation with ID: {current_conversation_id} for user: {current_user_id}")

    while True:
        user_question = input("\nHow can I help you? ")
        if user_question.lower() == 'q':
            print("Exiting chat...")
            break

        # Pass both conversation ID and user ID to the answer_question method
        answer_text, _ = rag_system.answer_question(user_question, conversation_id=current_conversation_id, user_id=current_user_id)
        # For local testing, we print the answer directly
        print(f"\nAI: {answer_text}")