File size: 6,152 Bytes
df5a316
1c2a47a
 
 
df5a316
 
 
1c2a47a
df5a316
1c2a47a
f91e3a6
 
df5a316
414dfd0
 
 
 
15d9931
 
df5a316
6b2f911
1c2a47a
f91e3a6
 
6b2f911
15d9931
 
 
 
a2967ae
 
f91e3a6
 
6b2f911
a2967ae
 
414dfd0
15d9931
f91e3a6
 
 
414dfd0
 
f91e3a6
414dfd0
 
 
15d9931
f91e3a6
 
414dfd0
 
 
f91e3a6
 
 
15d9931
f91e3a6
15d9931
f91e3a6
6b2f911
f91e3a6
15d9931
 
6b2f911
a2967ae
f91e3a6
15d9931
1c2a47a
f91e3a6
6b2f911
 
15d9931
f91e3a6
 
 
15d9931
f91e3a6
15d9931
f91e3a6
 
 
15d9931
6b2f911
 
15d9931
 
414dfd0
 
 
 
 
d607228
f91e3a6
414dfd0
 
 
 
 
 
a2967ae
f91e3a6
414dfd0
 
a032c74
 
414dfd0
15d9931
a2967ae
0f574db
414dfd0
 
0f574db
a2967ae
0f574db
 
a2967ae
 
0f574db
f91e3a6
0f574db
 
f91e3a6
414dfd0
a032c74
f91e3a6
414dfd0
 
 
 
a2967ae
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import sys
# CRITICAL: This block must run before any import that can pull in sqlite3
# (for example chromadb). It registers pysqlite3 under the name 'sqlite3' in
# sys.modules, so later `import sqlite3` statements resolve to pysqlite3
# instead of the interpreter's bundled sqlite3 module.
try:
    import pysqlite3
    sys.modules['sqlite3'] = pysqlite3
    print("pysqlite3 successfully imported and set as default sqlite3 module.")
except ImportError:
    # Best-effort: the failure is only reported and startup continues with the
    # standard sqlite3 module.
    print("ERROR: pysqlite3-binary could not be imported. ChromaDB will likely fail due to old sqlite3 version.")
    # In a production environment, you might want to raise an exception here
    # to prevent the application from starting if this critical dependency fails.

import base64
import json
import os
from typing import Optional

import uvicorn
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

# Load environment variables from a .env file (after the pysqlite3 swap above).
# This runs before the project imports below so that values such as
# FIREBASE_CONFIG_BASE64 are already in the environment when those modules
# are imported (presumably src.config reads them at import time -- confirm).
load_dotenv() 

# Put the 'src' directory (next to this file) at the front of the import path.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), 'src')))

# Project imports. They are placed here, not at the top of the file, because
# they must run after load_dotenv() and the sys.path change above.
# - CHROMADB_* constants configure the vector store used by DocumentRAG.
# - DocumentRAG and embedding_model are needed to instantiate the RAG system.
# - initialize_firebase_client is called below to obtain the Firestore client.
from src.config import CHROMADB_PERSIST_DIRECTORY, CHROMADB_COLLECTION_NAME
from src.compassia import DocumentRAG, embedding_model, initialize_firebase_client # RAG components live in src/compassia


# --- Firebase Initialization (Global, once per process) ---
# Keep the value returned by initialize_firebase_client(): it is passed to
# DocumentRAG below and used by the startup indexing code. The code that
# follows treats a falsy value as "Firestore not available".
FIRESTORE_DB_INSTANCE = initialize_firebase_client()

# --- Initialize the RAG system globally ---
# Runs once, at module import time; the resulting object is shared by all requests.
print("--- FastAPI App Startup: Initializing RAG System ---")
rag_system = DocumentRAG(
    embedding_model=embedding_model,
    persist_directory=CHROMADB_PERSIST_DIRECTORY,
    collection_name=CHROMADB_COLLECTION_NAME,
    firestore_db_instance=FIRESTORE_DB_INSTANCE # CRITICAL: Pass the initialized Firestore instance here
)

# --- Index documents on startup ---
# Runs once at import time. Every document in the Firestore collection
# 'documents' that has a 'fileUrl' field is handed to rag_system.add_document.
# (Presumably add_document skips already-indexed or non-PDF files -- that
# logic lives in the RAG module, not here.)
#
# Fetching and indexing are handled separately on purpose:
#   * a failure while reading Firestore aborts indexing and is reported as a
#     Firestore/credentials problem;
#   * a failure while indexing one document is reported for that document
#     only, and the remaining documents are still indexed.
# In every case the app keeps starting; errors are only printed.
print("--- FastAPI App Startup: Indexing Documents from Firestore ---")
if FIRESTORE_DB_INSTANCE:
    documents_to_process = []
    try:
        for doc in FIRESTORE_DB_INSTANCE.collection('documents').stream():
            doc_data = doc.to_dict()
            if 'fileUrl' in doc_data:
                # Only the URL and an optional display name are needed downstream.
                documents_to_process.append(
                    {"url": doc_data['fileUrl'], "name": doc_data.get('name_en')}
                )
            else:
                print(f"Skipping document ID: {doc.id} - 'fileUrl' field missing.")
    except Exception as e:
        print(f"API Error: Error fetching documents from Firestore during startup: {e}")
        print("Please ensure your Firestore database is accessible and the service account key (FIREBASE_CONFIG_BASE64 secret) is correctly set in your Hugging Face Space secrets.")
        # Decide if app should crash or continue. For now, it will print error but continue.
    else:
        # Reached only when the Firestore listing completed without error.
        if documents_to_process:
            for doc_info in documents_to_process:
                doc_url = doc_info['url']
                try:
                    rag_system.add_document(doc_url, doc_info['name'])
                except Exception as e:
                    # One broken document must not prevent indexing the rest.
                    print(f"API Error: Failed to index document '{doc_url}' during startup: {e}")
        else:
            print("No documents with 'fileUrl' found in Firestore collection 'documents' to index.")
else:
    print("API Error: Firestore client not initialized. Cannot fetch documents from Firestore on startup.")
    print("Ensure FIREBASE_CONFIG_BASE64 secret is correctly set in your Hugging Face Space secrets.")


print("--- FastAPI App Startup: Document indexing complete ---")


# --- FastAPI Application Instance ---
# The ASGI application object; the route decorators below register on it.
# The keyword arguments are descriptive metadata passed straight to FastAPI.
app = FastAPI(
    version="0.1.0",
    title="CompassIA",
    description="Backend API for querying PDFs using DeepSeek (via OpenRouter) and BGE-M3 embeddings, with conversational memory and document indexing from Firestore.",
)

# Pydantic model for request body validation
class QueryRequest(BaseModel):
    """Request body accepted by the ``/compassia/`` endpoint."""

    # The question to answer.
    question: str
    # Identifier of the requesting user; required on every request.
    user_id: str
    # ID of an ongoing conversation. Declared Optional so that both omitting
    # the field and sending an explicit JSON null are valid; a bare `str`
    # annotation with a None default rejects null under Pydantic v2.
    conversation_id: Optional[str] = None

# --- API Endpoint Definition ---
@app.post("/compassia/")
async def compassia_endpoint(request: QueryRequest):
    """
    Answers a question about the indexed PDF documents using RAG, with conversational memory.
    Requires a user_id from the authenticated user.
    If `conversation_id` is not provided, a new one will be generated and returned in the response.
    """
    # NOTE(review): answer_question is called synchronously inside an async
    # handler. If it performs blocking I/O or heavy computation, it will hold
    # the event loop for the duration of the call -- confirm and, if so,
    # consider offloading it (only once rag_system is known to be thread-safe).
    try:
        # answer_question returns a tuple (answer_text, conversation_id); the
        # returned ID may differ from the one supplied by the client.
        answer_text, final_conversation_id = rag_system.answer_question(
            request.question,
            conversation_id=request.conversation_id,
            user_id=request.user_id,
        )
    except Exception as e:
        # Top-level boundary: report the failure and turn it into an HTTP 500.
        # `from e` keeps the original exception as the explicit cause.
        print(f"Error processing /compassia/ request: {e}")
        raise HTTPException(status_code=500, detail=f"Internal Server Error: {str(e)}") from e

    # Return both the answer and the (potentially new) conversation_id to the client.
    return {"answer": answer_text, "conversation_id": final_conversation_id}

# Basic health check endpoint
# Serves a fixed JSON message at "/" that points callers to /compassia/.
@app.get("/")
async def root():
    status_payload = {"message": "CompassIA API is running. Use /compassia/ for queries."}
    return status_payload