Spaces:
Runtime error
Runtime error
Commit ·
15d9931
1
Parent(s): d607228
Reorganized the project: -Documents hosted on Firestore db, -conversations saved
Browse files- .gitattributes +0 -1
- .gitignore +7 -1
- README.md +1 -1
- app.py +51 -34
- compassia.py +0 -385
- documents/Ogrenci_Liderligi_Burs_Programi_Sozlesme_Metni_2024-2025.pdf +0 -3
- documents/heracles_en.pdf +0 -3
- documents/heracles_tr.pdf +0 -3
- documents/ogrenci_katki_payi_ogrenim_ucretleri.pdf +0 -3
- documents/tmv-bursluluk-yonergesi.pdf +0 -3
- requirements.txt +3 -1
- src/compassia.py +464 -0
- src/config.py +59 -0
- src/pdf_processing.py +136 -0
- src/prompt.py +20 -0
.gitattributes
CHANGED
|
@@ -1,2 +1 @@
|
|
| 1 |
documents/*.pdf filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
chroma_db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
documents/*.pdf filter=lfs diff=lfs merge=lfs -text
|
|
|
.gitignore
CHANGED
|
@@ -6,4 +6,10 @@ all-libraries.txt
|
|
| 6 |
# Ignore ChromaDB persistent storage
|
| 7 |
chroma_db/
|
| 8 |
|
| 9 |
-
temp*
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
# Ignore ChromaDB persistent storage
|
| 7 |
chroma_db/
|
| 8 |
|
| 9 |
+
temp*
|
| 10 |
+
|
| 11 |
+
__pycache__/
|
| 12 |
+
|
| 13 |
+
atemp/
|
| 14 |
+
|
| 15 |
+
documents/
|
README.md
CHANGED
|
@@ -24,7 +24,7 @@ It uses:
|
|
| 24 |
## How to Use:
|
| 25 |
This Space exposes a `/compassia/` API endpoint. You can interact with it using `curl`, Postman, Insomnia, or by integrating it with your Next.js frontend.
|
| 26 |
|
| 27 |
-
### API Endpoint: `/
|
| 28 |
**Request Body (JSON):**
|
| 29 |
```json
|
| 30 |
{
|
|
|
|
| 24 |
## How to Use:
|
| 25 |
This Space exposes a `/compassia/` API endpoint. You can interact with it using `curl`, Postman, Insomnia, or by integrating it with your Next.js frontend.
|
| 26 |
|
| 27 |
+
### API Endpoint: `/compassia/` (POST request)
|
| 28 |
**Request Body (JSON):**
|
| 29 |
```json
|
| 30 |
{
|
app.py
CHANGED
|
@@ -1,67 +1,90 @@
|
|
| 1 |
-
# app.py
|
| 2 |
-
|
| 3 |
import os
|
| 4 |
from fastapi import FastAPI, HTTPException
|
| 5 |
from pydantic import BaseModel
|
| 6 |
import uvicorn
|
| 7 |
import sys
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
#
|
| 10 |
-
#
|
| 11 |
-
#
|
|
|
|
|
|
|
| 12 |
|
| 13 |
-
# Import your DocumentRAG class and other necessary components from your backend script
|
| 14 |
-
# Make sure your rag_backend.py has `embedding_model` defined globally or passed correctly
|
| 15 |
-
from compassia import DocumentRAG, embedding_model, pdf_document_paths, extract_text_from_pdf, ocr_pdf, chunk_text # Import all needed functions/variables
|
| 16 |
|
| 17 |
# --- Initialize the RAG system globally ---
|
| 18 |
-
# This ensures the model loads and indexing happens once when the FastAPI app starts
|
| 19 |
-
# and persists across requests within the same process.
|
| 20 |
-
# ChromaDB will save its data to the './chroma_db' directory within the Space.
|
| 21 |
print("--- FastAPI App Startup: Initializing RAG System ---")
|
| 22 |
rag_system = DocumentRAG(
|
| 23 |
embedding_model=embedding_model,
|
| 24 |
-
persist_directory=
|
| 25 |
-
collection_name=
|
| 26 |
-
chunk_size=700, # Match your existing chunk size
|
| 27 |
-
overlap=100 # Match your existing overlap
|
| 28 |
)
|
| 29 |
|
| 30 |
# --- Index documents on startup ---
|
| 31 |
# This loop will run when the FastAPI app first starts.
|
| 32 |
# It uses ChromaDB's persistence, so documents already indexed will be skipped.
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
print("--- FastAPI App Startup: Document indexing complete ---")
|
| 42 |
|
| 43 |
|
| 44 |
# --- FastAPI Application Instance ---
|
| 45 |
app = FastAPI(
|
| 46 |
title="CompassIA",
|
| 47 |
-
description="Backend API for querying PDFs using DeepSeek (via OpenRouter) and BGE-M3 embeddings.",
|
| 48 |
version="0.1.0",
|
| 49 |
)
|
| 50 |
|
| 51 |
# Pydantic model for request body validation
|
| 52 |
class QueryRequest(BaseModel):
|
| 53 |
question: str
|
|
|
|
| 54 |
|
| 55 |
# --- API Endpoint Definition ---
|
| 56 |
@app.post("/compassia/")
|
| 57 |
async def compassia_endpoint(request: QueryRequest):
|
| 58 |
"""
|
| 59 |
-
Answers a question about the indexed PDF documents using RAG.
|
| 60 |
"""
|
| 61 |
try:
|
| 62 |
-
# Pass
|
| 63 |
-
answer = rag_system.answer_question(request.question,
|
| 64 |
-
return {"answer": answer}
|
| 65 |
except Exception as e:
|
| 66 |
print(f"Error processing /compassia/ request: {e}")
|
| 67 |
raise HTTPException(status_code=500, detail=str(e))
|
|
@@ -70,9 +93,3 @@ async def compassia_endpoint(request: QueryRequest):
|
|
| 70 |
@app.get("/")
|
| 71 |
async def root():
|
| 72 |
return {"message": "CompassIA API is running. Use /compassia/ for queries."}
|
| 73 |
-
|
| 74 |
-
# You can run this locally for testing:
|
| 75 |
-
# if __name__ == "__main__":
|
| 76 |
-
# # This part runs locally if you execute app.py directly
|
| 77 |
-
# # For deployment, uvicorn is typically run via a command line.
|
| 78 |
-
# uvicorn.run(app, host="0.0.0.0", port=8000)
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
from fastapi import FastAPI, HTTPException
|
| 3 |
from pydantic import BaseModel
|
| 4 |
import uvicorn
|
| 5 |
import sys
|
| 6 |
+
import json
|
| 7 |
+
import base64
|
| 8 |
+
|
| 9 |
+
# Add the 'src' directory to the Python path
|
| 10 |
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), 'src')))
|
| 11 |
+
|
| 12 |
+
# Import components from the new modular structure, specifically from src.compassia
|
| 13 |
+
from src.config import CHROMADB_PERSIST_DIRECTORY, CHROMADB_COLLECTION_NAME # Removed PDF_DOCUMENT_PATHS as it's handled in rag_system
|
| 14 |
+
from src.compassia import DocumentRAG, embedding_model, initialize_firebase_client, db_firestore # Import db_firestore and related functions from compassia.py
|
| 15 |
|
| 16 |
+
# --- Firebase Initialization (Global, once per process) ---
|
| 17 |
+
# Initialize Firebase Admin SDK using a secret from Hugging Face Spaces
|
| 18 |
+
# This function is now called directly from app.py startup.
|
| 19 |
+
# db_firestore is already imported and will be set by initialize_firebase_client()
|
| 20 |
+
initialize_firebase_client()
|
| 21 |
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
# --- Initialize the RAG system globally ---
|
|
|
|
|
|
|
|
|
|
| 24 |
print("--- FastAPI App Startup: Initializing RAG System ---")
|
| 25 |
rag_system = DocumentRAG(
|
| 26 |
embedding_model=embedding_model,
|
| 27 |
+
persist_directory=CHROMADB_PERSIST_DIRECTORY,
|
| 28 |
+
collection_name=CHROMADB_COLLECTION_NAME
|
|
|
|
|
|
|
| 29 |
)
|
| 30 |
|
| 31 |
# --- Index documents on startup ---
|
| 32 |
# This loop will run when the FastAPI app first starts.
|
| 33 |
# It uses ChromaDB's persistence, so documents already indexed will be skipped.
|
| 34 |
+
# Now fetches document URLs directly from Firestore using db_firestore
|
| 35 |
+
print("--- FastAPI App Startup: Indexing Documents from Firestore ---")
|
| 36 |
+
if db_firestore:
|
| 37 |
+
try:
|
| 38 |
+
docs_ref = db_firestore.collection('documents').stream()
|
| 39 |
+
firestore_pdf_infos = []
|
| 40 |
+
for doc in docs_ref:
|
| 41 |
+
doc_data = doc.to_dict()
|
| 42 |
+
if 'fileUrl' in doc_data and doc_data['fileUrl'].endswith('.pdf'):
|
| 43 |
+
pdf_url = doc_data['fileUrl']
|
| 44 |
+
display_name = doc_data.get('name_en', os.path.basename(pdf_url))
|
| 45 |
+
firestore_pdf_infos.append({"url": pdf_url, "name": display_name})
|
| 46 |
+
print(f"Found PDF in Firestore: {display_name} ({pdf_url})")
|
| 47 |
+
|
| 48 |
+
if firestore_pdf_infos:
|
| 49 |
+
for pdf_info in firestore_pdf_infos:
|
| 50 |
+
rag_system.add_document(pdf_info['url'], pdf_info['name'])
|
| 51 |
+
else:
|
| 52 |
+
print("No PDF documents found in Firestore collection 'documents'.")
|
| 53 |
+
except Exception as e:
|
| 54 |
+
print(f"API Error: Error fetching documents from Firestore: {e}")
|
| 55 |
+
print("Please ensure your Firestore database is accessible and the service account key (FIREBASE_CONFIG_BASE64 secret) is correct.")
|
| 56 |
+
# If document fetching fails, consider if the app should still start or crash.
|
| 57 |
+
# For now, it will print the error but continue to try to start the API.
|
| 58 |
+
else:
|
| 59 |
+
print("API Error: Firestore not initialized. Cannot fetch documents from Firestore on startup.")
|
| 60 |
+
print("Ensure FIREBASE_CONFIG_BASE64 secret is correctly set.")
|
| 61 |
+
|
| 62 |
+
|
| 63 |
print("--- FastAPI App Startup: Document indexing complete ---")
|
| 64 |
|
| 65 |
|
| 66 |
# --- FastAPI Application Instance ---
|
| 67 |
app = FastAPI(
|
| 68 |
title="CompassIA",
|
| 69 |
+
description="Backend API for querying PDFs using DeepSeek (via OpenRouter) and BGE-M3 embeddings, with conversational memory.",
|
| 70 |
version="0.1.0",
|
| 71 |
)
|
| 72 |
|
| 73 |
# Pydantic model for request body validation
|
| 74 |
class QueryRequest(BaseModel):
|
| 75 |
question: str
|
| 76 |
+
conversation_id: str = None # Optional for new conversations
|
| 77 |
|
| 78 |
# --- API Endpoint Definition ---
|
| 79 |
@app.post("/compassia/")
|
| 80 |
async def compassia_endpoint(request: QueryRequest):
|
| 81 |
"""
|
| 82 |
+
Answers a question about the indexed PDF documents using RAG, with conversational memory.
|
| 83 |
"""
|
| 84 |
try:
|
| 85 |
+
# Pass conversation_id to the answer_question function
|
| 86 |
+
answer = rag_system.answer_question(request.question, conversation_id=request.conversation_id)
|
| 87 |
+
return {"answer": answer, "conversation_id": request.conversation_id}
|
| 88 |
except Exception as e:
|
| 89 |
print(f"Error processing /compassia/ request: {e}")
|
| 90 |
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
| 93 |
@app.get("/")
|
| 94 |
async def root():
|
| 95 |
return {"message": "CompassIA API is running. Use /compassia/ for queries."}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
compassia.py
DELETED
|
@@ -1,385 +0,0 @@
|
|
| 1 |
-
import sys
|
| 2 |
-
# IMPORTANT: These lines MUST be at the very top of compassia.py
|
| 3 |
-
# They ensure that any subsequent import of 'sqlite3' (even indirectly by chromadb)
|
| 4 |
-
# will use the version provided by pysqlite3-binary.
|
| 5 |
-
try:
|
| 6 |
-
import pysqlite3
|
| 7 |
-
sys.modules['sqlite3'] = pysqlite3
|
| 8 |
-
except ImportError:
|
| 9 |
-
pass # Fallback if pysqlite3 isn't available, but it should be in Docker
|
| 10 |
-
|
| 11 |
-
import requests
|
| 12 |
-
import os
|
| 13 |
-
import io
|
| 14 |
-
import re
|
| 15 |
-
import uuid # For generating unique IDs for ChromaDB
|
| 16 |
-
from PIL import Image
|
| 17 |
-
|
| 18 |
-
# For text extraction from PDFs (non-OCR)
|
| 19 |
-
from pdfminer.high_level import extract_text_to_fp
|
| 20 |
-
from pdfminer.layout import LAParams
|
| 21 |
-
|
| 22 |
-
# For image-based PDFs (OCR)
|
| 23 |
-
from pdf2image import convert_from_path
|
| 24 |
-
import pytesseract
|
| 25 |
-
|
| 26 |
-
# For embeddings and vector search
|
| 27 |
-
from FlagEmbedding import BGEM3FlagModel
|
| 28 |
-
import chromadb # pip install chromadb
|
| 29 |
-
|
| 30 |
-
# --- IMPORTANT: Configure Paths for Tesseract and Poppler ---
|
| 31 |
-
# If Tesseract is not in your system's PATH, uncomment and set this locally:
|
| 32 |
-
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
|
| 33 |
-
|
| 34 |
-
# If pdf2image gives errors about poppler, uncomment and set this locally:
|
| 35 |
-
# poppler_path = r'C:\path\to\poppler\bin'
|
| 36 |
-
|
| 37 |
-
# --- OpenRouter DeepSeek API Configuration ---
|
| 38 |
-
API_KEY = os.getenv("DEEPSEEK_R1_V3_API_KEY")
|
| 39 |
-
if API_KEY:
|
| 40 |
-
API_KEY = API_KEY.strip()
|
| 41 |
-
|
| 42 |
-
if not API_KEY:
|
| 43 |
-
raise ValueError("API key is not set. Please set the DEEPSEEK_R1_V3_API_KEY environment variable with your OpenRouter key.")
|
| 44 |
-
|
| 45 |
-
API_URL = 'https://openrouter.ai/api/v1/chat/completions'
|
| 46 |
-
HEADERS = {
|
| 47 |
-
'Authorization': f'Bearer {API_KEY}',
|
| 48 |
-
'Content-Type': 'application/json'
|
| 49 |
-
}
|
| 50 |
-
|
| 51 |
-
# --- Embedding Model Configuration (Local BGE-M3) ---
|
| 52 |
-
print("Loading FlagEmbedding (BGE-M3) model...")
|
| 53 |
-
try:
|
| 54 |
-
embedding_model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)
|
| 55 |
-
print("FlagEmbedding (BGE-M3) model loaded successfully.")
|
| 56 |
-
except Exception as e:
|
| 57 |
-
print(f"Error loading FlagEmbedding model: {e}")
|
| 58 |
-
print("Ensure you have resolved disk space issues for model download and have enough memory.")
|
| 59 |
-
print("You might need to adjust 'use_fp16' based on your hardware (e.g., False for CPU/older GPUs).")
|
| 60 |
-
exit(1)
|
| 61 |
-
|
| 62 |
-
# --- PDF Processing Functions ---
|
| 63 |
-
|
| 64 |
-
def extract_text_from_pdf(pdf_path: str) -> str:
|
| 65 |
-
"""
|
| 66 |
-
Extracts text from a PDF. Tries direct text extraction first.
|
| 67 |
-
If sparse text is found (suggesting image-based PDF), it performs OCR.
|
| 68 |
-
"""
|
| 69 |
-
print(f"Attempting direct text extraction from: {pdf_path}")
|
| 70 |
-
output_string = io.StringIO()
|
| 71 |
-
with open(pdf_path, 'rb') as fp:
|
| 72 |
-
try:
|
| 73 |
-
extract_text_to_fp(fp, output_string, laparams=LAParams())
|
| 74 |
-
text = output_string.getvalue()
|
| 75 |
-
if len(text.strip()) < 100 and os.path.getsize(pdf_path) > 10000:
|
| 76 |
-
print("Direct extraction yielded sparse text. Attempting OCR...")
|
| 77 |
-
return ocr_pdf(pdf_path)
|
| 78 |
-
return text
|
| 79 |
-
except Exception as e:
|
| 80 |
-
print(f"Direct PDF text extraction failed ({e}). Attempting OCR...")
|
| 81 |
-
return ocr_pdf(pdf_path)
|
| 82 |
-
|
| 83 |
-
def ocr_pdf(pdf_path: str) -> str:
|
| 84 |
-
"""
|
| 85 |
-
Performs OCR on a PDF file using pdf2image and pytesseract.
|
| 86 |
-
Requires Tesseract and Poppler to be installed and in system PATH.
|
| 87 |
-
"""
|
| 88 |
-
all_text = []
|
| 89 |
-
try:
|
| 90 |
-
images = convert_from_path(pdf_path, dpi=300)
|
| 91 |
-
|
| 92 |
-
print(f" Performing OCR on {len(images)} pages...")
|
| 93 |
-
for i, img in enumerate(images):
|
| 94 |
-
# Tesseract language packs:
|
| 95 |
-
# 'eng' for English, 'tur' for Turkish
|
| 96 |
-
# If you have scanned PDFs in Arabic or French, you MUST install
|
| 97 |
-
# 'tesseract-ocr-ara' and 'tesseract-ocr-fra' in your Dockerfile
|
| 98 |
-
# and change 'lang' to 'eng+tur+ara+fra'.
|
| 99 |
-
page_text = pytesseract.image_to_string(img, lang='eng+tur')
|
| 100 |
-
all_text.append(page_text)
|
| 101 |
-
print(f" Page {i+1} OCR complete.")
|
| 102 |
-
|
| 103 |
-
except Exception as e:
|
| 104 |
-
print(f"OCR process failed: {e}")
|
| 105 |
-
print("Please ensure Tesseract OCR and Poppler are correctly installed and their executables are in your system's PATH.")
|
| 106 |
-
return ""
|
| 107 |
-
|
| 108 |
-
return "\n".join(all_text)
|
| 109 |
-
|
| 110 |
-
def chunk_text(text: str, max_chunk_size: int = 700, overlap: int = 100) -> list[str]:
|
| 111 |
-
"""
|
| 112 |
-
Splits text into chunks of a maximum size with optional overlap.
|
| 113 |
-
Aims to split by paragraphs/sentences first, then by word.
|
| 114 |
-
"""
|
| 115 |
-
if not text:
|
| 116 |
-
return []
|
| 117 |
-
|
| 118 |
-
paragraphs = re.split(r'\n\s*\n', text)
|
| 119 |
-
chunks = []
|
| 120 |
-
current_chunk = []
|
| 121 |
-
current_chunk_len = 0
|
| 122 |
-
|
| 123 |
-
for para in paragraphs:
|
| 124 |
-
if not para.strip():
|
| 125 |
-
continue
|
| 126 |
-
|
| 127 |
-
if current_chunk_len + len(para) + len('\n\n') > max_chunk_size:
|
| 128 |
-
if current_chunk:
|
| 129 |
-
chunks.append("\n\n".join(current_chunk))
|
| 130 |
-
current_chunk = []
|
| 131 |
-
current_chunk_len = 0
|
| 132 |
-
|
| 133 |
-
if len(para) > max_chunk_size:
|
| 134 |
-
words = para.split(' ')
|
| 135 |
-
sub_chunk = []
|
| 136 |
-
sub_chunk_len = 0
|
| 137 |
-
for word in words:
|
| 138 |
-
if sub_chunk_len + len(word) + len(' ') > max_chunk_size:
|
| 139 |
-
chunks.append(" ".join(sub_chunk))
|
| 140 |
-
sub_chunk = [word]
|
| 141 |
-
sub_chunk_len = len(word)
|
| 142 |
-
else:
|
| 143 |
-
sub_chunk.append(word)
|
| 144 |
-
sub_chunk_len += len(word) + len(' ')
|
| 145 |
-
if sub_chunk:
|
| 146 |
-
chunks.append(" ".join(sub_chunk))
|
| 147 |
-
else:
|
| 148 |
-
current_chunk.append(para)
|
| 149 |
-
current_chunk_len += len(para) + len('\n\n')
|
| 150 |
-
else:
|
| 151 |
-
current_chunk.append(para)
|
| 152 |
-
current_chunk_len += len(para) + len('\n\n')
|
| 153 |
-
|
| 154 |
-
if current_chunk:
|
| 155 |
-
chunks.append("\n\n".join(current_chunk))
|
| 156 |
-
|
| 157 |
-
final_chunks_with_overlap = []
|
| 158 |
-
for i in range(len(chunks)):
|
| 159 |
-
chunk = chunks[i]
|
| 160 |
-
if i > 0 and overlap > 0:
|
| 161 |
-
prev_chunk_part = chunks[i-1][-overlap:]
|
| 162 |
-
chunk = prev_chunk_part + "\n" + chunk
|
| 163 |
-
final_chunks_with_overlap.append(chunk)
|
| 164 |
-
|
| 165 |
-
return final_chunks_with_overlap
|
| 166 |
-
|
| 167 |
-
# --- RAG Core Functions with ChromaDB ---
|
| 168 |
-
|
| 169 |
-
class DocumentRAG:
|
| 170 |
-
def __init__(self, embedding_model, persist_directory="./chroma_db", collection_name="pdf_docs", chunk_size=700, overlap=100):
|
| 171 |
-
self.embedding_model = embedding_model
|
| 172 |
-
self.chunk_size = chunk_size
|
| 173 |
-
self.overlap = overlap
|
| 174 |
-
self.persist_directory = persist_directory
|
| 175 |
-
self.collection_name = collection_name
|
| 176 |
-
|
| 177 |
-
print(f"Initializing ChromaDB at: {self.persist_directory}")
|
| 178 |
-
self.client = chromadb.PersistentClient(path=self.persist_directory)
|
| 179 |
-
|
| 180 |
-
self.collection = self.client.get_or_create_collection(
|
| 181 |
-
name=self.collection_name,
|
| 182 |
-
metadata={"hnsw:space": "cosine"}
|
| 183 |
-
)
|
| 184 |
-
print(f"ChromaDB collection '{self.collection_name}' ready.")
|
| 185 |
-
|
| 186 |
-
def _generate_chunk_id(self, pdf_path: str, chunk_idx: int) -> str:
|
| 187 |
-
return f"{os.path.basename(pdf_path)}_{chunk_idx}_{uuid.uuid4().hex}"
|
| 188 |
-
|
| 189 |
-
def add_document(self, pdf_path: str):
|
| 190 |
-
print(f"Adding document: {pdf_path}")
|
| 191 |
-
|
| 192 |
-
results = self.collection.get(
|
| 193 |
-
where={"source": pdf_path},
|
| 194 |
-
limit=1
|
| 195 |
-
)
|
| 196 |
-
if results and results['ids']:
|
| 197 |
-
print(f" Document '{pdf_path}' already in ChromaDB. Skipping re-indexing.")
|
| 198 |
-
return
|
| 199 |
-
|
| 200 |
-
extracted_text = extract_text_from_pdf(pdf_path)
|
| 201 |
-
if not extracted_text:
|
| 202 |
-
print(f"Warning: No text extracted from {pdf_path}. Skipping.")
|
| 203 |
-
return
|
| 204 |
-
|
| 205 |
-
chunks = chunk_text(extracted_text, self.chunk_size, self.overlap)
|
| 206 |
-
if not chunks:
|
| 207 |
-
print(f"Warning: No chunks generated for {pdf_path}. Skipping.")
|
| 208 |
-
return
|
| 209 |
-
|
| 210 |
-
documents_to_add = []
|
| 211 |
-
metadatas_to_add = []
|
| 212 |
-
ids_to_add = []
|
| 213 |
-
|
| 214 |
-
print(f" Generating embeddings for {len(chunks)} chunks and preparing for ChromaDB...")
|
| 215 |
-
|
| 216 |
-
encoded_results = self.embedding_model.encode(
|
| 217 |
-
chunks,
|
| 218 |
-
batch_size=32,
|
| 219 |
-
return_dense=True,
|
| 220 |
-
return_sparse=False,
|
| 221 |
-
return_colbert_vecs=False
|
| 222 |
-
)
|
| 223 |
-
|
| 224 |
-
chunk_embeddings = encoded_results["dense_vecs"]
|
| 225 |
-
|
| 226 |
-
for i, chunk in enumerate(chunks):
|
| 227 |
-
unique_id = self._generate_chunk_id(pdf_path, i)
|
| 228 |
-
documents_to_add.append(chunk)
|
| 229 |
-
metadatas_to_add.append({"source": pdf_path, "chunk_id": i})
|
| 230 |
-
ids_to_add.append(unique_id)
|
| 231 |
-
|
| 232 |
-
self.collection.add(
|
| 233 |
-
documents=documents_to_add,
|
| 234 |
-
embeddings=chunk_embeddings.tolist(),
|
| 235 |
-
metadatas=metadatas_to_add,
|
| 236 |
-
ids=ids_to_add
|
| 237 |
-
)
|
| 238 |
-
|
| 239 |
-
print(f" {len(documents_to_add)} chunks from '{pdf_path}' added to ChromaDB.")
|
| 240 |
-
print(f" Total chunks in collection: {self.collection.count()}")
|
| 241 |
-
|
| 242 |
-
def retrieve_context(self, query: str, top_k: int = 3) -> list[str]:
|
| 243 |
-
"""
|
| 244 |
-
Retrieves top_k most relevant document chunks for a given query from ChromaDB.
|
| 245 |
-
"""
|
| 246 |
-
if self.collection.count() == 0:
|
| 247 |
-
print("Error: No documents indexed in ChromaDB. Cannot retrieve context.")
|
| 248 |
-
return []
|
| 249 |
-
|
| 250 |
-
print(f"Retrieving context for query: '{query}'")
|
| 251 |
-
|
| 252 |
-
query_embedding_result = self.embedding_model.encode(
|
| 253 |
-
[query],
|
| 254 |
-
batch_size=1,
|
| 255 |
-
return_dense=True,
|
| 256 |
-
return_sparse=False,
|
| 257 |
-
return_colbert_vecs=False
|
| 258 |
-
)
|
| 259 |
-
query_embedding = query_embedding_result["dense_vecs"].tolist()
|
| 260 |
-
|
| 261 |
-
results = self.collection.query(
|
| 262 |
-
query_embeddings=query_embedding,
|
| 263 |
-
n_results=top_k,
|
| 264 |
-
include=['documents', 'distances', 'metadatas']
|
| 265 |
-
)
|
| 266 |
-
|
| 267 |
-
retrieved_chunks_texts = []
|
| 268 |
-
if results and results['documents']:
|
| 269 |
-
for i, doc_text in enumerate(results['documents'][0]):
|
| 270 |
-
source_info = results['metadatas'][0][i].get('source', 'Unknown Source')
|
| 271 |
-
chunk_id_info = results['metadatas'][0][i].get('chunk_id', 'N/A')
|
| 272 |
-
distance_info = results['distances'][0][i]
|
| 273 |
-
|
| 274 |
-
retrieved_chunks_texts.append(doc_text)
|
| 275 |
-
print(f" Retrieved chunk {i+1} (distance: {distance_info:.4f}) from '{source_info}' (chunk {chunk_id_info}).")
|
| 276 |
-
else:
|
| 277 |
-
print(" No relevant chunks found in ChromaDB.")
|
| 278 |
-
|
| 279 |
-
return retrieved_chunks_texts
|
| 280 |
-
|
| 281 |
-
def answer_question(self, question: str, pdf_paths: list[str]) -> str:
|
| 282 |
-
"""
|
| 283 |
-
Answers a question by ensuring PDFs are indexed, retrieving context,
|
| 284 |
-
and querying DeepSeek.
|
| 285 |
-
"""
|
| 286 |
-
for path in pdf_paths:
|
| 287 |
-
self.add_document(path)
|
| 288 |
-
|
| 289 |
-
context_chunks = self.retrieve_context(question)
|
| 290 |
-
context = "\n\n".join(context_chunks)
|
| 291 |
-
|
| 292 |
-
if not context:
|
| 293 |
-
print("Warning: No relevant context found. Answering based on general knowledge or indicating lack of information.")
|
| 294 |
-
context_prompt = ""
|
| 295 |
-
else:
|
| 296 |
-
context_prompt = f"Using the following context:\n\n{context}\n\n"
|
| 297 |
-
|
| 298 |
-
# --- UPDATED SYSTEM PROMPT FOR COMPASSIA AI ---
|
| 299 |
-
system_prompt = """
|
| 300 |
-
You are CompassIA, the intelligent assistant for MaarifCompass, committed to supporting Turkiye Maarif Foundation graduates residing in Turkiye.
|
| 301 |
-
|
| 302 |
-
Your core function is to deliver precise, document-backed information concerning their needs, primarily focusing on:
|
| 303 |
-
- University application procedures, requirements, tuition fees, and scholarship opportunities
|
| 304 |
-
- Accommodation and housing resources
|
| 305 |
-
- Career networking and professional development
|
| 306 |
-
- Relevant administrative and support services.
|
| 307 |
-
- Information related to Turkiye Maarif Foundation, Turkiye Scholarship and more.
|
| 308 |
-
|
| 309 |
-
You operate exclusively with data from a designated Document Center. **It is imperative that every piece of information you provide is directly sourced and verifiable from these internal documents.**
|
| 310 |
-
|
| 311 |
-
**Should a query fall outside the scope of the provided documents or lack a direct answer within them, you are required to politely inform the user that the specific information is not available in your current knowledge base, without offering any external insights or assumptions.**
|
| 312 |
-
|
| 313 |
-
Your answers should be highly accurate, directly relevant, easy to understand, and always prioritize the user's query based strictly on documented facts.
|
| 314 |
-
**Remember, you always answer the user with the language of the question.**
|
| 315 |
-
"""
|
| 316 |
-
|
| 317 |
-
messages = [
|
| 318 |
-
{"role": "system", "content": system_prompt},
|
| 319 |
-
{"role": "user", "content": f"{context_prompt}Question: {question}"}
|
| 320 |
-
]
|
| 321 |
-
|
| 322 |
-
print("\nSending request to DeepSeek API...")
|
| 323 |
-
data = {
|
| 324 |
-
"model": "deepseek/deepseek-chat:free",
|
| 325 |
-
"messages": messages,
|
| 326 |
-
"temperature": 0.5,
|
| 327 |
-
"max_tokens": 500,
|
| 328 |
-
}
|
| 329 |
-
|
| 330 |
-
response = requests.post(API_URL, json=data, headers=HEADERS)
|
| 331 |
-
|
| 332 |
-
if response.status_code == 200:
|
| 333 |
-
ai_response = response.json()
|
| 334 |
-
answer = ai_response['choices'][0]['message']['content']
|
| 335 |
-
print("\nDeepSeek Response:")
|
| 336 |
-
print(answer)
|
| 337 |
-
return answer
|
| 338 |
-
else:
|
| 339 |
-
error_message = f"Failed to fetch data from DeepSeek API. Status Code: {response.status_code}. Response: {response.text}"
|
| 340 |
-
print(error_message)
|
| 341 |
-
return f"Error: Could not get an answer from the AI. Details: {error_message}"
|
| 342 |
-
|
| 343 |
-
# --- Define PDF documents (MOVED TO GLOBAL SCOPE) ---
|
| 344 |
-
pdf_document_paths = [
|
| 345 |
-
"documents/heracles_tr.pdf",
|
| 346 |
-
"documents/heracles_en.pdf",
|
| 347 |
-
"documents/ogrenci_katki_payi_ogrenim_ucretleri.pdf",
|
| 348 |
-
"documents/Ogrenci_Liderligi_Burs_Programi_Sozlesme_Metni_2024-2025.pdf",
|
| 349 |
-
"documents/tmv-bursluluk-yonergesi.pdf"
|
| 350 |
-
]
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
# --- Main execution logic ---
|
| 354 |
-
if __name__ == "__main__":
|
| 355 |
-
rag_system = DocumentRAG(
|
| 356 |
-
embedding_model=embedding_model,
|
| 357 |
-
persist_directory="./chroma_db",
|
| 358 |
-
collection_name="pdf_documents_collection",
|
| 359 |
-
chunk_size=700,
|
| 360 |
-
overlap=100
|
| 361 |
-
)
|
| 362 |
-
"""
|
| 363 |
-
pdf_document_paths = [
|
| 364 |
-
"documents/heracles_tr.pdf",
|
| 365 |
-
"documents/heracles_en.pdf",
|
| 366 |
-
"documents/ogrenci_katki_payi_ogrenim_ucretleri.pdf",
|
| 367 |
-
"documents/Ogrenci_Liderligi_Burs_Programi_Sozlesme_Metni_2024-2025.pdf",
|
| 368 |
-
"documents/tmv-bursluluk-yonergesi.pdf"
|
| 369 |
-
]
|
| 370 |
-
"""
|
| 371 |
-
print("\n--- Indexing Documents ---")
|
| 372 |
-
for pdf_path in pdf_document_paths:
|
| 373 |
-
if os.path.exists(pdf_path):
|
| 374 |
-
rag_system.add_document(pdf_path)
|
| 375 |
-
else:
|
| 376 |
-
print(f"Error: PDF file not found at {pdf_path}. Please check the path.")
|
| 377 |
-
|
| 378 |
-
print("\n--- Chat With CompassIA (Type 'quit' to exit) ---")
|
| 379 |
-
while True:
|
| 380 |
-
user_question = input("\nHow can I help you? ")
|
| 381 |
-
if user_question.lower() == 'quit':
|
| 382 |
-
print("Exiting chat.")
|
| 383 |
-
break
|
| 384 |
-
|
| 385 |
-
rag_system.answer_question(user_question, pdf_document_paths)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
documents/Ogrenci_Liderligi_Burs_Programi_Sozlesme_Metni_2024-2025.pdf
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:fd1dc09026b3212985f74ff6c9f322f9c40883039911b034c84a879ddb1531bd
|
| 3 |
-
size 177096
|
|
|
|
|
|
|
|
|
|
|
|
documents/heracles_en.pdf
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:010a6ea71343b7d41f5b0a907f7487448e72c5dad1d37ab333e0b936d4bf5c4a
|
| 3 |
-
size 581483
|
|
|
|
|
|
|
|
|
|
|
|
documents/heracles_tr.pdf
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:e9879afb2856c8f8c5177ea1470607f5b4a45b49ffa6fd241b24e657a2f8e7a8
|
| 3 |
-
size 564377
|
|
|
|
|
|
|
|
|
|
|
|
documents/ogrenci_katki_payi_ogrenim_ucretleri.pdf
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:eb0d72f118f8cf4c8bf8c3212ced62de48c3808b4325ce473cbd1c4418594e8d
|
| 3 |
-
size 549275
|
|
|
|
|
|
|
|
|
|
|
|
documents/tmv-bursluluk-yonergesi.pdf
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:aa5e97a0f06393484000d42b9258ca68ecf823b777661064c44683dc7602963d
|
| 3 |
-
size 337112
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
|
@@ -10,9 +10,11 @@ faiss-cpu
|
|
| 10 |
chromadb
|
| 11 |
fastapi
|
| 12 |
uvicorn # For serving the FastAPI application
|
| 13 |
-
pysqlite3-binary
|
| 14 |
|
| 15 |
# System dependencies for Tesseract and Poppler on Linux
|
| 16 |
# Hugging Face Spaces uses apt-get for these
|
| 17 |
#apt_packages = python3-dev libtesseract-dev libleptonica-dev poppler-utils
|
| 18 |
|
|
|
|
|
|
|
|
|
| 10 |
chromadb
|
| 11 |
fastapi
|
| 12 |
uvicorn # For serving the FastAPI application
|
| 13 |
+
#pysqlite3-binary
|
| 14 |
|
| 15 |
# System dependencies for Tesseract and Poppler on Linux
|
| 16 |
# Hugging Face Spaces uses apt-get for these
|
| 17 |
#apt_packages = python3-dev libtesseract-dev libleptonica-dev poppler-utils
|
| 18 |
|
| 19 |
+
firebase-admin
|
| 20 |
+
firebase
|
src/compassia.py
ADDED
|
@@ -0,0 +1,464 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
# IMPORTANT: These lines MUST be at the very top of compassia.py
|
| 3 |
+
# They ensure that any subsequent import of 'sqlite3' (even indirectly by chromadb)
|
| 4 |
+
# will use the version provided by pysqlite3-binary.
|
| 5 |
+
"""
|
| 6 |
+
try:
|
| 7 |
+
import pysqlite3
|
| 8 |
+
sys.modules['sqlite3'] = pysqlite3
|
| 9 |
+
except ImportError:
|
| 10 |
+
# This should not happen in the Docker environment as pysqlite3-binary is in requirements.txt
|
| 11 |
+
print("Warning: pysqlite3 not found. Falling back to default sqlite3. ChromaDB might fail if it's too old.")
|
| 12 |
+
pass
|
| 13 |
+
"""
|
| 14 |
+
import requests
|
| 15 |
+
import os
|
| 16 |
+
import uuid
|
| 17 |
+
import json
|
| 18 |
+
import base64 # For decoding Firebase config
|
| 19 |
+
import hashlib # For hashing URLs for chunk IDs
|
| 20 |
+
import urllib.parse # For parsing URLs
|
| 21 |
+
import io # Import the io module for BytesIO
|
| 22 |
+
|
| 23 |
+
# Firebase Admin SDK for Firestore
|
| 24 |
+
import firebase_admin
|
| 25 |
+
from firebase_admin import credentials, firestore
|
| 26 |
+
|
| 27 |
+
# For embeddings and vector search
|
| 28 |
+
from FlagEmbedding import BGEM3FlagModel
|
| 29 |
+
import chromadb
|
| 30 |
+
|
| 31 |
+
# Import configurations and prompt from local modules
|
| 32 |
+
# These imports assume 'src' directory is on the Python path or script is run from 'src'
|
| 33 |
+
from config import (
|
| 34 |
+
DEEPSEEK_API_URL, DEEPSEEK_HEADERS,
|
| 35 |
+
EMBEDDING_MODEL_NAME, EMBEDDING_MODEL_USE_FP16,
|
| 36 |
+
CHROMADB_PERSIST_DIRECTORY, CHROMADB_COLLECTION_NAME,
|
| 37 |
+
CHUNK_SIZE, CHUNK_OVERLAP,
|
| 38 |
+
LLM_TEMPERATURE, LLM_MAX_TOKENS, LLM_HISTORY_MAX_TOKENS,
|
| 39 |
+
FIREBASE_CONFIG_BASE64
|
| 40 |
+
)
|
| 41 |
+
from pdf_processing import extract_text_from_pdf, chunk_text # Import functions from pdf_processing
|
| 42 |
+
from prompt import SYSTEM_PROMPT # Import the system prompt
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# --- Global Firebase Firestore Client ---
|
| 46 |
+
FIRESTORE_DATABASE = None
|
| 47 |
+
|
| 48 |
+
def initialize_firebase_client():
|
| 49 |
+
"""Initializes Firebase Admin SDK and returns the Firestore client."""
|
| 50 |
+
global FIRESTORE_DATABASE
|
| 51 |
+
# Check if Firebase Admin SDK is already initialized
|
| 52 |
+
if not firebase_admin._apps:
|
| 53 |
+
# Only attempt to initialize if FIREBASE_CONFIG_BASE64 is provided
|
| 54 |
+
if FIREBASE_CONFIG_BASE64:
|
| 55 |
+
try:
|
| 56 |
+
cred_json = base64.b64decode(FIREBASE_CONFIG_BASE64).decode('utf-8')
|
| 57 |
+
cred_dict = json.loads(cred_json)
|
| 58 |
+
cred = credentials.Certificate(cred_dict)
|
| 59 |
+
firebase_admin.initialize_app(cred)
|
| 60 |
+
print("Firebase Admin SDK initialized successfully.")
|
| 61 |
+
FIRESTORE_DATABASE = firestore.client()
|
| 62 |
+
print("Firestore client initialized successfully.")
|
| 63 |
+
return FIRESTORE_DATABASE
|
| 64 |
+
except Exception as e:
|
| 65 |
+
print(f"Error initializing Firebase Admin SDK: {e}")
|
| 66 |
+
print("Please ensure FIREBASE_CONFIG_BASE64 is correctly set and is a valid Base64-encoded Service Account JSON.")
|
| 67 |
+
FIRESTORE_DATABASE = None
|
| 68 |
+
return None
|
| 69 |
+
else:
|
| 70 |
+
print("Warning: FIREBASE_CONFIG_BASE64 environment variable not found. Firestore will not be available.")
|
| 71 |
+
FIRESTORE_DATABASE = None
|
| 72 |
+
return None
|
| 73 |
+
else: # Already initialized
|
| 74 |
+
print("Firebase Admin SDK already initialized.")
|
| 75 |
+
FIRESTORE_DATABASE = firestore.client() # Ensure global variable is set if already initialized
|
| 76 |
+
return FIRESTORE_DATABASE
|
| 77 |
+
|
| 78 |
+
# --- Embedding Model Initialization ---
|
| 79 |
+
print("Loading FlagEmbedding (BGE-M3) model...")
|
| 80 |
+
try:
|
| 81 |
+
embedding_model = BGEM3FlagModel(EMBEDDING_MODEL_NAME, use_fp16=EMBEDDING_MODEL_USE_FP16)
|
| 82 |
+
print("FlagEmbedding (BGE-M3) model loaded successfully.")
|
| 83 |
+
except Exception as e:
|
| 84 |
+
print(f"Error loading FlagEmbedding model: {e}")
|
| 85 |
+
print("Ensure disk space and memory are sufficient for model download.")
|
| 86 |
+
print("You might need to adjust 'use_fp16' based on your hardware (e.g., False for CPU/older GPUs).")
|
| 87 |
+
exit(1)
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
class DocumentRAG:
|
| 91 |
+
def __init__(self, embedding_model, persist_directory=CHROMADB_PERSIST_DIRECTORY, collection_name=CHROMADB_COLLECTION_NAME):
|
| 92 |
+
self.embedding_model = embedding_model
|
| 93 |
+
self.persist_directory = persist_directory
|
| 94 |
+
self.collection_name = collection_name
|
| 95 |
+
self.chunk_size = CHUNK_SIZE
|
| 96 |
+
self.overlap = CHUNK_OVERLAP
|
| 97 |
+
|
| 98 |
+
print(f"Initializing ChromaDB at: {self.persist_directory}")
|
| 99 |
+
self.client = chromadb.PersistentClient(path=self.persist_directory)
|
| 100 |
+
|
| 101 |
+
self.collection = self.client.get_or_create_collection(
|
| 102 |
+
name=self.collection_name,
|
| 103 |
+
metadata={"hnsw:space": "cosine"}
|
| 104 |
+
)
|
| 105 |
+
print(f"ChromaDB collection '{self.collection_name}' ready. Total chunks: {self.collection.count()}")
|
| 106 |
+
|
| 107 |
+
def _generate_chunk_id(self, pdf_url: str, chunk_idx: int) -> str:
|
| 108 |
+
"""Generates a unique ID for each chunk based on PDF URL and index."""
|
| 109 |
+
import hashlib
|
| 110 |
+
url_hash = hashlib.sha256(pdf_url.encode()).hexdigest()[:10]
|
| 111 |
+
return f"{url_hash}_{chunk_idx}_{uuid.uuid4().hex}"
|
| 112 |
+
|
| 113 |
+
def add_document(self, pdf_url: str, document_name: str = None):
|
| 114 |
+
"""
|
| 115 |
+
Adds a PDF document to the RAG system, processing and indexing its content.
|
| 116 |
+
Downloads the PDF from the URL.
|
| 117 |
+
"""
|
| 118 |
+
# Determine display name from parsed URL path if not provided
|
| 119 |
+
parsed_url_path = urllib.parse.urlparse(pdf_url).path
|
| 120 |
+
display_name = document_name if document_name else os.path.basename(parsed_url_path)
|
| 121 |
+
print(f"Adding document from URL: {pdf_url} (Display Name: {display_name})")
|
| 122 |
+
|
| 123 |
+
results = self.collection.get(
|
| 124 |
+
where={"source": pdf_url},
|
| 125 |
+
limit=1
|
| 126 |
+
)
|
| 127 |
+
if results and results['ids']:
|
| 128 |
+
print(f" Document '{display_name}' (from {pdf_url}) already in ChromaDB. Skipping re-indexing.")
|
| 129 |
+
return
|
| 130 |
+
|
| 131 |
+
try:
|
| 132 |
+
response = requests.get(pdf_url, stream=True)
|
| 133 |
+
print(f" DEBUG: HTTP Status Code for {pdf_url}: {response.status_code}") # NEW DEBUG PRINT
|
| 134 |
+
response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
|
| 135 |
+
|
| 136 |
+
pdf_data = io.BytesIO(response.content)
|
| 137 |
+
print(f" DEBUG: BytesIO content length for {pdf_url}: {pdf_data.getbuffer().nbytes} bytes") # NEW DEBUG PRINT
|
| 138 |
+
|
| 139 |
+
if pdf_data.getbuffer().nbytes == 0:
|
| 140 |
+
raise ValueError("Downloaded PDF content is empty.") # Raise error if content is empty
|
| 141 |
+
|
| 142 |
+
temp_pdf_path = f"/tmp/{uuid.uuid4().hex}.pdf"
|
| 143 |
+
# It's better to ensure the directory exists, although /tmp usually does
|
| 144 |
+
os.makedirs(os.path.dirname(temp_pdf_path), exist_ok=True) # Ensure /tmp directory exists
|
| 145 |
+
|
| 146 |
+
with open(temp_pdf_path, 'wb') as f:
|
| 147 |
+
f.write(pdf_data.getvalue())
|
| 148 |
+
print(f" DEBUG: Temporary PDF saved to: {temp_pdf_path}") # NEW DEBUG PRINT
|
| 149 |
+
|
| 150 |
+
extracted_text = extract_text_from_pdf(temp_pdf_path)
|
| 151 |
+
os.remove(temp_pdf_path) # Clean up the temporary file after extraction
|
| 152 |
+
|
| 153 |
+
except requests.exceptions.RequestException as e:
|
| 154 |
+
print(f"Error downloading PDF from {pdf_url}: {e}")
|
| 155 |
+
return
|
| 156 |
+
except ValueError as e: # Catch the new ValueError for empty content
|
| 157 |
+
print(f"Error processing downloaded PDF {pdf_url}: {e}")
|
| 158 |
+
return
|
| 159 |
+
except Exception as e: # Catch any other unexpected errors during file ops or extraction
|
| 160 |
+
print(f"Error processing downloaded PDF {pdf_url}: {e}")
|
| 161 |
+
return
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
if not extracted_text:
|
| 165 |
+
print(f"Warning: No text extracted from {display_name} ({pdf_url}). Skipping.")
|
| 166 |
+
return
|
| 167 |
+
|
| 168 |
+
chunks = chunk_text(extracted_text, self.chunk_size, self.overlap)
|
| 169 |
+
if not chunks:
|
| 170 |
+
print(f"Warning: No chunks generated for {display_name} ({pdf_url}). Skipping.")
|
| 171 |
+
return
|
| 172 |
+
|
| 173 |
+
documents_to_add = []
|
| 174 |
+
metadatas_to_add = []
|
| 175 |
+
ids_to_add = []
|
| 176 |
+
|
| 177 |
+
print(f" Generating embeddings for {len(chunks)} chunks and preparing for ChromaDB: {display_name}...")
|
| 178 |
+
|
| 179 |
+
encoded_results = self.embedding_model.encode(
|
| 180 |
+
chunks,
|
| 181 |
+
batch_size=32,
|
| 182 |
+
return_dense=True,
|
| 183 |
+
return_sparse=False,
|
| 184 |
+
return_colbert_vecs=False
|
| 185 |
+
)
|
| 186 |
+
chunk_embeddings = encoded_results["dense_vecs"]
|
| 187 |
+
|
| 188 |
+
for i, chunk in enumerate(chunks):
|
| 189 |
+
unique_id = self._generate_chunk_id(pdf_url, i)
|
| 190 |
+
documents_to_add.append(chunk)
|
| 191 |
+
metadatas_to_add.append({"source": pdf_url, "display_name": display_name, "chunk_id": i})
|
| 192 |
+
ids_to_add.append(unique_id)
|
| 193 |
+
|
| 194 |
+
self.collection.add(
|
| 195 |
+
documents=documents_to_add,
|
| 196 |
+
embeddings=chunk_embeddings.tolist(),
|
| 197 |
+
metadatas=metadatas_to_add,
|
| 198 |
+
ids=ids_to_add
|
| 199 |
+
)
|
| 200 |
+
|
| 201 |
+
print(f" {len(documents_to_add)} chunks from '{display_name}' added to ChromaDB.")
|
| 202 |
+
print(f" Total chunks in collection: {self.collection.count()}")
|
| 203 |
+
|
| 204 |
+
def retrieve_context(self, query: str, top_k: int = 3) -> list[dict]:
|
| 205 |
+
"""
|
| 206 |
+
Retrieves top_k most relevant document chunks for a given query from ChromaDB.
|
| 207 |
+
Returns a list of dictionaries, each containing 'text' and 'source' (URL or display name).
|
| 208 |
+
"""
|
| 209 |
+
if self.collection.count() == 0:
|
| 210 |
+
print("Error: No documents indexed in ChromaDB. Cannot retrieve context.")
|
| 211 |
+
return []
|
| 212 |
+
|
| 213 |
+
print(f"Retrieving context for query: '{query}'")
|
| 214 |
+
|
| 215 |
+
query_embedding_result = self.embedding_model.encode(
|
| 216 |
+
[query],
|
| 217 |
+
batch_size=1,
|
| 218 |
+
return_dense=True,
|
| 219 |
+
return_sparse=False,
|
| 220 |
+
return_colbert_vecs=False
|
| 221 |
+
)
|
| 222 |
+
query_embedding = query_embedding_result["dense_vecs"].tolist()
|
| 223 |
+
|
| 224 |
+
results = self.collection.query(
|
| 225 |
+
query_embeddings=query_embedding,
|
| 226 |
+
n_results=top_k,
|
| 227 |
+
include=['documents', 'distances', 'metadatas']
|
| 228 |
+
)
|
| 229 |
+
|
| 230 |
+
retrieved_chunks_info = []
|
| 231 |
+
if results and results['documents']:
|
| 232 |
+
for i, doc_text in enumerate(results['documents'][0]):
|
| 233 |
+
source_url = results['metadatas'][0][i].get('source', 'Unknown URL')
|
| 234 |
+
display_name = results['metadatas'][0][i].get('display_name', os.path.basename(urllib.parse.urlparse(source_url).path))
|
| 235 |
+
chunk_id_info = results['metadatas'][0][i].get('chunk_id', 'N/A')
|
| 236 |
+
distance_info = results['distances'][0][i]
|
| 237 |
+
|
| 238 |
+
retrieved_chunks_info.append({
|
| 239 |
+
"text": doc_text,
|
| 240 |
+
"source_url": source_url,
|
| 241 |
+
"display_name": display_name
|
| 242 |
+
})
|
| 243 |
+
print(f" Retrieved chunk {i+1} (distance: {distance_info:.4f}) from '{display_name}' (chunk {chunk_id_info}).")
|
| 244 |
+
else:
|
| 245 |
+
print(" No relevant chunks found in ChromaDB.")
|
| 246 |
+
|
| 247 |
+
return retrieved_chunks_info
|
| 248 |
+
|
| 249 |
+
def get_conversation_history(self, conversation_id: str) -> list[dict]:
|
| 250 |
+
"""Loads chat history from Firestore for a given conversation ID."""
|
| 251 |
+
if FIRESTORE_DATABASE is None:
|
| 252 |
+
print("Firestore not initialized. Cannot load conversation history.")
|
| 253 |
+
return []
|
| 254 |
+
|
| 255 |
+
doc_ref = FIRESTORE_DATABASE.collection('conversations').document(conversation_id)
|
| 256 |
+
doc = doc_ref.get()
|
| 257 |
+
if doc.exists:
|
| 258 |
+
history = doc.to_dict().get('messages', [])
|
| 259 |
+
print(f"Loaded history for {conversation_id}: {len(history)} messages.")
|
| 260 |
+
return history
|
| 261 |
+
print(f"No history found for conversation ID: {conversation_id}")
|
| 262 |
+
return []
|
| 263 |
+
|
| 264 |
+
def save_conversation_history(self, conversation_id: str, history: list[dict]):
|
| 265 |
+
"""Saves chat history to Firestore for a given conversation ID."""
|
| 266 |
+
if FIRESTORE_DATABASE is None:
|
| 267 |
+
print("Firestore not initialized. Cannot save conversation history.")
|
| 268 |
+
return
|
| 269 |
+
|
| 270 |
+
doc_ref = FIRESTORE_DATABASE.collection('conversations').document(conversation_id)
|
| 271 |
+
doc_ref.set({'messages': history})
|
| 272 |
+
print(f"Saved history for {conversation_id}: {len(history)} messages.")
|
| 273 |
+
|
| 274 |
+
def truncate_history(self, messages: list[dict], max_tokens: int = LLM_HISTORY_MAX_TOKENS) -> list[dict]:
|
| 275 |
+
"""
|
| 276 |
+
Truncates conversation history to fit within a max_tokens limit for the LLM.
|
| 277 |
+
This is a simplistic truncation and doesn't use a tokenizer for exact token count.
|
| 278 |
+
"""
|
| 279 |
+
current_len = sum(len(m['content']) for m in messages)
|
| 280 |
+
while current_len > max_tokens and len(messages) > 1: # Keep at least 1 message
|
| 281 |
+
if messages[0]['role'] == 'system':
|
| 282 |
+
if len(messages) >= 3:
|
| 283 |
+
removed_user_msg = messages.pop(1)
|
| 284 |
+
removed_ai_msg = messages.pop(1)
|
| 285 |
+
current_len -= (len(removed_user_msg['content']) + len(removed_ai_msg['content']))
|
| 286 |
+
else:
|
| 287 |
+
break
|
| 288 |
+
else:
|
| 289 |
+
removed_user_msg = messages.pop(0)
|
| 290 |
+
removed_ai_msg = messages.pop(0)
|
| 291 |
+
current_len -= (len(removed_user_msg['content']) + len(removed_ai_msg['content']))
|
| 292 |
+
return messages
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
def answer_question(self, question: str, conversation_id: str = None) -> str:
|
| 296 |
+
"""
|
| 297 |
+
Answers a question by retrieving context, and querying DeepSeek.
|
| 298 |
+
Now includes conversational memory and uses global configs.
|
| 299 |
+
"""
|
| 300 |
+
# Note: Document indexing is handled at FastAPI app startup for persistence.
|
| 301 |
+
|
| 302 |
+
# Get relevant context from ChromaDB
|
| 303 |
+
context_chunks_info = self.retrieve_context(question)
|
| 304 |
+
|
| 305 |
+
context_parts = []
|
| 306 |
+
citation_info = {}
|
| 307 |
+
|
| 308 |
+
for chunk_info in context_chunks_info:
|
| 309 |
+
context_parts.append(chunk_info["text"])
|
| 310 |
+
source_key = chunk_info.get("display_name", chunk_info["source_url"])
|
| 311 |
+
if source_key not in citation_info:
|
| 312 |
+
citation_info[source_key] = True
|
| 313 |
+
|
| 314 |
+
context = "\n\n".join(context_parts)
|
| 315 |
+
|
| 316 |
+
context_prompt = ""
|
| 317 |
+
if context:
|
| 318 |
+
context_prompt = f"Using the following context:\n\n{context}\n\n"
|
| 319 |
+
else:
|
| 320 |
+
print("Warning: No relevant context found. Answering based on general knowledge or indicating lack of information.")
|
| 321 |
+
|
| 322 |
+
# --- Handle Conversational Memory ---
|
| 323 |
+
messages = [{"role": "system", "content": SYSTEM_PROMPT}]
|
| 324 |
+
|
| 325 |
+
if conversation_id:
|
| 326 |
+
history = self.get_conversation_history(conversation_id)
|
| 327 |
+
if history:
|
| 328 |
+
messages.extend(history)
|
| 329 |
+
|
| 330 |
+
# Add current context and question
|
| 331 |
+
messages.append({"role": "user", "content": f"{context_prompt}Question: {question}"})
|
| 332 |
+
|
| 333 |
+
# Truncate conversation history if it's too long
|
| 334 |
+
messages = self.truncate_history(messages)
|
| 335 |
+
|
| 336 |
+
# Call DeepSeek API via OpenRouter
|
| 337 |
+
print("\nSending request to DeepSeek API...")
|
| 338 |
+
data = {
|
| 339 |
+
"model": "deepseek/deepseek-chat:free",
|
| 340 |
+
"messages": messages,
|
| 341 |
+
"temperature": LLM_TEMPERATURE,
|
| 342 |
+
"max_tokens": LLM_MAX_TOKENS,
|
| 343 |
+
}
|
| 344 |
+
|
| 345 |
+
response = requests.post(DEEPSEEK_API_URL, json=data, headers=DEEPSEEK_HEADERS)
|
| 346 |
+
|
| 347 |
+
if response.status_code == 200:
|
| 348 |
+
ai_response = response.json()
|
| 349 |
+
answer = ai_response['choices'][0]['message']['content']
|
| 350 |
+
print("\nDeepSeek Response:")
|
| 351 |
+
print(answer)
|
| 352 |
+
|
| 353 |
+
if citation_info:
|
| 354 |
+
citations_str = "\n\n**Sources:**\n" + "\n".join([f"- {name}" for name in citation_info.keys()])
|
| 355 |
+
answer += citations_str
|
| 356 |
+
|
| 357 |
+
if conversation_id:
|
| 358 |
+
messages.append({"role": "assistant", "content": answer})
|
| 359 |
+
self.save_conversation_history(conversation_id, messages)
|
| 360 |
+
|
| 361 |
+
return answer
|
| 362 |
+
else:
|
| 363 |
+
error_message = f"Failed to fetch data from DeepSeek API. Status Code: {response.status_code}. Response: {response.text}"
|
| 364 |
+
print(error_message)
|
| 365 |
+
return f"Error: Could not get an answer from the AI. Details: {error_message}"
|
| 366 |
+
|
| 367 |
+
# --- Main execution logic for local testing ---
|
| 368 |
+
if __name__ == "__main__":
|
| 369 |
+
from dotenv import load_dotenv # Import load_dotenv for local execution
|
| 370 |
+
# Load environment variables from .env.local in the project root
|
| 371 |
+
load_dotenv(dotenv_path=os.path.join(os.path.dirname(os.path.dirname(__file__)), '.env.local'))
|
| 372 |
+
|
| 373 |
+
# Retrieve FIREBASE_CONFIG_BASE64 after loading dotenv
|
| 374 |
+
from config import FIREBASE_CONFIG_BASE64
|
| 375 |
+
|
| 376 |
+
initialize_firebase_client()
|
| 377 |
+
|
| 378 |
+
rag_system = DocumentRAG(
|
| 379 |
+
embedding_model=embedding_model,
|
| 380 |
+
persist_directory=CHROMADB_PERSIST_DIRECTORY,
|
| 381 |
+
collection_name=CHROMADB_COLLECTION_NAME
|
| 382 |
+
)
|
| 383 |
+
|
| 384 |
+
print("\n--- Indexing Documents ---")
|
| 385 |
+
if FIRESTORE_DATABASE:
|
| 386 |
+
try:
|
| 387 |
+
docs_ref = FIRESTORE_DATABASE.collection('documents').stream()
|
| 388 |
+
firestore_pdf_infos = []
|
| 389 |
+
documents_processed_count = 0 # Track total documents found in Firestore
|
| 390 |
+
documents_skipped_non_pdf_count = 0 # Track documents skipped due to non-PDF URL
|
| 391 |
+
|
| 392 |
+
for doc in docs_ref:
|
| 393 |
+
documents_processed_count += 1
|
| 394 |
+
doc_data = doc.to_dict()
|
| 395 |
+
print(f" DEBUG: Processing document ID: {doc.id}, Data: {doc_data}")
|
| 396 |
+
|
| 397 |
+
if 'fileUrl' in doc_data:
|
| 398 |
+
pdf_url = doc_data['fileUrl']
|
| 399 |
+
print(f" DEBUG: Found 'fileUrl': {pdf_url}")
|
| 400 |
+
|
| 401 |
+
# Parse the URL to get the path part (without query parameters)
|
| 402 |
+
parsed_url = urllib.parse.urlparse(pdf_url)
|
| 403 |
+
file_path = parsed_url.path
|
| 404 |
+
|
| 405 |
+
# Extract the filename from the path and normalize it
|
| 406 |
+
file_name = os.path.basename(file_path)
|
| 407 |
+
|
| 408 |
+
# DEBUGGING: Print the extracted file_name and the result of the check
|
| 409 |
+
print(f" DEBUG: Extracted file_name: '{file_name}'")
|
| 410 |
+
is_pdf_check = isinstance(file_name, str) and file_name.strip().lower().endswith('.pdf')
|
| 411 |
+
print(f" DEBUG: is_pdf_check result: {is_pdf_check}")
|
| 412 |
+
|
| 413 |
+
if is_pdf_check:
|
| 414 |
+
display_name = doc_data.get('name_en', file_name) # Use file_name if name_en is missing
|
| 415 |
+
firestore_pdf_infos.append({"url": pdf_url, "name": display_name})
|
| 416 |
+
print(f"Found PDF in Firestore: {display_name} ({pdf_url}) - Qualified for indexing.")
|
| 417 |
+
else:
|
| 418 |
+
documents_skipped_non_pdf_count += 1
|
| 419 |
+
# Corrected debug print for non-PDFs to show the file_name being evaluated
|
| 420 |
+
print(f" DEBUG: Skipped: '{file_name}' (Type: {type(file_name)}) does not end with '.pdf' (case-insensitive, stripped). Original URL: '{pdf_url}'")
|
| 421 |
+
else:
|
| 422 |
+
documents_skipped_non_pdf_count += 1
|
| 423 |
+
print(f" DEBUG: Document ID: {doc.id} does not contain 'fileUrl'. Document data: {doc_data}")
|
| 424 |
+
|
| 425 |
+
if documents_processed_count == 0:
|
| 426 |
+
print("No documents found in Firestore collection 'documents' via stream(). Please check collection name and security rules.")
|
| 427 |
+
elif documents_processed_count > 0 and not firestore_pdf_infos:
|
| 428 |
+
print(f"Found {documents_processed_count} documents in Firestore, but none matched the '.pdf' criteria (all {documents_skipped_non_pdf_count} documents skipped).")
|
| 429 |
+
elif documents_skipped_non_pdf_count > 0:
|
| 430 |
+
print(f"Found {documents_processed_count} documents in Firestore. {len(firestore_pdf_infos)} PDFs qualified, {documents_skipped_non_pdf_count} documents skipped (non-PDF or missing fileUrl).")
|
| 431 |
+
|
| 432 |
+
|
| 433 |
+
if firestore_pdf_infos:
|
| 434 |
+
for pdf_info in firestore_pdf_infos:
|
| 435 |
+
rag_system.add_document(pdf_info['url'], pdf_info['name'])
|
| 436 |
+
else:
|
| 437 |
+
pass # Specific messages already printed above.
|
| 438 |
+
|
| 439 |
+
except Exception as e:
|
| 440 |
+
print(f"Error fetching documents from Firestore: {e}")
|
| 441 |
+
print("Please ensure your Firestore database is accessible and the service account key is correct.")
|
| 442 |
+
else:
|
| 443 |
+
print("Firestore client not initialized. Cannot fetch documents from Firestore.")
|
| 444 |
+
print("Using local PDF_DOCUMENT_PATHS as a fallback for testing purposes (ensure these files exist).")
|
| 445 |
+
# This import is moved here to avoid circular dependency if config imports rag_system
|
| 446 |
+
from .config import PDF_DOCUMENT_PATHS
|
| 447 |
+
for pdf_path in PDF_DOCUMENT_PATHS:
|
| 448 |
+
if os.path.exists(pdf_path):
|
| 449 |
+
rag_system.add_document(pdf_path)
|
| 450 |
+
else:
|
| 451 |
+
print(f"Error: Local PDF file not found at {pdf_path}. Skipping.")
|
| 452 |
+
|
| 453 |
+
|
| 454 |
+
print("\n--- Chat With CompassIA (Type 'q' to exit) ---")
|
| 455 |
+
current_conversation_id = str(uuid.uuid4())
|
| 456 |
+
print(f"Starting new local conversation with ID: {current_conversation_id}")
|
| 457 |
+
|
| 458 |
+
while True:
|
| 459 |
+
user_question = input("\nHow can I help you? ")
|
| 460 |
+
if user_question.lower() == 'q':
|
| 461 |
+
print("Exiting chat.")
|
| 462 |
+
break
|
| 463 |
+
|
| 464 |
+
rag_system.answer_question(user_question, conversation_id=current_conversation_id)
|
src/config.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
# --- OpenRouter DeepSeek API Configuration ---
|
| 4 |
+
# Your DeepSeek API key, fetched from environment variables.
|
| 5 |
+
# This should be set as a secret on Hugging Face Spaces.
|
| 6 |
+
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_R1_V3_API_KEY")
|
| 7 |
+
if DEEPSEEK_API_KEY:
|
| 8 |
+
DEEPSEEK_API_KEY = DEEPSEEK_API_KEY.strip()
|
| 9 |
+
|
| 10 |
+
# Base URL for the OpenRouter API.
|
| 11 |
+
DEEPSEEK_API_URL = 'https://openrouter.ai/api/v1/chat/completions'
|
| 12 |
+
|
| 13 |
+
# Headers required for OpenRouter API authentication.
|
| 14 |
+
DEEPSEEK_HEADERS = {
|
| 15 |
+
'Authorization': f'Bearer {DEEPSEEK_API_KEY}',
|
| 16 |
+
'Content-Type': 'application/json'
|
| 17 |
+
}
|
| 18 |
+
|
| 19 |
+
# --- Embedding Model Configuration ---
|
| 20 |
+
# Name of the Hugging Face model for embeddings.
|
| 21 |
+
EMBEDDING_MODEL_NAME = 'BAAI/bge-m3'
|
| 22 |
+
# Use float16 for reduced memory usage if supported by hardware (e.g., GPU).
|
| 23 |
+
# Set to False if encountering issues on CPU.
|
| 24 |
+
EMBEDDING_MODEL_USE_FP16 = True
|
| 25 |
+
|
| 26 |
+
# --- ChromaDB Configuration ---
|
| 27 |
+
# Directory where ChromaDB will persist its database files.
|
| 28 |
+
# This should be relative to your application's working directory.
|
| 29 |
+
CHROMADB_PERSIST_DIRECTORY = "./chroma_db"
|
| 30 |
+
# Name of the collection within ChromaDB where document chunks will be stored.
|
| 31 |
+
CHROMADB_COLLECTION_NAME = "pdf_documents_collection"
|
| 32 |
+
|
| 33 |
+
# --- Document Chunking Configuration ---
|
| 34 |
+
# Maximum size of text chunks for embedding and retrieval.
|
| 35 |
+
CHUNK_SIZE = 700
|
| 36 |
+
# Overlap between consecutive chunks to maintain context.
|
| 37 |
+
CHUNK_OVERLAP = 100
|
| 38 |
+
|
| 39 |
+
# --- LLM Response Parameters ---
|
| 40 |
+
# Temperature for the DeepSeek model. Lower values make output more deterministic.
|
| 41 |
+
LLM_TEMPERATURE = 0.5
|
| 42 |
+
# Maximum number of tokens the LLM can generate in a response.
|
| 43 |
+
LLM_MAX_TOKENS = 500
|
| 44 |
+
# Max tokens for conversation history truncation (approximate, not exact token count)
|
| 45 |
+
LLM_HISTORY_MAX_TOKENS = 3000
|
| 46 |
+
|
| 47 |
+
# --- Tesseract and Poppler Configuration (Docker/Deployment Specific) ---
|
| 48 |
+
# Environment variables set in Dockerfile for Tesseract.
|
| 49 |
+
TESSDATA_PREFIX = os.getenv("TESSDATA_PREFIX", "/usr/share/tesseract-ocr/4.00/tessdata")
|
| 50 |
+
TESSERACT_CMD = os.getenv("TESSERACT_CMD", "/usr/bin/tesseract")
|
| 51 |
+
# Path to Poppler's bin directory if not in system PATH (mostly for local Windows setup).
|
| 52 |
+
POPPLER_PATH = None # e.g., r'C:\path\to\poppler\bin'
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
# --- Firebase Configuration (for Conversational Memory) ---
|
| 57 |
+
# Base64 encoded JSON string of your Firebase Service Account Key.
|
| 58 |
+
# This should be set as a secret on Hugging Face Spaces.
|
| 59 |
+
FIREBASE_CONFIG_BASE64 = os.getenv("FIREBASE_CONFIG_BASE64")
|
src/pdf_processing.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import io
|
| 3 |
+
import re
|
| 4 |
+
from PIL import Image
|
| 5 |
+
|
| 6 |
+
# For text extraction from PDFs (non-OCR)
|
| 7 |
+
from pdfminer.high_level import extract_text_to_fp
|
| 8 |
+
from pdfminer.layout import LAParams
|
| 9 |
+
|
| 10 |
+
# For image-based PDFs (OCR)
|
| 11 |
+
from pdf2image import convert_from_path
|
| 12 |
+
import pytesseract
|
| 13 |
+
|
| 14 |
+
# Import Tesseract configuration from config.py
|
| 15 |
+
from config import TESSDATA_PREFIX, TESSERACT_CMD, POPPLER_PATH
|
| 16 |
+
|
| 17 |
+
# Set Tesseract command explicitly (uses ENV from Dockerfile or default)
|
| 18 |
+
pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
|
| 19 |
+
if POPPLER_PATH:
|
| 20 |
+
# This setting is usually only needed for local Windows development
|
| 21 |
+
# where Poppler isn't in system PATH.
|
| 22 |
+
# In Docker, Poppler should be in PATH via apt-get install.
|
| 23 |
+
pass # No direct setting in pdf2image, but convert_from_path can accept poppler_path
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def extract_text_from_pdf(pdf_path: str) -> str:
|
| 27 |
+
"""
|
| 28 |
+
Extracts text from a PDF. Tries direct text extraction first.
|
| 29 |
+
If sparse text is found (suggesting image-based PDF), it performs OCR.
|
| 30 |
+
"""
|
| 31 |
+
print(f"Attempting direct text extraction from: {pdf_path}")
|
| 32 |
+
output_string = io.StringIO()
|
| 33 |
+
with open(pdf_path, 'rb') as fp:
|
| 34 |
+
try:
|
| 35 |
+
extract_text_to_fp(fp, output_string, laparams=LAParams())
|
| 36 |
+
text = output_string.getvalue()
|
| 37 |
+
# If text is very short for a non-empty PDF, it might be image-based.
|
| 38 |
+
# Using a threshold of 100 characters for extracted text and file size > 10KB.
|
| 39 |
+
if len(text.strip()) < 100 and os.path.getsize(pdf_path) > 10000:
|
| 40 |
+
print("Direct extraction yielded sparse text. Attempting OCR...")
|
| 41 |
+
return ocr_pdf(pdf_path)
|
| 42 |
+
return text
|
| 43 |
+
except Exception as e:
|
| 44 |
+
print(f"Direct PDF text extraction failed ({e}). Attempting OCR...")
|
| 45 |
+
return ocr_pdf(pdf_path)
|
| 46 |
+
|
| 47 |
+
def ocr_pdf(pdf_path: str) -> str:
    """
    Perform OCR on a PDF file using pdf2image and pytesseract.

    Requires the Tesseract and Poppler binaries to be installed and either
    on the system PATH or located via the config.py settings.

    Args:
        pdf_path: Filesystem path to the PDF file.

    Returns:
        The OCR'd text of all pages joined with newlines, or "" on failure.
    """
    all_text = []
    try:
        # Convert PDF pages to images; 300 DPI gives Tesseract enough detail.
        # Fix: actually honor POPPLER_PATH (previously imported but unused).
        # It is only set for local (e.g. Windows) development where Poppler
        # is not on PATH; in Docker it is empty and PATH resolution is used.
        images = convert_from_path(pdf_path, dpi=300,
                                   poppler_path=POPPLER_PATH or None)

        print(f"  Performing OCR on {len(images)} pages...")
        for i, img in enumerate(images):
            # Language packs for eng/tur/ara/fra must be installed
            # (Dockerfile: tesseract-ocr-tur, tesseract-ocr-ara, tesseract-ocr-fra).
            page_text = pytesseract.image_to_string(img, lang='eng+tur+ara+fra')
            all_text.append(page_text)
            print(f"    Page {i+1} OCR complete.")

    except Exception as e:
        print(f"OCR process failed: {e}")
        print("Please ensure Tesseract OCR and Poppler are correctly installed and their executables are in your system's PATH.")
        return ""

    return "\n".join(all_text)
|
| 74 |
+
def chunk_text(text: str, max_chunk_size: int = 700, overlap: int = 100) -> list[str]:
    """
    Split text into chunks no larger than max_chunk_size, with optional overlap.

    Splits on blank-line paragraph boundaries first; a paragraph longer than
    max_chunk_size is further split on spaces. Each chunk after the first is
    prefixed with the last `overlap` characters of the previous (pre-overlap)
    chunk, separated by a newline.

    Note: a single word longer than max_chunk_size still becomes its own
    oversized chunk -- it is never split mid-word.

    Args:
        text: The text to split. Empty/None yields [].
        max_chunk_size: Target maximum chunk length in characters.
        overlap: Characters of the previous chunk to prepend to each chunk.

    Returns:
        List of text chunks.
    """
    if not text:
        return []

    paragraphs = re.split(r'\n\s*\n', text)
    chunks = []
    current_chunk = []
    current_chunk_len = 0

    for para in paragraphs:
        if not para.strip():
            continue

        # Would adding this paragraph (plus a "\n\n" separator) overflow?
        if current_chunk_len + len(para) + 2 > max_chunk_size:
            if current_chunk:  # flush the chunk accumulated so far
                chunks.append("\n\n".join(current_chunk))
                current_chunk = []
                current_chunk_len = 0

            if len(para) > max_chunk_size:
                # Paragraph alone exceeds the limit: split it on spaces.
                sub_chunk = []
                sub_chunk_len = 0
                for word in para.split(' '):
                    if sub_chunk_len + len(word) + 1 > max_chunk_size:
                        # Fix: guard against emitting an empty chunk when the
                        # very first word already exceeds max_chunk_size
                        # (previously produced "" via " ".join([])).
                        if sub_chunk:
                            chunks.append(" ".join(sub_chunk))
                        sub_chunk = [word]
                        sub_chunk_len = len(word)
                    else:
                        sub_chunk.append(word)
                        sub_chunk_len += len(word) + 1
                if sub_chunk:  # flush the remaining words
                    chunks.append(" ".join(sub_chunk))
            else:
                # Paragraph fits on its own; start a new chunk with it.
                current_chunk.append(para)
                current_chunk_len += len(para) + 2
        else:
            # Paragraph fits into the current chunk.
            current_chunk.append(para)
            current_chunk_len += len(para) + 2

    if current_chunk:  # flush any trailing text
        chunks.append("\n\n".join(current_chunk))

    # Simplistic overlap: prepend the tail of the previous *pre-overlap*
    # chunk so adjacent chunks share context for retrieval.
    final_chunks_with_overlap = []
    for i in range(len(chunks)):
        chunk = chunks[i]
        if i > 0 and overlap > 0:
            prev_chunk_part = chunks[i - 1][-overlap:]
            chunk = prev_chunk_part + "\n" + chunk
        final_chunks_with_overlap.append(chunk)

    return final_chunks_with_overlap
src/prompt.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# System prompt for the CompassIA assistant. NOTE(review): presumably
# prepended to every LLM request by compassia.py -- confirm against caller.
# The string content below is runtime behavior; do not edit casually.
SYSTEM_PROMPT = """
You are CompassIA, the intelligent assistant for MaarifCompass, committed to supporting Turkiye Maarif Foundation graduates residing in Turkiye.

Your core function is to deliver precise, document-backed information concerning their needs, primarily focusing on:
- University application procedures, requirements, tuition fees, and scholarship opportunities
- Accommodation and housing resources
- Career networking and professional development
- Relevant administrative and support services
- Information related to Turkiye Maarif Foundation, Turkiye Scholarship, and more.

You operate exclusively with data from a designated Document Center. **All information you provide must be directly sourced and verifiable from these internal documents.**

**If a query falls outside the scope of the provided documents or lacks a direct answer within them, you must state that the information is not available in your current knowledge base. Do not offer external insights, assumptions, or speculate. This is a critical constraint.**

Always cite the specific document name and page number for every piece of information provided. If information is aggregated from multiple sources, cite all relevant documents and page numbers.

Your answers must be highly accurate, directly relevant, and easy to understand. Prioritize the user's query strictly based on documented facts.

**Remember, always respond to the user in the language of their question.**
"""