Spaces:
Sleeping
Sleeping
Commit ·
e5b884f
1
Parent(s): f60655e
code push
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitignore +17 -0
- .python-version +1 -0
- Best.md +2 -0
- Dockerfile +30 -0
- app/__init__.py +1 -0
- app/api/__init__.py +0 -0
- app/api/deps.py +0 -0
- app/api/v1/__init__.py +0 -0
- app/api/v1/routes.py +255 -0
- app/config/__init__.py +1 -0
- app/config/config.py +37 -0
- app/config/config.yaml +20 -0
- app/core/__init__.py +0 -0
- app/core/session_manager.py +198 -0
- app/data/__init__.py +0 -0
- app/database/database.py +177 -0
- app/database/sessions.db +0 -0
- app/embedding/__init__.py +1 -0
- app/embedding/embeder.py +12 -0
- app/embedding/vectore_store.py +40 -0
- app/ingestion/__init__.py +1 -0
- app/ingestion/file_loader.py +80 -0
- app/ingestion/text_splitter.py +99 -0
- app/main.py +0 -0
- app/metadata_extraction/__init__.py +0 -0
- app/metadata_extraction/metadata_ext.py +114 -0
- app/prompts/__init__.py +0 -0
- app/prompts/prompts.py +7 -0
- app/reseasoning/__init__.py +1 -0
- app/reseasoning/descision_maker.py +55 -0
- app/reseasoning/query_parser.py +18 -0
- app/retrieval/__init__.py +1 -0
- app/retrieval/reranker.py +0 -0
- app/retrieval/retriever.py +55 -0
- app/schemas/__init__.py +1 -0
- app/schemas/metadata_schema.py +75 -0
- app/schemas/request_models.py +77 -0
- app/schemas/response_models.py +32 -0
- app/services/RAG_service.py +165 -0
- app/services/__init__.py +0 -0
- app/utils/__init__.py +1 -0
- app/utils/config_loader.py +8 -0
- app/utils/document_op.py +14 -0
- app/utils/embedding_manager.py +0 -0
- app/utils/logger.py +0 -0
- app/utils/metadata_utils.py +51 -0
- app/utils/model_loader.py +83 -0
- experiments.ipynb +0 -0
- main.py +76 -0
- pyproject.toml +191 -0
.gitignore
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python-generated files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[oc]
|
| 4 |
+
build/
|
| 5 |
+
dist/
|
| 6 |
+
wheels/
|
| 7 |
+
*.egg-info
|
| 8 |
+
|
| 9 |
+
# Virtual environments
|
| 10 |
+
.venv
|
| 11 |
+
.env
|
| 12 |
+
env
|
| 13 |
+
.env/
|
| 14 |
+
hackrx_rag_app
|
| 15 |
+
hackrx_rag_app/
|
| 16 |
+
.hackrx_rag_app
|
| 17 |
+
hackrx_rag_app/
|
.python-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
3.12
|
Best.md
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## Text splitting techniques
|
| 2 |
+
1.
|
Dockerfile
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.12-slim

# Set working directory
WORKDIR /app

# Bug fix: the HEALTHCHECK below invokes curl, but python:3.12-slim does not
# ship curl, so the check would always fail and mark the container unhealthy.
RUN apt-get update \
    && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for better caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the entire application
COPY . .

# Create necessary directories
RUN mkdir -p app/uploads app/data

# Set environment variables
ENV PYTHONPATH=/app
ENV PORT=7860

# Expose the port that Hugging Face Spaces expects
EXPOSE 7860

# Health check
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:7860/health || exit 1

# Command to run the FastAPI application
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
app/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# This file is automatically created to mark the directory as a package.
|
app/api/__init__.py
ADDED
|
File without changes
|
app/api/deps.py
ADDED
|
File without changes
|
app/api/v1/__init__.py
ADDED
|
File without changes
|
app/api/v1/routes.py
ADDED
|
@@ -0,0 +1,255 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter, HTTPException, Depends, UploadFile, File, Form
|
| 2 |
+
from fastapi.responses import JSONResponse
|
| 3 |
+
from typing import Optional
|
| 4 |
+
import os
|
| 5 |
+
import tempfile
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from app.core.session_manager import SessionManager, Session, session_manager
|
| 8 |
+
from app.services.RAG_service import RAGService
|
| 9 |
+
from app.schemas.request_models import QueryRequest
|
| 10 |
+
from app.schemas.response_models import SessionResponse, QueryResponse, UploadResponse, SourceDocument
|
| 11 |
+
from app.config.config import get_settings
|
| 12 |
+
|
| 13 |
+
router = APIRouter()
|
| 14 |
+
|
| 15 |
+
def get_session(session_id: str) -> Session:
    """FastAPI dependency: look up a session by id and validate it.

    Args:
        session_id: The session identifier from the request path.

    Returns:
        The live `Session` object.

    Raises:
        HTTPException: 404 when the session does not exist or has expired.
    """
    print(f"[DEBUG] Looking for session: {session_id}")
    print(f"[DEBUG] Available sessions: {list(session_manager.sessions.keys())}")
    session = session_manager.get_session(session_id=session_id)
    if not session:
        print(f"[DEBUG] Session not found: {session_id}")
        raise HTTPException(status_code=404, detail="Session not found or expired")
    print(f"[DEBUG] Session found: {session_id}")
    return session
|
| 26 |
+
|
| 27 |
+
@router.post("/session", response_model=SessionResponse)
async def create_session(username: str = "anonymous"):
    """Create a new session for document processing."""
    new_id = session_manager.create_session(username)
    print(f"[DEBUG] Created session: {new_id} for user: {username}")
    print(f"[DEBUG] Total sessions now: {len(session_manager.sessions)}")
    return SessionResponse(session_id=new_id, message="Session created successfully")
|
| 37 |
+
|
| 38 |
+
@router.get("/sessions/{username}")
async def get_user_sessions(username: str):
    """Return every stored session belonging to the given user."""
    return {"sessions": session_manager.get_user_sessions(username)}
|
| 43 |
+
|
| 44 |
+
@router.post("/session/{session_id}/restore")
async def restore_session(session_id: str):
    """Restore a previously persisted session from the database."""
    if not session_manager.restore_session(session_id):
        raise HTTPException(status_code=404, detail="Session not found or inactive")
    return {"message": "Session restored successfully"}
|
| 52 |
+
|
| 53 |
+
@router.delete("/session/{session_id}")
async def delete_session(session_id: str):
    """Deactivate a session in the database and drop it from memory."""
    session_manager.delete_session(session_id)
    return {"message": "Session deleted successfully"}
|
| 58 |
+
|
| 59 |
+
def _cleanup_temp_file(path) -> None:
    """Best-effort removal of a temporary upload file; ignores filesystem errors."""
    if path:
        try:
            os.unlink(path)
        except OSError:
            pass


@router.post("/upload/{session_id}", response_model=UploadResponse)
async def upload_document(
    session_id: str,
    file: UploadFile = File(None),
    url: str = Form(None),
    doc_type: str = Form(None),
    session: Session = Depends(get_session)
):
    """Upload and process a document from an uploaded file or a remote URL.

    Exactly one of ``file`` / ``url`` must be supplied. The document is
    loaded, split into chunks, and indexed into the session's vector store.

    Raises:
        HTTPException: 400 for bad input (missing/both sources, disallowed
            file type, oversized file); 500 when the processing pipeline fails.
    """
    settings = get_settings()

    # Validate input - exactly one of file / URL must be provided
    if not file and not url:
        raise HTTPException(status_code=400, detail="Either file or URL must be provided")

    if file and url:
        raise HTTPException(status_code=400, detail="Provide either file or URL, not both")

    # Defined up front so the cleanup paths never hit an unbound name
    # (the original used a fragile `'document_path' in locals()` check).
    document_path = None
    content = b""
    try:
        # Initialize RAG service for this session
        session.rag_service = RAGService()

        document_name = ""
        document_url = None

        if file:
            # Handle file upload
            file_extension = Path(file.filename).suffix.lower()
            if file_extension not in settings.allowed_file_types:
                raise HTTPException(
                    status_code=400,
                    detail=f"File type {file_extension} not allowed. Allowed types: {settings.allowed_file_types}"
                )

            # Auto-detect document type if not provided
            if not doc_type:
                doc_type = "pdf" if file_extension == ".pdf" else "word"

            # Validate file size
            content = await file.read()
            if len(content) > settings.max_file_size:
                raise HTTPException(
                    status_code=400,
                    detail=f"File size too large. Maximum size: {settings.max_file_size} bytes"
                )

            # Create upload directory
            upload_dir = Path(settings.upload_dir)
            upload_dir.mkdir(exist_ok=True)

            # Save file temporarily
            with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as tmp_file:
                tmp_file.write(content)
                tmp_file_path = tmp_file.name

            document_name = file.filename
            document_path = tmp_file_path

            # Load and split document from file
            session.rag_service.load_and_split_document(
                type=doc_type,
                path=tmp_file_path
            )
        else:
            # Handle URL upload
            if not doc_type:
                doc_type = "pdf"  # Default for URLs

            document_name = url.split('/')[-1] or "URL Document"
            document_url = url

            # Load and split document from URL
            session.rag_service.load_and_split_document(
                type=doc_type,
                url=url
            )

        # Create vector store
        session.rag_service.create_vector_store()

        # Update session state
        session.document_uploaded = True
        session.vector_store_created = True
        session.document_info = {
            "filename": document_name,
            "type": doc_type,
            "size": len(content) if file else 0,
            "chunks_count": len(session.rag_service.chunks)
        }

        # Update session in database.
        # NOTE(review): document_path points at a temp file that is deleted
        # just below, so the persisted path is stale for file uploads —
        # confirm whether restore actually needs it.
        session_manager.update_session_document(
            session_id=session_id,
            document_name=document_name,
            document_type=doc_type,
            chunks_count=len(session.rag_service.chunks),
            pinecone_index=str(session.rag_service.index),
            pinecone_namespace=session.rag_service.namespace,
            document_path=document_path,
            document_url=document_url
        )

        # Clean up temporary file if it exists
        _cleanup_temp_file(document_path)

        return UploadResponse(
            session_id=session_id,
            filename=document_name,
            document_type=doc_type,
            chunks_created=len(session.rag_service.chunks),
            message="Document uploaded and processed successfully"
        )

    except HTTPException:
        # Bug fix: the original `except Exception` also caught the 400
        # HTTPExceptions raised above and re-wrapped them as 500s.
        _cleanup_temp_file(document_path)
        raise
    except Exception as e:
        _cleanup_temp_file(document_path)
        raise HTTPException(status_code=500, detail=f"Error processing document: {str(e)}")
|
| 186 |
+
|
| 187 |
+
@router.post("/query/{session_id}", response_model=QueryResponse)
async def query_document(
    session_id: str,
    query_request: QueryRequest,
    session: Session = Depends(get_session)
):
    """Query the uploaded document and return an answer with top sources.

    Raises:
        HTTPException: 400 when no document has been processed for this
            session; 500 when the RAG pipeline fails.
    """
    if not session.document_uploaded or not session.vector_store_created:
        raise HTTPException(
            status_code=400,
            # Bug fix: typo "docuement" in the user-facing error message.
            detail="No document uploaded or processed for this session"
        )
    try:
        # Create query embedding
        session.rag_service.create_query_embedding(query_request.query)

        # Retrieve relevant docs (method name kept as-is: external API)
        session.rag_service.retrive_documents()

        # Generate answer
        answer = session.rag_service.answer_query(query_request.query)

        # Extract query metadata and sources for UI
        query_metadata = getattr(session.rag_service, 'query_metadata', {})

        # Extract source documents from results
        sources = []
        if hasattr(session.rag_service, 'result') and session.rag_service.result:
            matches = session.rag_service.result.get('matches', [])
            for match in matches[:3]:  # Top 3 sources
                metadata = match.get('metadata', {})
                sources.append(SourceDocument(
                    doc_id=metadata.get('doc_id', match.get('id', '')),
                    page=metadata.get('page_no', metadata.get('page', 0)),
                    text=metadata.get('text', ''),
                    score=match.get('score', 0.0),
                    metadata=metadata
                ))

        return QueryResponse(
            session_id=session_id,
            query=query_request.query,
            answer=answer,
            query_metadata=query_metadata,
            sources=sources,
            message="Query processed successfully"
        )

    except HTTPException:
        # Never re-wrap deliberate HTTP errors as 500s.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing query: {str(e)}")
|
| 237 |
+
|
| 238 |
+
@router.get("/session/{session_id}/status")
async def get_session_status(
    session_id: str,
    session: Session = Depends(get_session)
):
    """Report the current processing state of a session."""
    status = {
        "session_id": session_id,
        "created_at": session.created_at,
        "last_activity": session.last_activity,
        "document_uploaded": session.document_uploaded,
        "vector_store_created": session.vector_store_created,
        "document_info": session.document_info,
    }
    return status
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
|
app/config/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# This file is automatically created to mark the directory as a package.
|
app/config/config.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
from functools import lru_cache
from typing import Optional

from dotenv import load_dotenv
from pydantic_settings import BaseSettings

# Load .env before Settings is defined: database_path's default reads os.getenv.
load_dotenv()
|
| 6 |
+
class Settings(BaseSettings):
    """Application configuration, overridable via environment variables / .env."""

    # API Settings
    api_title: str = "RAG Document Analysis API"
    api_version: str = "1.0.0"

    # File Upload Settings
    max_file_size: int = 50 * 1024 * 1024  # 50MB
    # Typed element annotation (was bare `list`) so pydantic validates entries.
    allowed_file_types: list[str] = [".pdf", ".docx", ".doc"]
    upload_dir: str = "app/uploads"

    # Session Settings
    session_timeout_minutes: int = 60

    # SQLite session-store location; SessionDatabase falls back to a temp
    # directory if this path is not writable.
    database_path: str = os.getenv("DATABASE_PATH", "/tmp/claridoc_data/sessions.db")

    # API Keys (populated from the environment by pydantic-settings)
    gemini_api_key: Optional[str] = None
    pinecone_api_key: Optional[str] = None
    openai_api_key: Optional[str] = None
    groq_api_key: Optional[str] = None
    hf_token: Optional[str] = None

    # Environment
    environment: str = "development"
    debug: bool = True

    class Config:
        env_file = ".env"
|
| 35 |
+
|
| 36 |
+
@lru_cache(maxsize=1)
def get_settings() -> Settings:
    """Return the application settings.

    Cached so every caller shares one Settings instance instead of
    re-parsing the environment/.env file on each request.
    """
    return Settings()
|
app/config/config.yaml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Model registry: provider/model pairs selectable by key.
# NOTE(review): presumably consumed by app/utils/model_loader.py — confirm.
llm:
  groq:
    provider: "groq"
    model_name: "openai/gpt-oss-20b"
  gemini:
    provider: "gemini"
    model_name: "gemini-2.5-flash"
  gemini_lite:
    provider: "gemini_lite"
    model_name: "gemini-2.5-flash-lite"

embedding_model:
  openai:
    provider: "openai"
    model_name: "text-embedding-3-small"
  huggingface:
    provider: "huggingface"
    model_name: "mixedbread-ai/mxbai-embed-large-v1"
|
| 19 |
+
|
| 20 |
+
|
app/core/__init__.py
ADDED
|
File without changes
|
app/core/session_manager.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import uuid
|
| 2 |
+
from typing import Dict, Optional
|
| 3 |
+
from datetime import datetime, timedelta
|
| 4 |
+
from app.services.RAG_service import RAGService
|
| 5 |
+
from app.database.database import SessionDatabase
|
| 6 |
+
from app.schemas.request_models import DocumentTypeSchema
|
| 7 |
+
|
| 8 |
+
class Session:
    """In-memory record of one user session and its document-processing state."""

    def __init__(self, session_id: str):
        self.session_id = session_id
        self.created_at = datetime.now()
        self.last_activity = datetime.now()
        self.rag_service: Optional[RAGService] = None
        self.document_uploaded = False
        self.vector_store_created = False
        self.document_info = {}
        self.username = None

    def update_activity(self):
        """Mark the session as just used."""
        self.last_activity = datetime.now()

    def is_expired(self, timeout_minutes: int = 60) -> bool:
        """True when the session has been idle longer than `timeout_minutes`."""
        idle_for = datetime.now() - self.last_activity
        return idle_for > timedelta(minutes=timeout_minutes)
|
| 24 |
+
|
| 25 |
+
class SessionManager:
    """Owns the in-memory session map and mirrors it to SQLite."""

    def __init__(self):
        self.sessions: Dict[str, Session] = {}
        self.db = SessionDatabase()

    def create_session(self, username: str = "anonymous") -> str:
        """Create a session, persist it, and return its id."""
        session_id = str(uuid.uuid4())
        session = Session(session_id)
        session.username = username
        self.sessions[session_id] = session

        # Save to database
        self.db.create_session(session_id, username)
        return session_id

    def _restore_from_db(self, session_id: str) -> Optional[Session]:
        """Rebuild an in-memory Session (and its RAG service) from a DB row.

        Returns the restored session, or None when the id is unknown or
        inactive. Extracted because get_session() and restore_session()
        previously duplicated this ~40-line block verbatim.
        """
        db_session = self.db.get_session(session_id)
        if not (db_session and db_session['is_active']):
            return None

        session = Session(session_id)
        session.username = db_session['username']
        session.document_uploaded = db_session['chunks_count'] > 0
        session.vector_store_created = db_session['pinecone_index'] is not None
        session.document_info = {
            'filename': db_session['document_name'],
            'type': db_session['document_type'],
            'chunks_count': db_session['chunks_count']
        }

        # Initialize RAG service if document exists
        if session.vector_store_created:
            print(f"[SessionManager] Restoring RAG service for session {session_id}")
            session.rag_service = RAGService()

            # Set the basic attributes
            session.rag_service.index = db_session['pinecone_index']
            session.rag_service.namespace = db_session['pinecone_namespace']
            session.rag_service.Document_path = db_session['document_path']
            session.rag_service.url = db_session['document_url']

            # Local imports kept as in the original (likely avoiding an
            # import cycle with the ingestion modules — confirm).
            from app.ingestion.text_splitter import splitting_text
            from app.utils.metadata_utils import MetadataService
            import os

            # Recreate the document type information
            metadataservice = MetadataService()
            mock_scheme = DocumentTypeSchema(document_types="Insurance")  # Default for restored sessions
            document_type = metadataservice.Return_document_model(mock_scheme)
            session.rag_service.DocumentTypeScheme = mock_scheme
            session.rag_service.Document_Type = document_type

            # Create splitter instance to maintain the keywords file path
            session.rag_service.splitter = splitting_text(documentTypeSchema=document_type, llm=session.rag_service.llm)

            # Generate the expected keywords file path based on document name
            document_name = db_session['document_name'] or 'unknown'
            keywords_filename = document_name.replace(".", "").replace("\\", "").replace("/", "") + ".json"
            session.rag_service.splitter.Keywordsfile_path = os.path.join("app/data/", keywords_filename)

            print(f"[SessionManager] RAG service restored with index: {session.rag_service.index}")

        self.sessions[session_id] = session
        return session

    def get_session(self, session_id: str) -> Optional[Session]:
        """Return a live session, restoring it from the DB when needed."""
        # First check in-memory sessions
        if session_id in self.sessions:
            session = self.sessions[session_id]
            if not session.is_expired():
                session.update_activity()
                return session
            del self.sessions[session_id]

        # If not in memory, try to restore from database
        return self._restore_from_db(session_id)

    def update_session_document(self, session_id: str, document_name: str,
                                document_type: str, chunks_count: int,
                                pinecone_index: str = None, pinecone_namespace: str = None,
                                document_path: str = None, document_url: str = None):
        """Update session with document information"""
        self.db.update_session(
            session_id,
            document_name=document_name,
            document_type=document_type,
            chunks_count=chunks_count,
            pinecone_index=pinecone_index,
            pinecone_namespace=pinecone_namespace,
            document_path=document_path,
            document_url=document_url
        )

        # Update in-memory session if exists
        if session_id in self.sessions:
            session = self.sessions[session_id]
            session.document_uploaded = True
            session.vector_store_created = pinecone_index is not None
            session.document_info = {
                'filename': document_name,
                'type': document_type,
                'chunks_count': chunks_count
            }

    def get_user_sessions(self, username: str):
        """Get all sessions for a user"""
        return self.db.get_user_sessions(username)

    def restore_session(self, session_id: str) -> bool:
        """Restore a session from database; True on success."""
        return self._restore_from_db(session_id) is not None

    def delete_session(self, session_id: str):
        """Drop a session from memory and deactivate it in the DB."""
        if session_id in self.sessions:
            del self.sessions[session_id]
        self.db.deactivate_session(session_id)

    def cleanup_expired_sessions(self):
        """Evict idle sessions from the in-memory map."""
        expired_sessions = [
            sid for sid, session in self.sessions.items()
            if session.is_expired()
        ]
        for sid in expired_sessions:
            del self.sessions[sid]
|
| 196 |
+
|
| 197 |
+
session_manager = SessionManager()
|
| 198 |
+
|
app/data/__init__.py
ADDED
|
File without changes
|
app/database/database.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sqlite3
|
| 2 |
+
import json
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
from typing import List, Dict, Optional
|
| 5 |
+
import os
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from app.config.config import get_settings
|
| 8 |
+
|
| 9 |
+
class SessionDatabase:
    """SQLite-backed store for chat sessions and their RAG state.

    Tables:
      * ``users``    -- one row per username (created_at, last_login).
      * ``sessions`` -- one row per session: document/vector-store flags plus
        two JSON-encoded blobs (``document_info``, ``rag_service_data``).
    """

    # Whitelist of columns update_session() may modify.  The SET clause is
    # built dynamically from kwargs keys, so unvalidated keys would be a
    # SQL-injection vector (values are always bound parameters).
    _UPDATABLE_COLUMNS = frozenset({
        "username", "created_at", "last_accessed",
        "document_uploaded", "vector_store_created",
        "document_info", "rag_service_data",
    })

    def __init__(self):
        settings = get_settings()
        self.db_path = settings.database_path

        # Ensure the directory exists and is writable
        db_dir = Path(self.db_path).parent
        try:
            db_dir.mkdir(parents=True, exist_ok=True)
            # Probe writability: touching and removing a file is the only
            # reliable cross-platform check.
            test_file = db_dir / "test_write.tmp"
            test_file.touch()
            test_file.unlink()
        except Exception:
            # Fallback to a guaranteed-writable location (e.g. read-only
            # container filesystems such as HF Spaces).
            import tempfile
            fallback_dir = Path(tempfile.gettempdir()) / "claridoc"
            fallback_dir.mkdir(exist_ok=True)
            self.db_path = str(fallback_dir / "sessions.db")
            print(f"Database path not writable, using fallback: {self.db_path}")

        self.init_db()

    def init_db(self):
        """Initialize the database with required tables"""
        try:
            with sqlite3.connect(self.db_path) as conn:
                conn.execute("""
                    CREATE TABLE IF NOT EXISTS sessions (
                        session_id TEXT PRIMARY KEY,
                        username TEXT NOT NULL,
                        created_at TEXT NOT NULL,
                        last_accessed TEXT NOT NULL,
                        document_uploaded BOOLEAN DEFAULT FALSE,
                        vector_store_created BOOLEAN DEFAULT FALSE,
                        document_info TEXT DEFAULT '{}',
                        rag_service_data TEXT DEFAULT '{}'
                    )
                """)

                conn.execute("""
                    CREATE TABLE IF NOT EXISTS users (
                        username TEXT PRIMARY KEY,
                        created_at TEXT NOT NULL,
                        last_login TEXT NOT NULL
                    )
                """)
                conn.commit()
                print(f"Database initialized successfully at: {self.db_path}")
        except Exception as e:
            print(f"Failed to initialize database: {e}")
            raise

    def create_session(self, session_id: str, username: str):
        """Create a new session in the database"""
        try:
            with sqlite3.connect(self.db_path) as conn:
                now = datetime.now().isoformat()

                # Insert or update user, keeping the original created_at on
                # re-login (COALESCE falls back to *now* for new users).
                conn.execute("""
                    INSERT OR REPLACE INTO users (username, created_at, last_login)
                    VALUES (?, COALESCE((SELECT created_at FROM users WHERE username = ?), ?), ?)
                """, (username, username, now, now))

                # Insert session
                conn.execute("""
                    INSERT INTO sessions (session_id, username, created_at, last_accessed)
                    VALUES (?, ?, ?, ?)
                """, (session_id, username, now, now))

                conn.commit()
        except Exception as e:
            print(f"Failed to create session: {e}")
            raise

    def get_user_sessions(self, username: str) -> List[Dict]:
        """Get all sessions for a user, most recently accessed first.

        Returns a UI-friendly summary per session (document name/type and
        chunk count pulled out of the document_info JSON blob); returns an
        empty list on any database error.
        """
        try:
            with sqlite3.connect(self.db_path) as conn:
                conn.row_factory = sqlite3.Row
                cursor = conn.execute("""
                    SELECT * FROM sessions
                    WHERE username = ?
                    ORDER BY last_accessed DESC
                """, (username,))

                sessions = []
                for row in cursor.fetchall():
                    document_info = json.loads(row['document_info'] or '{}')
                    sessions.append({
                        'session_id': row['session_id'],
                        'created_at': row['created_at'],
                        'last_accessed': row['last_accessed'],
                        'document_uploaded': bool(row['document_uploaded']),
                        'vector_store_created': bool(row['vector_store_created']),
                        'document_name': document_info.get('filename', 'Untitled Document'),
                        'document_type': document_info.get('type', 'Unknown'),
                        'chunks_count': document_info.get('chunks_count', 0)
                    })

                return sessions
        except Exception as e:
            print(f"Failed to get user sessions: {e}")
            return []

    def get_session(self, session_id: str) -> Optional[Dict]:
        """Get session data by ID; returns None if missing or on error."""
        try:
            with sqlite3.connect(self.db_path) as conn:
                conn.row_factory = sqlite3.Row
                cursor = conn.execute("""
                    SELECT * FROM sessions WHERE session_id = ?
                """, (session_id,))

                row = cursor.fetchone()
                if row:
                    return {
                        'session_id': row['session_id'],
                        'username': row['username'],
                        'created_at': row['created_at'],
                        'last_accessed': row['last_accessed'],
                        'document_uploaded': bool(row['document_uploaded']),
                        'vector_store_created': bool(row['vector_store_created']),
                        'document_info': json.loads(row['document_info'] or '{}'),
                        'rag_service_data': json.loads(row['rag_service_data'] or '{}')
                    }
                return None
        except Exception as e:
            print(f"Failed to get session: {e}")
            return None

    def update_session(self, session_id: str, **kwargs):
        """Update session columns named by **kwargs.

        ``last_accessed`` is refreshed automatically; dict values for
        ``document_info``/``rag_service_data`` are JSON-encoded.

        Raises:
            ValueError: if a kwarg names a column outside the whitelist
                (prevents SQL injection through identifier interpolation).
        """
        invalid = set(kwargs) - self._UPDATABLE_COLUMNS
        if invalid:
            raise ValueError(f"Cannot update unknown session column(s): {sorted(invalid)}")
        try:
            with sqlite3.connect(self.db_path) as conn:
                # Always update last_accessed
                kwargs['last_accessed'] = datetime.now().isoformat()

                # Convert dictionaries to JSON strings
                if 'document_info' in kwargs:
                    kwargs['document_info'] = json.dumps(kwargs['document_info'])
                if 'rag_service_data' in kwargs:
                    kwargs['rag_service_data'] = json.dumps(kwargs['rag_service_data'])

                # Column names were whitelisted above; values are bound params.
                set_clause = ", ".join(f"{key} = ?" for key in kwargs)
                values = list(kwargs.values()) + [session_id]

                conn.execute(f"""
                    UPDATE sessions
                    SET {set_clause}
                    WHERE session_id = ?
                """, values)

                conn.commit()
        except Exception as e:
            print(f"Failed to update session: {e}")
            raise

    def delete_session(self, session_id: str):
        """Delete a session"""
        try:
            with sqlite3.connect(self.db_path) as conn:
                conn.execute("DELETE FROM sessions WHERE session_id = ?", (session_id,))
                conn.commit()
        except Exception as e:
            print(f"Failed to delete session: {e}")
            raise
app/database/sessions.db
ADDED
|
Binary file (32.8 kB). View file
|
|
|
app/embedding/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# This file is automatically created to mark the directory as a package.
|
app/embedding/embeder.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
from typing import Union, List
|
| 4 |
+
|
| 5 |
+
class QueryEmbedding:
    """Thin wrapper that turns a raw query string into a dense vector.

    The actual embedding work is delegated to the injected
    ``embedding_model``, which is expected to expose an
    ``embed_query(text)`` method (LangChain embeddings interface).
    """

    def __init__(self, query: str, embedding_model):
        self.query = query
        self.embedding_model = embedding_model

    def get_embedding(self):
        """Return the embedding vector for ``self.query``."""
        return self.embedding_model.embed_query(self.query)
|
app/embedding/vectore_store.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
from pinecone import Pinecone
|
| 4 |
+
from pinecone import ServerlessSpec
|
| 5 |
+
from langchain_pinecone import PineconeVectorStore
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
from uuid import uuid4
|
| 8 |
+
|
| 9 |
+
class VectorStore:
    """Upserts pre-chunked documents into a Pinecone serverless index.

    A fresh namespace (timestamped) is used per instantiation so repeated
    ingestions do not collide inside the shared ``rag-project`` index.
    """

    def __init__(self, text_chunks, embedding_model):
        self.text_chunks = text_chunks
        # Captured at construction time; used to derive a unique namespace.
        self.current_time = datetime.now()
        self.embedding_model = embedding_model

    def create_vectorestore(self):
        """Embed and upsert all chunks; return ``(index, namespace)``.

        Requires ``PINECONE_API_KEY`` in the environment (loaded via dotenv).
        Creates the index on first use (1536-dim cosine, matching OpenAI
        text-embedding output size).
        """
        load_dotenv()
        pinecone_key = os.getenv("PINECONE_API_KEY")
        pc = Pinecone(api_key=pinecone_key)
        time_string = self.current_time.strftime("%Y-%m-%d-%H-%M")
        # Plain literal: the previous f-string prefix had no placeholders.
        index_name = "rag-project"
        if not pc.has_index(index_name):
            pc.create_index(
                name=index_name,
                dimension=1536,
                metric="cosine",
                spec=ServerlessSpec(cloud="aws", region="us-east-1")
            )

        index = pc.Index(index_name)
        # One stable UUID per chunk so re-runs in the same namespace overwrite
        # rather than duplicate is NOT guaranteed -- ids are random per call.
        uuids = [str(uuid4()) for _ in range(len(self.text_chunks))]
        vector_store = PineconeVectorStore(index=index, embedding=self.embedding_model)
        name_space = f"hackrx-index{time_string}"
        vector_store.add_documents(documents=self.text_chunks, ids=uuids, namespace=name_space)

        return index, name_space
| 39 |
+
|
| 40 |
+
|
app/ingestion/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# This file is automatically created to mark the directory as a package.
|
app/ingestion/file_loader.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
from langchain_community.document_loaders import PyMuPDFLoader, Docx2txtLoader
|
| 3 |
+
import os
|
| 4 |
+
import tempfile
|
| 5 |
+
from app.schemas.request_models import DocumentTypeSchema
|
| 6 |
+
from langchain_core.documents import Document
|
| 7 |
+
from typing import List
|
| 8 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 9 |
+
from langchain_core.output_parsers import PydanticOutputParser
|
| 10 |
+
from app.schemas.request_models import DocumentTypeSchema
|
| 11 |
+
|
| 12 |
+
class FileLoader:
    """Loads PDF/Word documents (from disk or URL) and classifies them via LLM."""

    def __init__(self, llm=None):
        # Chat model; only required by detect_document_type().
        self.llm = llm

    def detect_document_type(self, documents: List[Document]) -> DocumentTypeSchema:
        """Detect the genre of document by reading first 2 page content by llm."""

        document_content = " ".join([doc.page_content for doc in documents])
        parser = PydanticOutputParser(pydantic_object=DocumentTypeSchema)
        prompt = ChatPromptTemplate.from_messages([
            ("system", "You are a legal/HR/financial document classifier."),
            ("human", """
You will be given the first 2 pages of a document.
Classify it into one of the following categories:
- HR/Employment
- Insurance
- Legal/Compliance
- Financial/Regulatory
- Healthcare

Respond strictly in JSON that matches the schema.

{format_instructions}

Document content:
{document_content}
"""),
        ])
        chain = prompt | self.llm | parser
        result: DocumentTypeSchema = chain.invoke({
            "document_content": document_content,
            "format_instructions": parser.get_format_instructions()
        })
        return result

    def load_documents_from_url(self, url: str) -> List[Document]:
        """Download *url* and load it as a PDF.

        Raises:
            ValueError: when the response is not a PDF.
            requests.HTTPError: on non-2xx responses.
        """
        # Timeout prevents the service hanging forever on a dead host.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        content_type = response.headers.get('Content-Type', '')
        # Content-Type may carry parameters ("application/pdf; charset=..."),
        # so an exact equality check would wrongly reject valid PDFs.
        if content_type.lower().startswith('application/pdf'):
            tmp_file_path = self._save_temp_file(response.content, ".pdf")
            return self.load_pdf(tmp_file_path)
        else:
            raise ValueError("File type not supported, expected a PDF.")

    def load_pdf(self, path: str) -> List[Document]:
        """Load PDF from a local path and return its content."""
        self._validate_file_exists(path)
        loader = PyMuPDFLoader(path)
        return loader.load()

    def load_word_document(self, path: str) -> List[Document]:
        """Load Word document from a local path and return its content.

        Returns an empty list (after printing the error) if loading fails --
        callers treat an empty result as "nothing loaded".
        """
        self._validate_file_exists(path)
        try:
            docx_loader = Docx2txtLoader(path)
            return docx_loader.load()
        except Exception as e:
            print(e)
            return []

    def _save_temp_file(self, content: bytes, suffix: str) -> str:
        """Persist *content* to a named temp file and return its path.

        delete=False because the loader re-opens the file by path afterwards.
        """
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp_file:
            tmp_file.write(content)
            return tmp_file.name

    def _validate_file_exists(self, path: str):
        """Raise FileNotFoundError if *path* does not exist."""
        if not os.path.exists(path):
            raise FileNotFoundError(f"The file {path} does not exist.")
|
app/ingestion/text_splitter.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_core.documents import Document
|
| 2 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 3 |
+
from uuid import uuid4
|
| 4 |
+
from typing import List, Dict
|
| 5 |
+
import os
|
| 6 |
+
import json
|
| 7 |
+
from app.utils.metadata_utils import MetadataService
|
| 8 |
+
from app.metadata_extraction.metadata_ext import MetadataExtractor
|
| 9 |
+
from pydantic import BaseModel
|
| 10 |
+
from typing import Type
|
| 11 |
+
from app.utils.metadata_utils import MetadataService
|
| 12 |
+
class splitting_text:
    """Per-page chunking pipeline with LLM metadata extraction.

    For each page of a loaded document this class:
      1. extracts structured metadata with MetadataExtractor,
      2. maintains a per-document keyword-vocabulary JSON under ``app/data/``
         (bootstrapped from page 0, merged on later pages),
      3. splits the page text into 800-char chunks carrying that metadata.
    """

    def __init__(self, documentTypeSchema:Type[BaseModel], llm=None):
        self.llm = llm
        self.metadata_extractor = MetadataExtractor(llm = self.llm)
        self.metadata_services = MetadataService()
        # Pydantic schema class the extractor must fill in for this document.
        self.documentTypeSchema = documentTypeSchema
        # Path of the keyword-vocabulary JSON; assigned while processing page 0.
        self.Keywordsfile_path = None

    def _clean_text(self, text:str)-> str:
        """Clean extracted page content"""
        # remove excessive whitespace
        text = " ".join(text.split())
        return text

    def text_splitting(self, doc: List[Document]) -> List[Document]:
        """Split document into chunks for processing.

        Returns all chunks across all pages; each chunk's metadata merges the
        page metadata, the LLM-extracted metadata, and provenance ids.
        """

        all_chunks = []
        splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
        for i, page in enumerate(doc):
            # reset per page
            # NOTE(review): bare except -- presumably a fallback for LangChain
            # Documents, which expose page_content rather than get_text();
            # consider narrowing to AttributeError.
            try:
                text = page.get_text()
            except:
                text = page.page_content
            # print(type(page))

            # text = self._clean_text(text)



            if i == 0:
                # Page 0: derive the vocabulary-file path from the source name
                # and bootstrap it with this page's extracted metadata.
                output_folder = "app/data/"
                filename = page.metadata['source'].replace(".","").replace("\\","")+ ".json"
                output_path = os.path.join(output_folder, filename)
                self.Keywordsfile_path = output_path
                # First page → extract + create JSON
                Document_metadata = self.metadata_extractor.extractMetadata(document=page, known_keywords={}, metadata_class=self.documentTypeSchema)
                extracted = Document_metadata.model_dump()
                # NOTE(review): called on the class here but through the
                # instance (self.metadata_services) below -- confirm that
                # normalize_dict_to_lists is a staticmethod/classmethod.
                normalized = MetadataService.normalize_dict_to_lists(metadata = extracted)

                with open(output_path, "w") as f:
                    json.dump(normalized, f, indent=4)
                known_keywords = normalized

            else:
                # Next pages → load JSON and enforce consistency
                with open(self.Keywordsfile_path, "r") as f:
                    known_keywords = json.load(f)

                Document_metadata = self.metadata_extractor.extractMetadata(document=page, known_keywords=known_keywords, metadata_class=self.documentTypeSchema)

                # check if there is new keyword is added or not during metadata extraction if yes then normalise(convert to dict) and then add new values into the keys exist
                if Document_metadata.added_new_keyword:
                    new_data = self.metadata_services.normalize_dict_to_lists(
                        Document_metadata.model_dump(exclude_none= True)
                    )
                    for key,vals in new_data.items():
                        if isinstance(vals,list):
                            known_keywords[key] = list(set(known_keywords.get(key,[]) + vals)) #get the existing key and add vals and convert into set then list and update the file.
                    # Persist the merged vocabulary for subsequent pages.
                    with open(self.Keywordsfile_path, "w") as f:
                        json.dump(known_keywords, f, indent=4)

            # print(f"Document_metadata type: {type(Document_metadata)}")
            extracted_metadata = Document_metadata.model_dump(exclude_none=True)
            # print(f"extracted_metadata type: {type(extracted_metadata)}")
            print(f"doc number: {i}")


            if text.strip():
                # Attach page provenance + extracted metadata to every chunk
                # produced from this page.
                uuid = str(uuid4())
                temp_doc = Document(
                    page_content=text,
                    metadata={
                        **page.metadata,
                        **extracted_metadata,
                        "page_no": i,
                        "doc_id": uuid,
                        "chunk_id": f"{uuid}_p{i}",
                        "type": "text"
                    }
                )
                chunks = splitter.split_documents([temp_doc])
                all_chunks.extend(chunks)

        return all_chunks
| 98 |
+
|
| 99 |
+
|
app/main.py
ADDED
|
File without changes
|
app/metadata_extraction/__init__.py
ADDED
|
File without changes
|
app/metadata_extraction/metadata_ext.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from langchain_core.exceptions import OutputParserException
|
| 3 |
+
from langchain_core.documents import Document
|
| 4 |
+
from langchain_core.output_parsers import PydanticOutputParser
|
| 5 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 6 |
+
from typing import Type
|
| 7 |
+
from pydantic import BaseModel
|
| 8 |
+
# wrap parser with fixer once
|
| 9 |
+
# pydantic_parser = PydanticOutputParser(pydantic_object=InsuranceMetadata)
|
| 10 |
+
# fixing_parser = OutputFixingParser.from_llm(llm=llm, parser=pydantic_parser)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class MetadataExtractor:
    """LLM-driven metadata extraction into a caller-supplied Pydantic schema.

    The two public methods share one pipeline (prompt | llm | pydantic
    parser) and differ only in their system prompt, so the common plumbing
    lives in :meth:`_run_extraction`.
    """

    def __init__(self, llm = None):
        self.llm = llm

    def _run_extraction(self, metadata_class: Type[BaseModel], document: Document,
                        known_keywords: dict, system_template: str) -> BaseModel:
        """Build and invoke the extraction chain for *system_template*.

        Falls back to an empty ``metadata_class(added_new_keyword=False)``
        instance when the LLM output cannot be parsed, so callers never have
        to handle parser failures themselves.
        """
        parser = PydanticOutputParser(pydantic_object=metadata_class)

        schema_str = json.dumps(metadata_class.model_json_schema(), indent=2)
        keywords_str = json.dumps(known_keywords, indent=2)

        prompt = ChatPromptTemplate.from_messages([
            ("system", system_template),
            ("human", "Text:\n{document_content}")
        ])
        chain = prompt | self.llm | parser

        try:
            result = chain.invoke({
                "schema": schema_str,
                "keywords": keywords_str,
                "document_content": document.page_content
            })
            return result
        except OutputParserException as e:
            print(f"⚠️ Parser failed on doc {document.metadata.get('source')} | error: {e}")
            return metadata_class(added_new_keyword=False)  # instantiate fallback

    def extractMetadata_query(self, metadata_class : Type[BaseModel], document: Document, known_keywords: dict) -> BaseModel:
        """Extract metadata from a user *query*, reusing known keywords."""
        system_template = """You are an information extraction system.
Extract only the required metadata from the user query using the existing known keywords.

⚠️ CRITICAL FORMATTING RULES:
- ALL fields must be arrays/lists, even if there's only one value
- For single values, wrap in brackets: "doc_id": ["single_value"]
- For multiple values: "coverage_type": ["value1", "value2", "value3"]
- For null/empty fields, use: null (not empty arrays)

⚠️ Content Rules:
- For exclusions and obligations, DO NOT copy full sentences.
- Instead, extract only concise normalized keywords (2–5 words max each).
- Use existing keywords if they already exist in the provided list.
- Prefer to reuse existing keywords if they are semantically the same.
- If you find a new keyword that is a sub-type or more specific variant of an existing one, keep both:
reuse the closest match from existing keywords, and also add the new one.
- In that case, set added_new_keyword=true.
- Do not include raw paragraphs in the output.

Schema you must follow:
{schema}

Existing Keywords:
{keywords}
"""
        return self._run_extraction(metadata_class, document, known_keywords, system_template)

    def extractMetadata(self, metadata_class : Type[BaseModel], document: Document, known_keywords: dict) -> BaseModel:
        """Extract metadata from a document page according to the schema."""
        system_template = """You are an information extraction system.
Extract only the required metadata from the text according to schema given below.

⚠️ CRITICAL FORMATTING RULES:
- ALL fields must be arrays/lists, even if there's only one value
- For single values, wrap in brackets: "doc_id": ["single_value"]
- For multiple values: "coverage_type": ["value1", "value2", "value3"]
- For null/empty fields, use: null (not empty arrays)

⚠️ Content Rules:
- For exclusions and obligations, DO NOT copy full sentences.
- Instead, extract only concise normalized keywords (2–5 words max each).
- Use existing keywords if they already exist in the provided list.
- Prefer to reuse existing keywords if they are semantically the same.
- If you find a new keyword that is a **sub-type** or **more specific variant** of an existing one, keep both:
*reuse the closest match from existing keywords*, and also add the new one.
- In that case, set `added_new_keyword=true`.
- Do not include raw paragraphs in the output.

Schema you must follow:
{schema}

Existing Keywords:
{keywords}
"""
        return self._run_extraction(metadata_class, document, known_keywords, system_template)
app/prompts/__init__.py
ADDED
|
File without changes
|
app/prompts/prompts.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# System prompt for the query parser: instructs the LLM to emit a strict-JSON
# QuerySpec (intent / entities / constraints / answer_type).  Plain string
# literal -- the previous ``f"""`` prefix was unnecessary (no placeholders)
# and risky, since any future ``{...}`` added to the prompt text would have
# been silently interpolated at import time.
PARSER_PROMPT = """You receive a user's question about an insurance/contract document. Produce a JSON with keys:
- intent (one of: coverage_check, definition, limit_query, waiting_period, exclusions, other)
- entities (map of entity_name -> canonical string)
- constraints (map: plan, time_window, eligible_person, numerical_constraints)
- answer_type (one of: yes_no, short_explain, detailed, clause_list)
Return ONLY the JSON.Make sure that nested fields like "entities" and "constraints" are JSON objects, not strings.
"""
|
app/reseasoning/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# This file is automatically created to mark the directory as a package.
|
app/reseasoning/descision_maker.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.schemas.request_models import QuerySpec, LogicResult
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def evaluate_with_llm(raw_query: str, top_clauses: list, llm):
    """
    Use the LLM to analyze retrieved clauses and return a structured decision.

    Args:
        raw_query: The user's original question.
        top_clauses: Retrieved clause hits; each must expose ``.doc_id``,
            ``.page`` and ``.text`` attributes.
        llm: Chat model supporting ``.with_structured_output()``.

    Returns:
        LogicResult with decision/evidence/confidence/rationale/answer; each
        evidence item that matches a retrieved clause is enriched with the
        full clause text.
    """

    # Number the clauses so the model can cite them unambiguously.
    context_clauses = []
    for i, c in enumerate(top_clauses, 1):
        context_clauses.append(f"{i}) [source:{c.doc_id} page:{c.page}] {c.text}")
    # Join once and reuse; the original built the joined string twice and
    # used chr(10) as a workaround for backslashes being illegal inside
    # f-string expressions (pre-3.12).
    clauses_block = "\n".join(context_clauses)
    print(clauses_block)  # debug trace of the retrieved evidence

    # Build prompt (double braces render literal JSON braces in the f-string)
    prompt = f"""
You are an insurance policy analyst. Question: "{raw_query}"

Provided clauses (numbered):
{clauses_block}

Task:
1) Decide: COVERED / NOT_COVERED / CONDITIONAL
2) Summarize the exact clause(s) that justify your decision.
3) List any conditions, waiting periods, sublimits, or exclusions relevant.
4) Provide a concise final answer (1-2 sentences).

Return JSON with these exact keys:
{{
  "decision": "...",
  "evidence": [
    {{"doc_id": "...", "page": 0, "snippet": "...", "reason": "..."}}
  ],
  "confidence": 0.0,
  "rationale": "...",
  "answer": "..."
}}
"""

    # Directly parse to LogicResult using structured output
    structured_llm = llm.with_structured_output(LogicResult)
    result: LogicResult = structured_llm.invoke(prompt)

    # Attach the full clause text to each piece of evidence.  Pages are
    # compared as strings because the LLM may return them as int or str.
    enriched_evidence = []
    for ev in result.evidence:
        matched = next((c for c in top_clauses if c.doc_id == ev.doc_id and str(c.page) == str(ev.page)), None)
        if matched:
            ev.text = matched.text
        # Keep unmatched evidence too -- dropping it would silently discard
        # the model's cited justification.
        enriched_evidence.append(ev)

    result.evidence = enriched_evidence
    return result
app/reseasoning/query_parser.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.utils.model_loader import ModelLoader
|
| 2 |
+
from app.schemas.request_models import QuerySpec
|
| 3 |
+
from app.prompts.prompts import PARSER_PROMPT
|
| 4 |
+
|
| 5 |
+
def parsing_query(query:str, llm) -> QuerySpec:
    """Parse a raw user question into a structured QuerySpec via the LLM.

    Binds the QuerySpec schema to the model with structured output, prepends
    the shared PARSER_PROMPT instructions, and returns the parsed result.
    """
    spec_llm = llm.with_structured_output(QuerySpec)

    # Instructions first, then the user's question on its own line.
    combined_prompt = f"{PARSER_PROMPT}\n{query}"

    parsed: QuerySpec = spec_llm.invoke(combined_prompt)
    return parsed
|
app/retrieval/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# This file is automatically created to mark the directory as a package.
|
app/retrieval/reranker.py
ADDED
|
File without changes
|
app/retrieval/retriever.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# from app.schemas.request_models import ClauseHit
|
| 2 |
+
|
| 3 |
+
class Retriever:
    """Queries a Pinecone index for the chunks most similar to a query vector."""

    def __init__(self, pinecone_index, query = None, metadata = None, namespace=None):
        self.pinecone_index = pinecone_index  # Pinecone Index handle
        self.query = query                    # pre-computed query embedding vector
        self.metadata = metadata              # optional metadata filter dict
        self.namespace = namespace            # Pinecone namespace to search in

    def retrieval_from_pinecone_vectoreStore(self, top_k= 3):
        """
        Retrieve the top matching chunks from Pinecone.

        Args:
            top_k: How many chunks to retrieve.

        Returns:
            The raw Pinecone query response: matches include metadata but not
            vector values.  (Callers unpack ``res['matches']`` themselves.)
        """
        res = self.pinecone_index.query(
            vector= self.query,
            top_k =top_k ,
            include_metadata = True,
            include_values = False,
            filter = self.metadata,
            namespace = self.namespace
        )
        return res
| 55 |
+
|
app/schemas/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# This file is automatically created to mark the directory as a package.
|
app/schemas/metadata_schema.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, model_validator,field_validator, HttpUrl, Field
|
| 2 |
+
from typing import List, Dict, Any, Optional, Union, Literal
|
| 3 |
+
|
| 4 |
+
class CommonMetaData(BaseModel):
|
| 5 |
+
# --- Common metadata (across all domains) ---
|
| 6 |
+
doc_id: Optional[List[str]] = Field(None, description="Unique document identifier")
|
| 7 |
+
doc_category: Optional[List[str]] = Field(None, description="General pool/category e.g. Insurance, HR, Legal")
|
| 8 |
+
doc_type: Optional[List[str]] = Field(None, description="Specific type e.g. Policy doc, Contract, Handbook")
|
| 9 |
+
jurisdiction: Optional[List[str]] = Field(
|
| 10 |
+
default=None, description="Applicable jurisdictions/regions/countries"
|
| 11 |
+
)
|
| 12 |
+
effective_date: Optional[List[str]] = Field(None, description="Date from which the document is effective")
|
| 13 |
+
expiry_date: Optional[List[str]] = Field(None, description="Date until which the document is valid")
|
| 14 |
+
parties: Optional[List[str]] = Field(None, description="Involved parties (e.g., employer/employee, insurer/insured)")
|
| 15 |
+
# obligations: Optional[List[str]] = Field(
|
| 16 |
+
# default=None,
|
| 17 |
+
# description="List of short, normalized obligation keywords (2–5 words each, no full sentences)"
|
| 18 |
+
# )
|
| 19 |
+
penalties: Optional[List[str]] = Field(None, description="Penalties/non-compliance consequences")
|
| 20 |
+
notes: Optional[List[str]] = Field(None, description="Freeform additional metadata")
|
| 21 |
+
|
| 22 |
+
class InsuranceMetadata(CommonMetaData):
    """Insurance-specific metadata on top of the common fields."""

    policy_number: Optional[List[str]] = None
    coverage_type: Optional[List[str]] = Field(
        default=None,
        description="Type(s) of coverage. Short keywords (1–3 words each)."
    )
    premium_amount: Optional[List[str]] = None
    exclusions: Optional[List[str]] = Field(
        description="List of normalized keywords representing exclusions (short, 2-5 words each, not full paragraphs).", default=None
    )
    # Flag set by the extractor when it minted a keyword not in the known set.
    added_new_keyword: bool = False
|
| 35 |
+
|
| 36 |
+
class HRMetadata(CommonMetaData):
    """HR / employment document metadata."""

    policy_type: Optional[str] = None
    applicable_roles: Optional[List[str]] = None
    notice_period: Optional[str] = None
|
| 41 |
+
|
| 42 |
+
class LegalMetadata(CommonMetaData):
    """Legal / compliance document metadata."""

    clause_type: Optional[str] = None
    governing_law: Optional[str] = None
    duration: Optional[str] = None
|
| 48 |
+
|
| 49 |
+
class FinancialMetadata(CommonMetaData):
    """Financial / regulatory document metadata."""

    section: Optional[str] = None
    compliance_requirement: Optional[str] = None
    reporting_frequency: Optional[str] = None
|
| 55 |
+
|
| 56 |
+
class HealthcareMetadata(CommonMetaData):
    """Healthcare / pharma document metadata."""

    disease: Optional[str] = None
    treatment_limit: Optional[str] = None
    validity_period: Optional[str] = None
|
| 62 |
+
|
| 63 |
+
class ProcurementMetadata(CommonMetaData):
    """Procurement / vendor-management document metadata."""

    vendor_name: Optional[str] = None
    contract_value: Optional[str] = None
    payment_terms: Optional[str] = None
    sla_metrics: Optional[List[str]] = None
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
|
app/schemas/request_models.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, model_validator,field_validator, HttpUrl, Field
|
| 2 |
+
from typing import List, Dict, Any, Optional, Union, Literal
|
| 3 |
+
import json
|
| 4 |
+
class QueryRequest(BaseModel):
    """Request payload for the query endpoint: the user's raw question."""

    query: str
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# class QuerySpec(BaseModel):
|
| 12 |
+
# raw_query: str # query of the user
|
| 13 |
+
# intent: str # High-level purpose, e.g., "coverage_check" — helps routing aur rules.
|
| 14 |
+
# entities: Dict[str, Union[str, List[str]]] = Field(default_factory= dict) # Extracted items (policy number, dates, amounts) — structured
|
| 15 |
+
# constraints : Dict[str, Any] = Field(default_factory=dict) # filters like {"jurisdiction":"IN","incident_date":"2024-01-01"}
|
| 16 |
+
# answer_type: Optional[str] = "detailed"
|
| 17 |
+
# followups: Optional[List[str]] = Field(default_factory=list) # followups for user
|
| 18 |
+
|
| 19 |
+
# @model_validator(mode = "before")
|
| 20 |
+
# @classmethod
|
| 21 |
+
# def parse_nested_json(cls, values): # parsing nested json to load
|
| 22 |
+
# for field in ['entities', 'constraints']:
|
| 23 |
+
# val = values.get(field)
|
| 24 |
+
# if isinstance(val, str):
|
| 25 |
+
# try:
|
| 26 |
+
# values[field] = json.loads(val)
|
| 27 |
+
# except json.JSONDecodeError:
|
| 28 |
+
# pass
|
| 29 |
+
# return values
|
| 30 |
+
|
| 31 |
+
# class ClauseHit(BaseModel):
|
| 32 |
+
# doc_id : str # id of the document
|
| 33 |
+
# page: int # pdf page id
|
| 34 |
+
# chunk_id: str
|
| 35 |
+
# text: str # Evidence text used for answer.
|
| 36 |
+
# metadata: Dict[str, Any] = Field(default_factory=dict) # metadata
|
| 37 |
+
# score: float # Retrieval similarity score
|
| 38 |
+
# boost: Optional[float] = None
|
| 39 |
+
# combined_score: Optional[float] = None
|
| 40 |
+
|
| 41 |
+
# @field_validator("metadata", mode="before")
|
| 42 |
+
# def parse_metadata(cls, v):
|
| 43 |
+
# if isinstance(v, str):
|
| 44 |
+
# try:
|
| 45 |
+
# return json.loads(v) if v.strip() else {}
|
| 46 |
+
# except json.JSONDecodeError:
|
| 47 |
+
# return {}
|
| 48 |
+
# return v
|
| 49 |
+
|
| 50 |
+
# class LogicResult(BaseModel):
|
| 51 |
+
# answer: str
|
| 52 |
+
# decision: str # "covered"/"not_covered"/"conditional"
|
| 53 |
+
# confidence: float # 0..1 score for calibration/thresholding.
|
| 54 |
+
# evidence: List[ClauseHit] = Field(default_factory=list) # List of ClauseHit used to justify the answer.
|
| 55 |
+
# rationale: Optional[str] = None # Short human-readable reason (audit-friendly).
|
| 56 |
+
|
| 57 |
+
# class HackRxRunRequest(BaseModel):
|
| 58 |
+
# documents: HttpUrl = Field(
|
| 59 |
+
# ...,
|
| 60 |
+
# description="URL to the document (PDF, DOCX, or email blob)"
|
| 61 |
+
# )
|
| 62 |
+
# questions: List[str] = Field(
|
| 63 |
+
# ...,
|
| 64 |
+
# description="List of questions to query against the document"
|
| 65 |
+
# )
|
| 66 |
+
|
| 67 |
+
class DocumentTypeSchema(BaseModel):
    """Classification result: the broad category a document belongs to."""

    # Closed set of supported categories; Literal makes pydantic reject
    # anything the classifier invents outside this list.
    document_types: Literal[
        "HR/Employment",
        "Insurance",
        "Legal/Compliance",
        "Financial/Regulatory",
        "Government/Public Policy",
        "Technical/IT Policies"
    ] = Field(..., description="The category/type of the document")
|
| 76 |
+
|
| 77 |
+
|
app/schemas/response_models.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel
|
| 2 |
+
from typing import List, Dict, Any, Optional
|
| 3 |
+
|
| 4 |
+
class SourceDocument(BaseModel):
    """One retrieved evidence chunk returned alongside an answer."""

    doc_id: str
    page: int
    text: str
    score: float  # retrieval similarity score
    metadata: Dict[str, Any]
|
| 10 |
+
|
| 11 |
+
class QueryResponse(BaseModel):
    """Response body for a query: the answer plus optional provenance."""

    session_id: str
    query: str
    answer: str
    query_metadata: Optional[Dict[str, Any]] = None
    sources: Optional[List[SourceDocument]] = None
    message: str
|
| 18 |
+
|
| 19 |
+
class SessionResponse(BaseModel):
    """Response body for session creation/lookup endpoints."""

    session_id: str
    message: str
|
| 22 |
+
|
| 23 |
+
class UploadResponse(BaseModel):
    """Response body after a document upload is ingested and chunked."""

    session_id: str
    filename: str
    document_type: str
    chunks_created: int
    message: str
|
| 29 |
+
|
| 30 |
+
class ErrorResponse(BaseModel):
    """Uniform error payload for API failures."""

    detail: str
    error_code: Optional[str] = None
|
app/services/RAG_service.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
|
| 2 |
+
from app.utils.model_loader import ModelLoader
|
| 3 |
+
from app.ingestion.file_loader import FileLoader
|
| 4 |
+
from app.ingestion.text_splitter import splitting_text
|
| 5 |
+
from app.retrieval.retriever import Retriever
|
| 6 |
+
from app.embedding.embeder import QueryEmbedding
|
| 7 |
+
from app.embedding.vectore_store import VectorStore
|
| 8 |
+
from app.metadata_extraction.metadata_ext import MetadataExtractor
|
| 9 |
+
from app.utils.metadata_utils import MetadataService
|
| 10 |
+
# from app.utils.document_op import DocumentOperation
|
| 11 |
+
from langchain_core.documents import Document
|
| 12 |
+
import json
|
| 13 |
+
from typing import List, Optional
|
| 14 |
+
# ...existing imports...
|
| 15 |
+
|
| 16 |
+
# Global model instances (loaded once)
|
| 17 |
+
# Process-wide embedding model; created lazily by get_models() so the
# (expensive) HuggingFace load happens at most once per process.
_embedding_model = None

def get_models():
    """Return the shared embedding model, loading it on first use."""
    global _embedding_model
    if _embedding_model is None:
        print("Loading models (one-time initialization)...")
        loader = ModelLoader(model_provider="huggingface")
        _embedding_model = loader.load_llm()
    return _embedding_model
|
| 26 |
+
|
| 27 |
+
class RAGService:
    """End-to-end RAG pipeline: load and split a document, embed queries,
    build/query a Pinecone vector store, and answer questions with the LLM.

    NOTE(review): instances carry per-document state (chunks, index,
    namespace, last retrieval result), so one RAGService appears to serve
    one document/session at a time — confirm against the session manager.
    """

    def __init__(self):
        print("[RAGService] Initializing service...")
        self._init_models()
        # Kept for backward compatibility: misspelled twin of Document_Type
        # that older call sites may still reference.
        self.Docuement_Type = None
        self.Pinecone_index = None
        self.Document_path = None
        self.Document_Type = None       # pydantic metadata model for the detected category
        self.DocumentTypeScheme = None  # raw DocumentTypeSchema from detection
        self.url = None
        self.chunks = None
        self.vector_store = None
        self.index = None
        self.namespace = None
        self.retriever = None
        self.metadataservice = MetadataService()
        print("[RAGService] Initialization complete.")

    def _init_models(self):
        """Initialize the chat LLM (gemini) and the shared embedding model."""
        print("[RAGService] Loading LLM model (gemini)...")
        self.model_loader = ModelLoader(model_provider="gemini")
        self.llm = self.model_loader.load_llm()
        print("[RAGService] LLM model loaded.")
        print("[RAGService] Loading embedding model (huggingface)...")
        # Shared process-wide instance — see get_models().
        self.embedding_model = get_models()
        print("[RAGService] Embedding model loaded.")

    def load_and_split_document(self, type: str, path: str = None, url: str = None):
        """Load a document, detect its category, and split it into chunks.

        Args:
            type: "pdf" or "word". (Name kept for caller compatibility even
                though it shadows the builtin `type`.)
            path: local file path (PDF or Word).
            url: remote URL (PDF only).

        Raises:
            ValueError: unsupported `type`, or missing/unsupported source.
        """
        print(f"[RAGService] Loading document. Type: {type}, Path: {path}, URL: {url}")
        file_loader = FileLoader(llm=self.llm)
        if type == "pdf":
            if path:
                print(f"[RAGService] Loading PDF from path: {path}")
                doc = file_loader.load_pdf(path)
            elif url:
                print(f"[RAGService] Loading PDF from URL: {url}")
                doc = file_loader.load_documents_from_url(url)
            else:
                print("[RAGService] Error: Either path or url must be provided for PDF.")
                raise ValueError("Either path or url must be provided for PDF.")
        elif type == "word":
            if path:
                print(f"[RAGService] Loading Word document from path: {path}")
                doc = file_loader.load_word_document(path)
            elif url:
                print("[RAGService] Error: URL loading not supported for Word documents.")
                raise ValueError("URL loading not supported for Word documents.")
            else:
                print("[RAGService] Error: Path must be provided for Word document.")
                raise ValueError("Path must be provided for Word document.")
        else:
            print("[RAGService] Error: Unsupported document type.")
            raise ValueError("Unsupported document type. Use 'pdf' or 'word'.")

        print("[RAGService] Detecting document type scheme...")
        # Only the first two pages are used for classification.
        self.DocumentTypeScheme = file_loader.detect_document_type(doc[0:2])
        print(f"[RAGService] Document type scheme detected: {self.DocumentTypeScheme}")
        self.Document_Type = self.metadataservice.Return_document_model(self.DocumentTypeScheme)
        print(f"[RAGService] Document type model: {self.Document_Type}")
        self.splitter = splitting_text(documentTypeSchema=self.Document_Type, llm=self.llm)
        print("[RAGService] Splitting document into chunks...")
        self.chunks = self.splitter.text_splitting(doc)
        print(f"[RAGService] Total chunks created: {len(self.chunks)}")

    def create_query_embedding(self, query: str):
        """Embed the query and build Pinecone-compatible filter metadata for it."""
        print("[RAGService] Creating query embedding...")
        self.query_embedder = QueryEmbedding(query=query, embedding_model=self.embedding_model)
        self.query_embedding = self.query_embedder.get_embedding()
        print(f"[RAGService] Query embedding created: {self.query_embedding}")
        langchain_doc = Document(page_content=query)
        print("[RAGService] Extracting metadata for the query...")
        self.metadataExtractor = MetadataExtractor(llm=self.llm)
        # FIX: explicit UTF-8 — keyword files may contain non-ASCII terms and
        # the platform default encoding (e.g. cp1252 on Windows) would break
        # json.load here.
        with open(self.splitter.Keywordsfile_path, "r", encoding="utf-8") as f:
            known_keywords = json.load(f)
        raw_metadata = self.metadataExtractor.extractMetadata_query(self.Document_Type, langchain_doc, known_keywords=known_keywords)
        print(f"[RAGService] Query metadata extracted: {raw_metadata}")
        # Convert to dictionary and format for Pinecone
        metadata_dict = raw_metadata.model_dump(exclude_none=True)
        formatted_metadata = self.metadataservice.format_metadata_for_pinecone(metadata_dict)

        # Remove free-text / non-filterable fields that cause Pinecone filter
        # serialization issues.
        self.query_metadata = {
            k: v for k, v in formatted_metadata.items()
            if k not in ["obligations", "exclusions", "notes", "added_new_keyword"]
        }

        print(f"[RAGService] Query metadata type: {type(self.query_metadata)}")
        print(f"[RAGService] Query metadata: {self.query_metadata}")

    def create_vector_store(self):
        """Embed the chunks and upsert them into a fresh Pinecone namespace."""
        print("[RAGService] Creating vector store...")
        self.vector_store = VectorStore(self.chunks, self.embedding_model)
        self.index, self.namespace = self.vector_store.create_vectorestore()
        print(f"[RAGService] Vector store created. Index: {self.index}, Namespace: {self.namespace}")

    def retrive_documents(self):
        """Run metadata-filtered similarity search; stores hits in self.result.

        (Name spelling kept for caller compatibility.)
        """
        print("[RAGService] Retrieving documents from vector store...")
        self.retriever = Retriever(self.index, self.query_embedding, self.query_metadata, self.namespace)
        self.result = self.retriever.retrieval_from_pinecone_vectoreStore()
        print(f"[RAGService] Retrieval result: {self.result}")

    def answer_query(self, raw_query: str) -> str:
        """Answer `raw_query` with the LLM, grounded in the retrieved clauses.

        Must be called after retrive_documents() has populated self.result.
        Returns the answer text (always a plain string).
        """
        print(f"[RAGService] Answering query: {raw_query}")
        top_clause = self.result['matches']
        top_clause_dicts = [r.to_dict() for r in top_clause]
        self.top_clauses = top_clause_dicts
        # Strip noisy/implementation metadata so the prompt stays compact.
        keys_to_remove = {"file_path", "source", "producer", "keywords", "subject", "added_new_keyword", "author", "chunk_id"}
        for r in top_clause_dicts:
            meta = r.get("metadata", {})
            for k in keys_to_remove:
                meta.pop(k, None)

        context_clauses = json.dumps(top_clause_dicts, separators=(",", ":"))

        print(f"context_clauses: {context_clauses}")

        # FIX: dropped the no-op `"".join(context_clauses)` — joining a str
        # with "" returns the same str; interpolate it directly.
        prompt = f"""
        You are a legal/insurance domain expert and policy analyst.
        Use the following extracted clauses from policy documents to answer the question.
        If you can't find the answer, say "I don't know".
        Context clauses:
        {context_clauses}
        Question: {raw_query}
        """
        print("[RAGService] Invoking LLM with prompt...")
        response = self.llm.invoke(prompt)
        print(f"[RAGService] LLM response: {response}")

        # Chat models return a message object; embeddings/fallbacks may not.
        if hasattr(response, 'content'):
            return response.content
        elif isinstance(response, str):
            return response
        else:
            return str(response)
|
app/services/__init__.py
ADDED
|
File without changes
|
app/utils/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# This file is automatically created to mark the directory as a package.
|
app/utils/config_loader.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import yaml
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
def load_config(config_path: str = "app/config/config.yaml") -> dict:
    """Load the application's YAML configuration.

    Args:
        config_path: Path to the YAML config file, relative to the
            process working directory (the repo root in deployment).

    Returns:
        The parsed configuration as a dict (or whatever the YAML top
        level holds; an empty file yields None from safe_load).
    """
    # FIX: explicit encoding — the platform default (e.g. cp1252 on
    # Windows) can mis-decode non-ASCII config values.
    with open(config_path, "r", encoding="utf-8") as file:
        config = yaml.safe_load(file)
    return config
|
app/utils/document_op.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
class DocumentOperation:
    """File-related helpers for the ingestion pipeline."""

    # Lower-cased extension -> loader category used by the rest of the app.
    _EXTENSION_MAP = {
        ".txt": "text",
        ".pdf": "pdf",
        ".doc": "word",
        ".docx": "word",
    }

    @staticmethod
    def get_file_type_by_extension(filename):
        """Classify *filename* by extension.

        Returns one of "text", "pdf", "word", or "unknown" for anything else
        (including files without an extension). Case-insensitive.
        """
        _, extension = os.path.splitext(filename)
        return DocumentOperation._EXTENSION_MAP.get(extension.lower(), "unknown")
|
app/utils/embedding_manager.py
ADDED
|
File without changes
|
app/utils/logger.py
ADDED
|
File without changes
|
app/utils/metadata_utils.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.schemas.metadata_schema import InsuranceMetadata, CommonMetaData
|
| 2 |
+
from app.schemas.request_models import DocumentTypeSchema
|
| 3 |
+
class MetadataService:
    """Maps document categories to metadata models and massages metadata
    dicts into shapes Pinecone and the extractor expect."""

    def __init__(self):
        # Insurance has a specialized schema; every other category currently
        # falls back to the shared CommonMetaData model.
        self.metadata_models = {
            "Insurance": InsuranceMetadata,
            "HR/Employment": CommonMetaData,
            "Legal/Compliance": CommonMetaData,
            "Financial/Regulatory": CommonMetaData,
            "Government/Public Policy": CommonMetaData,
            "Technical/IT Policies": CommonMetaData,
        }

    @staticmethod
    def format_metadata_for_pinecone(metadata: dict) -> dict:
        """Convert non-empty list values into Pinecone `$in` filter clauses.

        Scalars and empty lists pass through unchanged.
        """
        return {
            key: {"$in": value} if isinstance(value, list) and value else value
            for key, value in metadata.items()
        }

    def Return_document_model(self, doc_type_schema: DocumentTypeSchema):
        """Return the metadata model class for the given document type.

        Args:
            doc_type_schema: DocumentTypeSchema carrying the detected category.

        Returns:
            The pydantic model class for that category; CommonMetaData for
            unknown categories.
        """
        return self.metadata_models.get(doc_type_schema.document_types, CommonMetaData)

    @staticmethod
    def normalize_dict_to_lists(metadata: dict) -> dict:
        """Wrap every non-list value in a list; None becomes an empty list."""
        normalized = {}
        for key, value in metadata.items():
            if value is None:
                normalized[key] = []
            elif isinstance(value, list):
                normalized[key] = value
            else:
                normalized[key] = [value]
        return normalized
|
app/utils/model_loader.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# from config_loader import
|
| 2 |
+
import os
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
from pydantic import BaseModel, Field
|
| 5 |
+
from typing import Literal, Optional,Any
|
| 6 |
+
from app.utils.config_loader import load_config
|
| 7 |
+
from langchain_groq import ChatGroq
|
| 8 |
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 9 |
+
from dotenv import load_dotenv
|
| 10 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 11 |
+
# from langchain_openai import OpenAIEmbeddings
|
| 12 |
+
from langchain_community.embeddings import OpenAIEmbeddings
|
| 13 |
+
class ConfigLoader:
    """Dict-like wrapper that loads config.yaml once at construction."""

    def __init__(self):
        print(f"Loading config....")
        self.config = load_config()

    def __getitem__(self, key):
        # Dictionary-style access delegates straight to the parsed config.
        return self.config[key]
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class ModelLoader(BaseModel):
    """Loads a chat LLM or embedding model for the configured provider.

    NOTE(review): despite the method name `load_llm`, the "openai" and
    "huggingface" branches return *embedding* models — this matches how
    callers in RAGService use them, but the naming is misleading.
    """

    model_provider: Literal["groq", "gemini", "openai", "gemini_lite", "huggingface"] = "gemini"
    # Populated automatically in model_post_init; excluded from serialization
    # because ConfigLoader is not a pydantic type.
    config: Optional[ConfigLoader] = Field(default=None, exclude=True)

    def model_post_init(self, __context: Any) -> None:
        # Pydantic v2 hook that runs after model creation: every ModelLoader
        # instance gets its configuration loaded without caller effort.
        self.config = ConfigLoader()

    class Config:
        arbitrary_types_allowed = True  # allow the non-pydantic ConfigLoader field

    def load_llm(self):
        """Load and return the model for `self.model_provider`.

        Returns:
            A chat model (groq/gemini/gemini_lite) or an embeddings model
            (openai/huggingface).

        Raises:
            ValueError: if the provider is unsupported, or if the provider's
                API key environment variable is missing (huggingface).
        """
        print("LLM loading...")
        print("Loading model from provider: ")
        if self.model_provider == "groq":
            print("Loading model from GROQ:")
            groq_api_key = os.getenv("GROQ_API_KEY")
            model_name = self.config["llm"]["groq"]["model_name"]
            llm = ChatGroq(model=model_name, api_key=groq_api_key)

        elif self.model_provider == "gemini":
            print("Loading model from gemini:")
            load_dotenv()
            gemini_api_key = os.getenv("GEMINI_API_KEY")
            model_name = self.config["llm"]["gemini"]["model_name"]
            llm = ChatGoogleGenerativeAI(
                model=model_name,
                google_api_key=gemini_api_key
            )
        elif self.model_provider == "gemini_lite":
            print("Loading model from gemini-flash-lite:")
            load_dotenv()
            gemini_api_key = os.getenv("GEMINI_API_KEY")
            model_name = self.config["llm"]["gemini_lite"]["model_name"]
            llm = ChatGoogleGenerativeAI(
                model=model_name,
                google_api_key=gemini_api_key
            )
        elif self.model_provider == "openai":
            load_dotenv()
            print("Loading model from openai:")
            api_key = os.getenv("OPENAI_API_KEY")
            model_name = self.config["embedding_model"]["openai"]["model_name"]
            llm = OpenAIEmbeddings(model=model_name, api_key=api_key)
        elif self.model_provider == "huggingface":
            load_dotenv()
            print("Loading model from huggingface:")
            api_key = os.getenv("HF_TOKEN")
            # SECURITY FIX: never print the token itself; log only presence.
            print("HF_TOKEN present:", api_key is not None)
            # FIX: guard against a missing token — assigning None into
            # os.environ raises a confusing TypeError.
            if api_key is None:
                raise ValueError("HF_TOKEN environment variable is not set.")
            os.environ["HF_TOKEN"] = api_key  # ensure the token is visible to HF libs
            model_name = self.config["embedding_model"]["huggingface"]["model_name"]
            llm = HuggingFaceEmbeddings(model=model_name)
        else:
            raise ValueError(f"Unsupported model provider: {self.model_provider}")
        return llm
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
|
experiments.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
main.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, HTTPException, Depends, UploadFile, File, Form
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
import uvicorn
import os
import tempfile
from typing import Optional
import uuid
from datetime import datetime

from app.api.v1.routes import router as api_router
from app.core.session_manager import session_manager
from app.config.config import get_settings

# Initialize FastAPI app
app = FastAPI(
    title="ClariDoc API",
    description="Professional Document Analysis & RAG Platform API",
    version="1.0.0"
)

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # TODO: restrict origins before production deployment
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.on_event("startup")
async def startup_event():
    """Initialize the session database on startup (best-effort).

    Failures are logged, not fatal, so /health can still report the problem.
    """
    try:
        # Test database connection
        session_manager.db.init_db()
        print("Database connection verified successfully")
    except Exception as e:
        print(f"Warning: Database initialization failed: {e}")

# Include API routes
app.include_router(api_router, prefix="/api/v1")

@app.get("/")
async def root():
    """Service banner endpoint."""
    return {
        "message": "ClariDoc API",
        "status": "running",
        "description": "Professional Document Analysis & RAG Platform"
    }

@app.get("/health")
async def health_check():
    """Health check endpoint for Docker and monitoring.

    Reports DB connectivity separately; the overall status stays "healthy"
    so container health checks only fail when the app itself is down.
    """
    try:
        # Test database connection
        session_manager.db.init_db()
        db_status = "healthy"
    except Exception:
        db_status = "unhealthy"

    return {
        "status": "healthy",
        "service": "ClariDoc FastAPI Backend",
        "database": db_status,
        "timestamp": datetime.now()
    }

if __name__ == "__main__":
    uvicorn.run(
        # FIX: this top-level main.py defines `app`; the previous target
        # "app.main:app" pointed at app/main.py, which this commit adds as
        # an empty file, so direct execution could not boot the server.
        "main:app",
        host="0.0.0.0",
        port=int(os.getenv("PORT", 8000)),
        reload=False,  # Disable reload in production
        log_level="info"
    )
|
pyproject.toml
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "rag-app"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "ClariDoc: professional document analysis & RAG platform backend"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.12,<3.13"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"acres==0.5.0",
|
| 9 |
+
"aiohappyeyeballs==2.6.1",
|
| 10 |
+
"aiohttp==3.12.15",
|
| 11 |
+
"aiohttp-retry==2.9.1",
|
| 12 |
+
"aiosignal==1.4.0",
|
| 13 |
+
"annotated-types==0.7.0",
|
| 14 |
+
"anyio==4.10.0",
|
| 15 |
+
"asttokens==3.0.0",
|
| 16 |
+
"attrs==25.3.0",
|
| 17 |
+
"beautifulsoup4==4.13.4",
|
| 18 |
+
"cachetools==5.5.2",
|
| 19 |
+
"certifi==2025.8.3",
|
| 20 |
+
"cffi==1.17.1",
|
| 21 |
+
"charset-normalizer==3.4.3",
|
| 22 |
+
"ci-info==0.3.0",
|
| 23 |
+
"click==8.2.1",
|
| 24 |
+
"colorama==0.4.6",
|
| 25 |
+
"colorclass==2.2.2",
|
| 26 |
+
"comm==0.2.3",
|
| 27 |
+
"compressed-rtf==1.0.7",
|
| 28 |
+
"configobj==5.0.9",
|
| 29 |
+
"configparser==7.2.0",
|
| 30 |
+
"cryptography==45.0.6",
|
| 31 |
+
"dataclasses-json==0.6.7",
|
| 32 |
+
"debugpy==1.8.16",
|
| 33 |
+
"decorator==5.2.1",
|
| 34 |
+
"distro==1.9.0",
|
| 35 |
+
"docx==0.2.4",
|
| 36 |
+
"docx2txt>=0.9",
|
| 37 |
+
"easygui==0.98.3",
|
| 38 |
+
"ebcdic==1.1.1",
|
| 39 |
+
"etelemetry==0.3.1",
|
| 40 |
+
"executing==2.2.0",
|
| 41 |
+
"extract-msg==0.55.0",
|
| 42 |
+
"fastapi==0.116.1",
|
| 43 |
+
"filelock==3.18.0",
|
| 44 |
+
"filetype==1.2.0",
|
| 45 |
+
"frozenlist==1.7.0",
|
| 46 |
+
"google-ai-generativelanguage==0.6.18",
|
| 47 |
+
"google-api-core==2.25.1",
|
| 48 |
+
"google-auth==2.40.3",
|
| 49 |
+
"googleapis-common-protos==1.70.0",
|
| 50 |
+
"greenlet==3.2.4",
|
| 51 |
+
"groq==0.31.0",
|
| 52 |
+
"grpcio==1.74.0",
|
| 53 |
+
"grpcio-status==1.74.0",
|
| 54 |
+
"h11==0.16.0",
|
| 55 |
+
"hf-xet>=1.1.8",
|
| 56 |
+
"httpcore==1.0.9",
|
| 57 |
+
"httplib2==0.22.0",
|
| 58 |
+
"httpx==0.28.1",
|
| 59 |
+
"httpx-sse==0.4.1",
|
| 60 |
+
"huggingface-hub[cli]>=0.34.4",
|
| 61 |
+
"idna==3.10",
|
| 62 |
+
"iniconfig==2.1.0",
|
| 63 |
+
"ipykernel==6.30.1",
|
| 64 |
+
"ipython==9.4.0",
|
| 65 |
+
"ipython-pygments-lexers==1.1.1",
|
| 66 |
+
"jedi==0.19.2",
|
| 67 |
+
"jiter==0.10.0",
|
| 68 |
+
"joblib==1.5.1",
|
| 69 |
+
"jsonpatch==1.33",
|
| 70 |
+
"jsonpointer==3.0.0",
|
| 71 |
+
"jupyter-client==8.6.3",
|
| 72 |
+
"jupyter-core==5.8.1",
|
| 73 |
+
"langchain==0.3.27",
|
| 74 |
+
"langchain-community==0.3.27",
|
| 75 |
+
"langchain-core==0.3.74",
|
| 76 |
+
"langchain-google-genai==2.1.9",
|
| 77 |
+
"langchain-groq==0.3.7",
|
| 78 |
+
"langchain-huggingface>=0.3.1",
|
| 79 |
+
"langchain-openai==0.3.29",
|
| 80 |
+
"langchain-pinecone==0.2.11",
|
| 81 |
+
"langchain-tests==0.3.20",
|
| 82 |
+
"langchain-text-splitters==0.3.9",
|
| 83 |
+
"langextract>=1.0.8",
|
| 84 |
+
"langsmith==0.4.13",
|
| 85 |
+
"lark==1.1.9",
|
| 86 |
+
"looseversion==1.3.0",
|
| 87 |
+
"lxml==6.0.0",
|
| 88 |
+
"markdown-it-py==4.0.0",
|
| 89 |
+
"marshmallow==3.26.1",
|
| 90 |
+
"matplotlib-inline==0.1.7",
|
| 91 |
+
"mdurl==0.1.2",
|
| 92 |
+
"msoffcrypto-tool==5.4.2",
|
| 93 |
+
"multidict==6.6.4",
|
| 94 |
+
"mypy-extensions==1.1.0",
|
| 95 |
+
"nest-asyncio==1.6.0",
|
| 96 |
+
"networkx==3.5",
|
| 97 |
+
"nibabel==5.3.2",
|
| 98 |
+
"nipype==1.10.0",
|
| 99 |
+
"numpy==2.3.2",
|
| 100 |
+
"olefile==0.47",
|
| 101 |
+
"oletools==0.60.2",
|
| 102 |
+
"openai==1.99.7",
|
| 103 |
+
"orjson==3.11.1",
|
| 104 |
+
"packaging==24.2",
|
| 105 |
+
"pandas==2.3.1",
|
| 106 |
+
"parso==0.8.4",
|
| 107 |
+
"pathlib==1.0.1",
|
| 108 |
+
"pcodedmp==1.2.6",
|
| 109 |
+
"pillow==11.3.0",
|
| 110 |
+
"pinecone==7.3.0",
|
| 111 |
+
"pinecone-plugin-assistant==1.7.0",
|
| 112 |
+
"pinecone-plugin-interface==0.0.7",
|
| 113 |
+
"platformdirs==4.3.8",
|
| 114 |
+
"pluggy==1.6.0",
|
| 115 |
+
"prompt-toolkit==3.0.51",
|
| 116 |
+
"propcache==0.3.2",
|
| 117 |
+
"proto-plus==1.26.1",
|
| 118 |
+
"protobuf==6.31.1",
|
| 119 |
+
"prov==2.1.1",
|
| 120 |
+
"psutil==7.0.0",
|
| 121 |
+
"pure-eval==0.2.3",
|
| 122 |
+
"puremagic==1.30",
|
| 123 |
+
"py-cpuinfo==9.0.0",
|
| 124 |
+
"pyasn1==0.6.1",
|
| 125 |
+
"pyasn1-modules==0.4.2",
|
| 126 |
+
"pycparser==2.22",
|
| 127 |
+
"pydantic==2.11.7",
|
| 128 |
+
"pydantic-core==2.33.2",
|
| 129 |
+
"pydantic-settings==2.10.1",
|
| 130 |
+
"pydot==4.0.1",
|
| 131 |
+
"pygments==2.19.2",
|
| 132 |
+
"pymupdf==1.26.3",
|
| 133 |
+
"pyparsing==3.2.3",
|
| 134 |
+
"pypdf>=6.0.0",
|
| 135 |
+
"pytest==8.4.1",
|
| 136 |
+
"pytest-asyncio==0.26.0",
|
| 137 |
+
"pytest-benchmark==5.1.0",
|
| 138 |
+
"pytest-codspeed==4.0.0",
|
| 139 |
+
"pytest-recording==0.13.4",
|
| 140 |
+
"pytest-socket==0.7.0",
|
| 141 |
+
"python-dateutil==2.9.0.post0",
|
| 142 |
+
"python-docx>=1.2.0",
|
| 143 |
+
"python-dotenv==1.1.1",
|
| 144 |
+
"python-magic>=0.4.27",
|
| 145 |
+
"python-magic-bin>=0.4.14",
|
| 146 |
+
"python-multipart>=0.0.20",
|
| 147 |
+
"pytz==2025.2",
|
| 148 |
+
"pyxnat==1.6.3",
|
| 149 |
+
"pyyaml==6.0.2",
|
| 150 |
+
"pyzmq==27.0.1",
|
| 151 |
+
"rdflib==7.1.4",
|
| 152 |
+
"red-black-tree-mod==1.22",
|
| 153 |
+
"regex==2025.7.34",
|
| 154 |
+
"requests==2.32.4",
|
| 155 |
+
"requests-toolbelt==1.0.0",
|
| 156 |
+
"rich==14.1.0",
|
| 157 |
+
"rsa==4.9.1",
|
| 158 |
+
"rtfde==0.1.2.1",
|
| 159 |
+
"scikit-learn==1.7.1",
|
| 160 |
+
"scipy==1.16.1",
|
| 161 |
+
"sentence-transformers>=5.1.0",
|
| 162 |
+
"simplejson==3.20.1",
|
| 163 |
+
"six==1.17.0",
|
| 164 |
+
"sniffio==1.3.1",
|
| 165 |
+
"soupsieve==2.7",
|
| 166 |
+
"sqlalchemy==2.0.42",
|
| 167 |
+
"stack-data==0.6.3",
|
| 168 |
+
"starlette==0.47.2",
|
| 169 |
+
"streamlit>=1.49.1",
|
| 170 |
+
"syrupy==4.9.1",
|
| 171 |
+
"tenacity==9.1.2",
|
| 172 |
+
"threadpoolctl==3.6.0",
|
| 173 |
+
"tiktoken==0.11.0",
|
| 174 |
+
"tornado==6.5.2",
|
| 175 |
+
"tqdm==4.67.1",
|
| 176 |
+
"traitlets==5.14.3",
|
| 177 |
+
"traits==7.0.2",
|
| 178 |
+
"typing-extensions==4.14.1",
|
| 179 |
+
"typing-inspect==0.9.0",
|
| 180 |
+
"typing-inspection==0.4.1",
|
| 181 |
+
"tzdata==2025.2",
|
| 182 |
+
"tzlocal==5.3.1",
|
| 183 |
+
"urllib3<2",
|
| 184 |
+
"uvicorn>=0.35.0",
|
| 185 |
+
"vcrpy==7.0.0",
|
| 186 |
+
"wcwidth==0.2.13",
|
| 187 |
+
"win-unicode-console==0.5",
|
| 188 |
+
"wrapt==1.17.2",
|
| 189 |
+
"yarl==1.20.1",
|
| 190 |
+
"zstandard==0.23.0",
|
| 191 |
+
]
|