Kshitijk20 commited on
Commit
e5b884f
·
1 Parent(s): f60655e

code push

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +17 -0
  2. .python-version +1 -0
  3. Best.md +2 -0
  4. Dockerfile +30 -0
  5. app/__init__.py +1 -0
  6. app/api/__init__.py +0 -0
  7. app/api/deps.py +0 -0
  8. app/api/v1/__init__.py +0 -0
  9. app/api/v1/routes.py +255 -0
  10. app/config/__init__.py +1 -0
  11. app/config/config.py +37 -0
  12. app/config/config.yaml +20 -0
  13. app/core/__init__.py +0 -0
  14. app/core/session_manager.py +198 -0
  15. app/data/__init__.py +0 -0
  16. app/database/database.py +177 -0
  17. app/database/sessions.db +0 -0
  18. app/embedding/__init__.py +1 -0
  19. app/embedding/embeder.py +12 -0
  20. app/embedding/vectore_store.py +40 -0
  21. app/ingestion/__init__.py +1 -0
  22. app/ingestion/file_loader.py +80 -0
  23. app/ingestion/text_splitter.py +99 -0
  24. app/main.py +0 -0
  25. app/metadata_extraction/__init__.py +0 -0
  26. app/metadata_extraction/metadata_ext.py +114 -0
  27. app/prompts/__init__.py +0 -0
  28. app/prompts/prompts.py +7 -0
  29. app/reseasoning/__init__.py +1 -0
  30. app/reseasoning/descision_maker.py +55 -0
  31. app/reseasoning/query_parser.py +18 -0
  32. app/retrieval/__init__.py +1 -0
  33. app/retrieval/reranker.py +0 -0
  34. app/retrieval/retriever.py +55 -0
  35. app/schemas/__init__.py +1 -0
  36. app/schemas/metadata_schema.py +75 -0
  37. app/schemas/request_models.py +77 -0
  38. app/schemas/response_models.py +32 -0
  39. app/services/RAG_service.py +165 -0
  40. app/services/__init__.py +0 -0
  41. app/utils/__init__.py +1 -0
  42. app/utils/config_loader.py +8 -0
  43. app/utils/document_op.py +14 -0
  44. app/utils/embedding_manager.py +0 -0
  45. app/utils/logger.py +0 -0
  46. app/utils/metadata_utils.py +51 -0
  47. app/utils/model_loader.py +83 -0
  48. experiments.ipynb +0 -0
  49. main.py +76 -0
  50. pyproject.toml +191 -0
.gitignore ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+ .env
12
+ env
13
+ .env/
14
+ hackrx_rag_app
15
+ hackrx_rag_app/
16
+ .hackrx_rag_app
17
+ hackrx_rag_app/
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.12
Best.md ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ ## Text splitting techniques
2
+ 1.
Dockerfile ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.12-slim

# Set working directory
WORKDIR /app

# curl is required by the HEALTHCHECK below; python:3.12-slim does not include it,
# so without this install the healthcheck would always fail.
RUN apt-get update \
    && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for better caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the entire application
COPY . .

# Create directories the app writes to at runtime
RUN mkdir -p app/uploads app/data

# Set environment variables
ENV PYTHONPATH=/app
ENV PORT=7860

# Expose the port that Hugging Face Spaces expects
EXPOSE 7860

# Health check (uses the curl installed above)
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:7860/health || exit 1

# Command to run the FastAPI application
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
app/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # This file is automatically created to mark the directory as a package.
app/api/__init__.py ADDED
File without changes
app/api/deps.py ADDED
File without changes
app/api/v1/__init__.py ADDED
File without changes
app/api/v1/routes.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, HTTPException, Depends, UploadFile, File, Form
2
+ from fastapi.responses import JSONResponse
3
+ from typing import Optional
4
+ import os
5
+ import tempfile
6
+ from pathlib import Path
7
+ from app.core.session_manager import SessionManager, Session, session_manager
8
+ from app.services.RAG_service import RAGService
9
+ from app.schemas.request_models import QueryRequest
10
+ from app.schemas.response_models import SessionResponse, QueryResponse, UploadResponse, SourceDocument
11
+ from app.config.config import get_settings
12
+
13
+ router = APIRouter()
14
+
15
def get_session(session_id: str) -> Session:
    """FastAPI dependency: resolve *session_id* to a live Session.

    Raises:
        HTTPException: 404 when the session is unknown or has expired.
    """
    print(f"[DEBUG] Looking for session: {session_id}")
    print(f"[DEBUG] Available sessions: {list(session_manager.sessions.keys())}")
    session = session_manager.get_session(session_id=session_id)
    if session:
        print(f"[DEBUG] Session found: {session_id}")
        return session
    # Fall through: no live or restorable session under this id
    print(f"[DEBUG] Session not found: {session_id}")
    raise HTTPException(status_code=404, detail="Session not found or expired")
26
+
27
@router.post("/session", response_model=SessionResponse)
async def create_session(username: str = "anonymous"):
    """Create a new session for document processing and return its id."""
    session_id = session_manager.create_session(username)
    print(f"[DEBUG] Created session: {session_id} for user: {username}")
    print(f"[DEBUG] Total sessions now: {len(session_manager.sessions)}")
    response = SessionResponse(
        session_id=session_id,
        message="Session created successfully",
    )
    return response
37
+
38
@router.get("/sessions/{username}")
async def get_user_sessions(username: str):
    """List every persisted session belonging to *username*."""
    return {"sessions": session_manager.get_user_sessions(username)}
43
+
44
@router.post("/session/{session_id}/restore")
async def restore_session(session_id: str):
    """Re-hydrate a persisted session from the database.

    Raises:
        HTTPException: 404 when the session is missing or inactive.
    """
    if not session_manager.restore_session(session_id):
        raise HTTPException(status_code=404, detail="Session not found or inactive")
    return {"message": "Session restored successfully"}
52
+
53
@router.delete("/session/{session_id}")
async def delete_session(session_id: str):
    """Drop a session from memory and mark it inactive in the database."""
    session_manager.delete_session(session_id)
    return {"message": "Session deleted successfully"}
58
+
59
def _cleanup_temp_file(path) -> None:
    """Best-effort removal of a staged temporary upload file; never raises."""
    if path:
        try:
            os.unlink(path)
        except OSError:
            pass


@router.post("/upload/{session_id}", response_model=UploadResponse)
async def upload_document(
    session_id: str,
    file: UploadFile = File(None),
    url: str = Form(None),
    doc_type: str = Form(None),
    session: Session = Depends(get_session)
):
    """Upload and process a document from an uploaded file or a URL.

    Exactly one of *file* / *url* must be supplied. The document is loaded,
    split into chunks, embedded into a vector store, and the session record
    is updated with the resulting document metadata.

    Raises:
        HTTPException: 400 for input validation failures, 500 for pipeline errors.
    """
    settings = get_settings()

    # Validate input - either file or URL must be provided, but not both
    if not file and not url:
        raise HTTPException(status_code=400, detail="Either file or URL must be provided")
    if file and url:
        raise HTTPException(status_code=400, detail="Provide either file or URL, not both")

    document_path = None
    try:
        # Each upload gets a fresh RAG pipeline bound to this session
        session.rag_service = RAGService()

        document_name = ""
        document_url = None
        content = b""

        if file:
            # Handle file upload
            file_extension = Path(file.filename).suffix.lower()
            if file_extension not in settings.allowed_file_types:
                raise HTTPException(
                    status_code=400,
                    detail=f"File type {file_extension} not allowed. Allowed types: {settings.allowed_file_types}"
                )

            # Auto-detect document type if not provided
            if not doc_type:
                doc_type = "pdf" if file_extension == ".pdf" else "word"

            # Validate file size
            content = await file.read()
            if len(content) > settings.max_file_size:
                raise HTTPException(
                    status_code=400,
                    detail=f"File size too large. Maximum size: {settings.max_file_size} bytes"
                )

            # Ensure the upload directory exists (the document itself is staged
            # through a NamedTemporaryFile below and removed after processing)
            Path(settings.upload_dir).mkdir(exist_ok=True)

            # Save file temporarily
            with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as tmp_file:
                tmp_file.write(content)
                document_path = tmp_file.name

            document_name = file.filename

            # Load and split document from file
            session.rag_service.load_and_split_document(
                type=doc_type,
                path=document_path
            )
        else:
            # Handle URL upload; URLs default to PDF when no type was given
            if not doc_type:
                doc_type = "pdf"

            document_name = url.split('/')[-1] or "URL Document"
            document_url = url

            # Load and split document from URL
            session.rag_service.load_and_split_document(
                type=doc_type,
                url=url
            )

        # Create vector store
        session.rag_service.create_vector_store()

        # Update in-memory session state
        session.document_uploaded = True
        session.vector_store_created = True
        session.document_info = {
            "filename": document_name,
            "type": doc_type,
            "size": len(content) if file else 0,
            "chunks_count": len(session.rag_service.chunks)
        }

        # Persist document metadata so the session can be restored later
        session_manager.update_session_document(
            session_id=session_id,
            document_name=document_name,
            document_type=doc_type,
            chunks_count=len(session.rag_service.chunks),
            pinecone_index=str(session.rag_service.index),
            pinecone_namespace=session.rag_service.namespace,
            document_path=document_path,
            document_url=document_url
        )

        _cleanup_temp_file(document_path)

        return UploadResponse(
            session_id=session_id,
            filename=document_name,
            document_type=doc_type,
            chunks_created=len(session.rag_service.chunks),
            message="Document uploaded and processed successfully"
        )

    except HTTPException:
        # Do not mask deliberate 4xx validation errors as a 500
        _cleanup_temp_file(document_path)
        raise
    except Exception as e:
        _cleanup_temp_file(document_path)
        raise HTTPException(status_code=500, detail=f"Error processing document: {str(e)}")
186
+
187
@router.post("/query/{session_id}", response_model=QueryResponse)
async def query_document(
    session_id: str,
    query_request: QueryRequest,
    session: Session = Depends(get_session)
):
    """Answer a query against the document uploaded to this session.

    Returns the generated answer, any query metadata extracted by the RAG
    service, and up to three supporting source chunks for the UI.

    Raises:
        HTTPException: 400 when no document has been processed yet,
            500 on RAG pipeline errors.
    """
    if not session.document_uploaded or not session.vector_store_created:
        raise HTTPException(
            status_code=400,
            # fixed typo: "docuement" -> "document"
            detail="No document uploaded or processed for this session"
        )
    try:
        # Embed the query, retrieve matching chunks, then generate the answer
        session.rag_service.create_query_embedding(query_request.query)
        session.rag_service.retrive_documents()
        answer = session.rag_service.answer_query(query_request.query)

        # Optional metadata the RAG service may have extracted from the query
        query_metadata = getattr(session.rag_service, 'query_metadata', {})

        # Surface the top-3 retrieved chunks as source documents for the UI
        sources = []
        if hasattr(session.rag_service, 'result') and session.rag_service.result:
            for match in session.rag_service.result.get('matches', [])[:3]:
                metadata = match.get('metadata', {})
                sources.append(SourceDocument(
                    doc_id=metadata.get('doc_id', match.get('id', '')),
                    page=metadata.get('page_no', metadata.get('page', 0)),
                    text=metadata.get('text', ''),
                    score=match.get('score', 0.0),
                    metadata=metadata
                ))

        return QueryResponse(
            session_id=session_id,
            query=query_request.query,
            answer=answer,
            query_metadata=query_metadata,
            sources=sources,
            message="Query processed successfully"
        )

    except HTTPException:
        # Preserve any deliberate HTTP errors instead of wrapping them as 500
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing query: {str(e)}")
237
+
238
@router.get("/session/{session_id}/status")
async def get_session_status(
    session_id: str,
    session: Session = Depends(get_session)
):
    """Report lifecycle and document state for a session."""
    status = {
        "session_id": session_id,
        "created_at": session.created_at,
        "last_activity": session.last_activity,
        "document_uploaded": session.document_uploaded,
        "vector_store_created": session.vector_store_created,
        "document_info": session.document_info,
    }
    return status
252
+
253
+
254
+
255
+
app/config/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # This file is automatically created to mark the directory as a package.
app/config/config.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic_settings import BaseSettings
2
+ from typing import Optional
3
+ import os
4
+ from dotenv import load_dotenv
5
+ load_dotenv()
6
class Settings(BaseSettings):
    """Application settings; pydantic-settings fills fields from env vars / .env."""

    # API Settings
    api_title: str = "RAG Document Analysis API"
    api_version: str = "1.0.0"

    # File Upload Settings
    max_file_size: int = 50 * 1024 * 1024  # 50MB
    allowed_file_types: list = [".pdf", ".docx", ".doc"]
    upload_dir: str = "app/uploads"

    # Session Settings
    session_timeout_minutes: int = 60

    # SQLite location; DATABASE_PATH env var overrides for read-only deployments
    database_path: str = os.getenv("DATABASE_PATH", "/tmp/claridoc_data/sessions.db")


    # API Keys (all optional; expected to come from the environment)
    gemini_api_key: Optional[str] = None
    pinecone_api_key: Optional[str] = None
    openai_api_key: Optional[str] = None
    groq_api_key: Optional[str] = None
    hf_token: Optional[str] = None

    # Environment
    environment: str = "development"
    debug: bool = True

    class Config:
        # also read values from a local .env file
        env_file = ".env"

def get_settings() -> Settings:
    """Return a fresh Settings instance.

    NOTE(review): not cached — every call re-reads the environment; consider
    functools.lru_cache if construction cost ever matters.
    """
    return Settings()
app/config/config.yaml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ llm:
2
+ groq:
3
+ provider: "groq"
4
+ model_name: "openai/gpt-oss-20b"
5
+ gemini:
6
+ provider: "gemini"
7
+ model_name: "gemini-2.5-flash"
8
+ gemini_lite:
9
+ provider: "gemini_lite"
10
+ model_name: "gemini-2.5-flash-lite"
11
+
12
+ embedding_model:
13
+ openai:
14
+ provider: "openai"
15
+ model_name: "text-embedding-3-small"
16
+ huggingface:
17
+ provider: "huggingface"
18
+ model_name: "mixedbread-ai/mxbai-embed-large-v1"
19
+
20
+
app/core/__init__.py ADDED
File without changes
app/core/session_manager.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import uuid
2
+ from typing import Dict, Optional
3
+ from datetime import datetime, timedelta
4
+ from app.services.RAG_service import RAGService
5
+ from app.database.database import SessionDatabase
6
+ from app.schemas.request_models import DocumentTypeSchema
7
+
8
class Session:
    """In-memory state for one user session: document flags plus RAG pipeline."""

    def __init__(self, session_id: str):
        self.session_id = session_id
        self.created_at = datetime.now()
        self.last_activity = datetime.now()
        # RAG pipeline is attached lazily, on first document upload/restore
        self.rag_service: Optional[RAGService] = None
        self.document_uploaded = False
        self.vector_store_created = False
        self.document_info = {}
        self.username = None

    def update_activity(self):
        """Record that the session was just used."""
        self.last_activity = datetime.now()

    def is_expired(self, timeout_minutes: int = 60) -> bool:
        """Return True when no activity was seen within *timeout_minutes*."""
        idle_for = datetime.now() - self.last_activity
        return idle_for > timedelta(minutes=timeout_minutes)
24
+
25
class SessionManager:
    """Owns the in-memory session cache and its SQLite persistence layer."""

    def __init__(self):
        self.sessions: Dict[str, Session] = {}
        self.db = SessionDatabase()

    def create_session(self, username: str = "anonymous") -> str:
        """Create a new session, registering it both in memory and in the DB."""
        session_id = str(uuid.uuid4())
        session = Session(session_id)
        session.username = username
        self.sessions[session_id] = session

        # Save to database
        self.db.create_session(session_id, username)
        return session_id

    def _restore_from_db(self, session_id: str) -> Optional[Session]:
        """Rebuild a Session (and its RAG service) from the database record.

        Shared by get_session() and restore_session() so the two restore
        paths cannot drift (they were previously duplicated line-for-line).
        Returns None when the session is unknown or inactive.
        """
        db_session = self.db.get_session(session_id)
        if not (db_session and db_session['is_active']):
            return None

        session = Session(session_id)
        session.username = db_session['username']
        session.document_uploaded = db_session['chunks_count'] > 0
        session.vector_store_created = db_session['pinecone_index'] is not None
        session.document_info = {
            'filename': db_session['document_name'],
            'type': db_session['document_type'],
            'chunks_count': db_session['chunks_count']
        }

        # Re-attach the RAG pipeline only when a vector store was persisted
        if session.vector_store_created:
            print(f"[SessionManager] Restoring RAG service for session {session_id}")
            session.rag_service = RAGService()

            # Restore the persisted vector-store coordinates
            session.rag_service.index = db_session['pinecone_index']
            session.rag_service.namespace = db_session['pinecone_namespace']
            session.rag_service.Document_path = db_session['document_path']
            session.rag_service.url = db_session['document_url']

            # Local imports to avoid import cycles at module load time
            from app.ingestion.text_splitter import splitting_text
            from app.utils.metadata_utils import MetadataService

            # Recreate the document type info; "Insurance" is the assumed
            # default for restored sessions (the real type is not persisted)
            metadataservice = MetadataService()
            mock_scheme = DocumentTypeSchema(document_types="Insurance")
            document_type = metadataservice.Return_document_model(mock_scheme)
            session.rag_service.DocumentTypeScheme = mock_scheme
            session.rag_service.Document_Type = document_type

            # Splitter instance carries the keywords-file path used downstream
            session.rag_service.splitter = splitting_text(
                documentTypeSchema=document_type, llm=session.rag_service.llm
            )

            # Reconstruct the expected keywords file path from the document name
            import os
            document_name = db_session['document_name'] or 'unknown'
            keywords_filename = document_name.replace(".", "").replace("\\", "").replace("/", "") + ".json"
            session.rag_service.splitter.Keywordsfile_path = os.path.join("app/data/", keywords_filename)

            print(f"[SessionManager] RAG service restored with index: {session.rag_service.index}")

        self.sessions[session_id] = session
        return session

    def get_session(self, session_id: str) -> Optional[Session]:
        """Return a live session; fall back to restoring it from the database."""
        if session_id in self.sessions:
            session = self.sessions[session_id]
            if not session.is_expired():
                session.update_activity()
                return session
            # Expired: evict and try a fresh restore below
            del self.sessions[session_id]
        return self._restore_from_db(session_id)

    def update_session_document(self, session_id: str, document_name: str,
                                document_type: str, chunks_count: int,
                                pinecone_index: str = None, pinecone_namespace: str = None,
                                document_path: str = None, document_url: str = None):
        """Persist document information and mirror it into the in-memory session."""
        self.db.update_session(
            session_id,
            document_name=document_name,
            document_type=document_type,
            chunks_count=chunks_count,
            pinecone_index=pinecone_index,
            pinecone_namespace=pinecone_namespace,
            document_path=document_path,
            document_url=document_url
        )

        # Update in-memory session if exists
        if session_id in self.sessions:
            session = self.sessions[session_id]
            session.document_uploaded = True
            session.vector_store_created = pinecone_index is not None
            session.document_info = {
                'filename': document_name,
                'type': document_type,
                'chunks_count': chunks_count
            }

    def get_user_sessions(self, username: str):
        """Get all sessions for a user."""
        return self.db.get_user_sessions(username)

    def restore_session(self, session_id: str) -> bool:
        """Restore a session from database; True on success."""
        return self._restore_from_db(session_id) is not None

    def delete_session(self, session_id: str):
        """Evict a session from memory and mark it inactive in the database."""
        if session_id in self.sessions:
            del self.sessions[session_id]
        self.db.deactivate_session(session_id)

    def cleanup_expired_sessions(self):
        """Drop every in-memory session whose inactivity timeout has elapsed."""
        expired_sessions = [
            sid for sid, session in self.sessions.items()
            if session.is_expired()
        ]
        for sid in expired_sessions:
            del self.sessions[sid]

session_manager = SessionManager()
198
+
app/data/__init__.py ADDED
File without changes
app/database/database.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlite3
2
+ import json
3
+ from datetime import datetime
4
+ from typing import List, Dict, Optional
5
+ import os
6
+ from pathlib import Path
7
+ from app.config.config import get_settings
8
+
9
class SessionDatabase:
    """SQLite persistence for sessions and users.

    The callers in SessionManager read flat keys (is_active, chunks_count,
    pinecone_index, pinecone_namespace, document_path, document_url,
    document_name, document_type) from get_session() and call
    deactivate_session() — the original schema/API provided none of these,
    so every restore path raised. This version adds the missing columns,
    exposes them in get_session(), implements deactivate_session(), and
    whitelists update_session() field names so SQL identifiers can never be
    injected through kwargs.
    """

    # Flat document columns consumed by SessionManager when restoring a session.
    _DOCUMENT_COLUMNS = (
        "document_name", "document_type", "chunks_count",
        "pinecone_index", "pinecone_namespace",
        "document_path", "document_url",
    )

    # Full set of columns update_session() is allowed to touch.
    _UPDATABLE = frozenset(_DOCUMENT_COLUMNS) | {
        "last_accessed", "document_uploaded", "vector_store_created",
        "document_info", "rag_service_data", "is_active",
    }

    def __init__(self):
        settings = get_settings()
        self.db_path = settings.database_path

        # Ensure the directory exists and is writable
        db_dir = Path(self.db_path).parent
        try:
            db_dir.mkdir(parents=True, exist_ok=True)
            # Test if directory is writable
            test_file = db_dir / "test_write.tmp"
            test_file.touch()
            test_file.unlink()
        except Exception:
            # Fallback to a guaranteed-writable temp location
            import tempfile
            fallback_dir = Path(tempfile.gettempdir()) / "claridoc"
            fallback_dir.mkdir(exist_ok=True)
            self.db_path = str(fallback_dir / "sessions.db")
            print(f"Database path not writable, using fallback: {self.db_path}")

        self.init_db()

    def init_db(self):
        """Initialize the database with required tables.

        NOTE: CREATE TABLE IF NOT EXISTS does not migrate pre-existing
        databases created with the old schema; delete the old file or add a
        migration if one exists.
        """
        try:
            with sqlite3.connect(self.db_path) as conn:
                conn.execute("""
                    CREATE TABLE IF NOT EXISTS sessions (
                        session_id TEXT PRIMARY KEY,
                        username TEXT NOT NULL,
                        created_at TEXT NOT NULL,
                        last_accessed TEXT NOT NULL,
                        document_uploaded BOOLEAN DEFAULT FALSE,
                        vector_store_created BOOLEAN DEFAULT FALSE,
                        document_info TEXT DEFAULT '{}',
                        rag_service_data TEXT DEFAULT '{}',
                        document_name TEXT,
                        document_type TEXT,
                        chunks_count INTEGER DEFAULT 0,
                        pinecone_index TEXT,
                        pinecone_namespace TEXT,
                        document_path TEXT,
                        document_url TEXT,
                        is_active BOOLEAN DEFAULT TRUE
                    )
                """)

                conn.execute("""
                    CREATE TABLE IF NOT EXISTS users (
                        username TEXT PRIMARY KEY,
                        created_at TEXT NOT NULL,
                        last_login TEXT NOT NULL
                    )
                """)
                conn.commit()
                print(f"Database initialized successfully at: {self.db_path}")
        except Exception as e:
            print(f"Failed to initialize database: {e}")
            raise

    def create_session(self, session_id: str, username: str):
        """Create a new session row, upserting the owning user record."""
        try:
            with sqlite3.connect(self.db_path) as conn:
                now = datetime.now().isoformat()

                # Insert or update user, preserving the original created_at
                conn.execute("""
                    INSERT OR REPLACE INTO users (username, created_at, last_login)
                    VALUES (?, COALESCE((SELECT created_at FROM users WHERE username = ?), ?), ?)
                """, (username, username, now, now))

                # Insert session
                conn.execute("""
                    INSERT INTO sessions (session_id, username, created_at, last_accessed)
                    VALUES (?, ?, ?, ?)
                """, (session_id, username, now, now))

                conn.commit()
        except Exception as e:
            print(f"Failed to create session: {e}")
            raise

    def get_user_sessions(self, username: str) -> List[Dict]:
        """Get all sessions for a user, most recently accessed first."""
        try:
            with sqlite3.connect(self.db_path) as conn:
                conn.row_factory = sqlite3.Row
                cursor = conn.execute("""
                    SELECT * FROM sessions
                    WHERE username = ?
                    ORDER BY last_accessed DESC
                """, (username,))

                sessions = []
                for row in cursor.fetchall():
                    document_info = json.loads(row['document_info'] or '{}')
                    sessions.append({
                        'session_id': row['session_id'],
                        'created_at': row['created_at'],
                        'last_accessed': row['last_accessed'],
                        'document_uploaded': bool(row['document_uploaded']),
                        'vector_store_created': bool(row['vector_store_created']),
                        'document_name': document_info.get('filename', 'Untitled Document'),
                        'document_type': document_info.get('type', 'Unknown'),
                        'chunks_count': document_info.get('chunks_count', 0)
                    })

                return sessions
        except Exception as e:
            print(f"Failed to get user sessions: {e}")
            return []

    def get_session(self, session_id: str) -> Optional[Dict]:
        """Get session data by ID, or None when missing or unreadable.

        The returned dict includes the flat document columns SessionManager
        needs for restore, plus the parsed JSON blobs.
        """
        try:
            with sqlite3.connect(self.db_path) as conn:
                conn.row_factory = sqlite3.Row
                cursor = conn.execute(
                    "SELECT * FROM sessions WHERE session_id = ?", (session_id,)
                )

                row = cursor.fetchone()
                if row is None:
                    return None

                data = {
                    'session_id': row['session_id'],
                    'username': row['username'],
                    'created_at': row['created_at'],
                    'last_accessed': row['last_accessed'],
                    'document_uploaded': bool(row['document_uploaded']),
                    'vector_store_created': bool(row['vector_store_created']),
                    'document_info': json.loads(row['document_info'] or '{}'),
                    'rag_service_data': json.loads(row['rag_service_data'] or '{}'),
                    'is_active': bool(row['is_active']),
                }
                for col in self._DOCUMENT_COLUMNS:
                    data[col] = row[col]
                # Callers compare chunks_count numerically; never return None
                if data['chunks_count'] is None:
                    data['chunks_count'] = 0
                return data
        except Exception as e:
            print(f"Failed to get session: {e}")
            return None

    def update_session(self, session_id: str, **kwargs):
        """Update session fields; unknown field names are rejected.

        Raises:
            ValueError: when kwargs contain a name outside _UPDATABLE (this
                also blocks SQL-identifier injection through field names).
        """
        try:
            # Always update last_accessed
            kwargs['last_accessed'] = datetime.now().isoformat()

            # Convert dictionaries to JSON strings
            if 'document_info' in kwargs:
                kwargs['document_info'] = json.dumps(kwargs['document_info'])
            if 'rag_service_data' in kwargs:
                kwargs['rag_service_data'] = json.dumps(kwargs['rag_service_data'])

            unknown = set(kwargs) - self._UPDATABLE
            if unknown:
                raise ValueError(f"Unknown session fields: {sorted(unknown)}")

            # Build dynamic query from the validated field names only
            set_clause = ", ".join(f"{key} = ?" for key in kwargs)
            values = list(kwargs.values()) + [session_id]

            with sqlite3.connect(self.db_path) as conn:
                conn.execute(f"""
                    UPDATE sessions
                    SET {set_clause}
                    WHERE session_id = ?
                """, values)
                conn.commit()
        except Exception as e:
            print(f"Failed to update session: {e}")
            raise

    def deactivate_session(self, session_id: str):
        """Soft-delete: mark the session inactive but keep its row."""
        try:
            with sqlite3.connect(self.db_path) as conn:
                conn.execute(
                    "UPDATE sessions SET is_active = 0 WHERE session_id = ?",
                    (session_id,),
                )
                conn.commit()
        except Exception as e:
            print(f"Failed to deactivate session: {e}")
            raise

    def delete_session(self, session_id: str):
        """Hard-delete a session row."""
        try:
            with sqlite3.connect(self.db_path) as conn:
                conn.execute("DELETE FROM sessions WHERE session_id = ?", (session_id,))
                conn.commit()
        except Exception as e:
            print(f"Failed to delete session: {e}")
            raise
app/database/sessions.db ADDED
Binary file (32.8 kB). View file
 
app/embedding/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # This file is automatically created to mark the directory as a package.
app/embedding/embeder.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from typing import Union, List
4
+
5
class QueryEmbedding:
    """Pairs a single query string with the model that will embed it."""

    def __init__(self, query: str, embedding_model):
        self.query = query
        self.embedding_model = embedding_model

    def get_embedding(self):
        """Return the embedding vector for the stored query."""
        return self.embedding_model.embed_query(self.query)
app/embedding/vectore_store.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from pinecone import Pinecone
4
+ from pinecone import ServerlessSpec
5
+ from langchain_pinecone import PineconeVectorStore
6
+ from datetime import datetime
7
+ from uuid import uuid4
8
+
9
class VectorStore:
    """Pushes pre-split document chunks into a Pinecone serverless index."""

    def __init__(self, text_chunks, embedding_model):
        self.text_chunks = text_chunks
        # Stamped once at construction; used to derive a unique namespace below
        self.current_time = datetime.now()
        self.embedding_model = embedding_model

    def create_vectorestore(self):
        """Create (or reuse) the Pinecone index and upsert all chunks.

        Returns:
            (index, namespace): the Pinecone index handle and the
            time-stamped namespace the chunks were written under.
        """
        load_dotenv()
        pinecone_key = os.getenv("PINECONE_API_KEY")
        pc = Pinecone(api_key=pinecone_key)

        # Single shared index; per-upload isolation comes from the namespace
        index_name = "rag-project"
        if not pc.has_index(index_name):
            # dimension 1536 — presumably matches OpenAI text-embedding-3-small
            # (see config.yaml); confirm if the embedding model changes
            pc.create_index(
                name=index_name,
                dimension=1536,
                metric="cosine",
                spec=ServerlessSpec(cloud="aws", region="us-east-1")
            )

        index = pc.Index(index_name)
        uuids = [str(uuid4()) for _ in range(len(self.text_chunks))]
        vector_store = PineconeVectorStore(index=index, embedding=self.embedding_model)
        time_string = self.current_time.strftime("%Y-%m-%d-%H-%M")
        name_space = f"hackrx-index{time_string}"
        vector_store.add_documents(documents=self.text_chunks, ids=uuids, namespace=name_space)

        return index, name_space
app/ingestion/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # This file is automatically created to mark the directory as a package.
app/ingestion/file_loader.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from langchain_community.document_loaders import PyMuPDFLoader, Docx2txtLoader
3
+ import os
4
+ import tempfile
5
+ from app.schemas.request_models import DocumentTypeSchema
6
+ from langchain_core.documents import Document
7
+ from typing import List
8
+ from langchain_core.prompts import ChatPromptTemplate
9
+ from langchain_core.output_parsers import PydanticOutputParser
10
+ from app.schemas.request_models import DocumentTypeSchema
11
+
12
class FileLoader:
    """Load PDF/Word documents (from disk or URL) and classify their domain."""

    def __init__(self, llm=None):
        # The LLM is only needed by detect_document_type; may stay None otherwise.
        self.llm = llm

    def detect_document_type(self, documents: List[Document]) -> DocumentTypeSchema:
        """Detect the genre of document by reading first 2 page content by llm."""
        document_content = " ".join([doc.page_content for doc in documents])
        parser = PydanticOutputParser(pydantic_object=DocumentTypeSchema)
        prompt = ChatPromptTemplate.from_messages([
            ("system", "You are a legal/HR/financial document classifier."),
            ("human", """
            You will be given the first 2 pages of a document.
            Classify it into one of the following categories:
            - HR/Employment
            - Insurance
            - Legal/Compliance
            - Financial/Regulatory
            - Healthcare

            Respond strictly in JSON that matches the schema.

            {format_instructions}

            Document content:
            {document_content}
            """),
        ])
        chain = prompt | self.llm | parser
        result: DocumentTypeSchema = chain.invoke({
            "document_content": document_content,
            "format_instructions": parser.get_format_instructions()
        })
        return result

    def load_documents_from_url(self, url: str) -> List[Document]:
        """Download a PDF from *url* into a temp file and load it.

        Raises:
            requests.HTTPError: if the download fails.
            ValueError: if the response is not a PDF.
        """
        response = requests.get(url)
        response.raise_for_status()
        content_type = response.headers.get('Content-Type', '')
        # FIX: servers often send parameters ("application/pdf; charset=..."),
        # so a strict equality check wrongly rejected valid PDFs.
        if 'application/pdf' in content_type:
            tmp_file_path = self._save_temp_file(response.content, ".pdf")
            return self.load_pdf(tmp_file_path)
        else:
            raise ValueError("File type not supported, expected a PDF.")

    def load_pdf(self, path: str) -> List[Document]:
        """Load PDF from a local path and return its content."""
        self._validate_file_exists(path)
        loader = PyMuPDFLoader(path)
        return loader.load()

    def load_word_document(self, path: str) -> List[Document]:
        """Load Word document from a local path and return its content.

        Best-effort: loader failures are logged and an empty list is returned.
        """
        self._validate_file_exists(path)
        try:
            docx_loader = Docx2txtLoader(path)
            return docx_loader.load()
        except Exception as e:
            # Deliberately swallow so one bad file does not crash the pipeline.
            print(e)
            return []

    def _save_temp_file(self, content: bytes, suffix: str) -> str:
        """Write *content* to a NamedTemporaryFile and return its path.

        The file is created with delete=False; the caller owns cleanup.
        """
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp_file:
            tmp_file.write(content)
            return tmp_file.name

    def _validate_file_exists(self, path: str):
        """Raise FileNotFoundError when *path* does not exist on disk."""
        if not os.path.exists(path):
            raise FileNotFoundError(f"The file {path} does not exist.")
app/ingestion/text_splitter.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_core.documents import Document
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from uuid import uuid4
4
+ from typing import List, Dict
5
+ import os
6
+ import json
7
+ from app.utils.metadata_utils import MetadataService
8
+ from app.metadata_extraction.metadata_ext import MetadataExtractor
9
+ from pydantic import BaseModel
10
+ from typing import Type
11
+ from app.utils.metadata_utils import MetadataService
12
class splitting_text:
    """Split a document's pages into chunks enriched with LLM-extracted metadata.

    The first page seeds a per-document "known keywords" JSON file under
    app/data/; later pages load that file so extraction stays consistent, and
    any newly minted keywords are merged back into it.
    """

    def __init__(self, documentTypeSchema: Type[BaseModel], llm=None):
        self.llm = llm
        self.metadata_extractor = MetadataExtractor(llm=self.llm)
        self.metadata_services = MetadataService()
        # Pydantic model describing the metadata fields for this document genre.
        self.documentTypeSchema = documentTypeSchema
        # Path of the per-document keyword JSON; set while processing page 0.
        self.Keywordsfile_path = None

    def _clean_text(self, text: str) -> str:
        """Clean extracted page content"""
        # remove excessive whitespace
        text = " ".join(text.split())
        return text

    def text_splitting(self, doc: List[Document]) -> List[Document]:
        """Split document into chunks for processing.

        Args:
            doc: One Document per page (e.g. from PyMuPDFLoader).

        Returns:
            All chunks across pages, each carrying page + extracted metadata.
        """
        all_chunks = []
        splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
        for i, page in enumerate(doc):
            # Langchain Documents only expose .page_content; raw PyMuPDF pages
            # expose .get_text(). Narrowed from a bare ``except:`` so unrelated
            # errors are no longer silently swallowed.
            try:
                text = page.get_text()
            except AttributeError:
                text = page.page_content

            if i == 0:
                # First page → extract metadata and create the keyword JSON.
                output_folder = "app/data/"
                filename = page.metadata['source'].replace(".", "").replace("\\", "") + ".json"
                output_path = os.path.join(output_folder, filename)
                self.Keywordsfile_path = output_path
                Document_metadata = self.metadata_extractor.extractMetadata(
                    document=page, known_keywords={}, metadata_class=self.documentTypeSchema
                )
                extracted = Document_metadata.model_dump()
                normalized = MetadataService.normalize_dict_to_lists(metadata=extracted)

                with open(output_path, "w") as f:
                    json.dump(normalized, f, indent=4)
                known_keywords = normalized

            else:
                # Next pages → load JSON and enforce consistency.
                with open(self.Keywordsfile_path, "r") as f:
                    known_keywords = json.load(f)

                Document_metadata = self.metadata_extractor.extractMetadata(
                    document=page, known_keywords=known_keywords, metadata_class=self.documentTypeSchema
                )

                # If the extractor minted a new keyword, normalise its output
                # and merge the new values into the stored keyword lists.
                if Document_metadata.added_new_keyword:
                    new_data = self.metadata_services.normalize_dict_to_lists(
                        Document_metadata.model_dump(exclude_none=True)
                    )
                    for key, vals in new_data.items():
                        if isinstance(vals, list):
                            # Union of existing + new values, de-duplicated.
                            known_keywords[key] = list(set(known_keywords.get(key, []) + vals))
                    with open(self.Keywordsfile_path, "w") as f:
                        json.dump(known_keywords, f, indent=4)

            extracted_metadata = Document_metadata.model_dump(exclude_none=True)
            print(f"doc number: {i}")

            # Skip blank pages entirely (no chunk emitted).
            if text.strip():
                uuid = str(uuid4())
                temp_doc = Document(
                    page_content=text,
                    metadata={
                        **page.metadata,
                        **extracted_metadata,
                        "page_no": i,
                        "doc_id": uuid,
                        "chunk_id": f"{uuid}_p{i}",
                        "type": "text"
                    }
                )
                chunks = splitter.split_documents([temp_doc])
                all_chunks.extend(chunks)

        return all_chunks
98
+
99
+
app/main.py ADDED
File without changes
app/metadata_extraction/__init__.py ADDED
File without changes
app/metadata_extraction/metadata_ext.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from langchain_core.exceptions import OutputParserException
3
+ from langchain_core.documents import Document
4
+ from langchain_core.output_parsers import PydanticOutputParser
5
+ from langchain_core.prompts import ChatPromptTemplate
6
+ from typing import Type
7
+ from pydantic import BaseModel
8
+ # wrap parser with fixer once
9
+ # pydantic_parser = PydanticOutputParser(pydantic_object=InsuranceMetadata)
10
+ # fixing_parser = OutputFixingParser.from_llm(llm=llm, parser=pydantic_parser)
11
+
12
+
13
class MetadataExtractor:
    """LLM-backed extraction of structured metadata from documents and queries.

    Both public methods share one pipeline (_invoke_extraction); they differ
    only in the system-prompt wording.
    """

    def __init__(self, llm=None):
        self.llm = llm

    def _invoke_extraction(self, metadata_class: Type[BaseModel], document: Document,
                           known_keywords: dict, prompt: ChatPromptTemplate) -> BaseModel:
        """Run prompt | llm | parser; fall back to an empty model on parse failure.

        Returns a populated ``metadata_class`` instance, or one with only
        ``added_new_keyword=False`` set if the LLM output could not be parsed.
        """
        parser = PydanticOutputParser(pydantic_object=metadata_class)
        chain = prompt | self.llm | parser
        try:
            return chain.invoke({
                "schema": json.dumps(metadata_class.model_json_schema(), indent=2),
                "keywords": json.dumps(known_keywords, indent=2),
                "document_content": document.page_content
            })
        except OutputParserException as e:
            print(f"⚠️ Parser failed on doc {document.metadata.get('source')} | error: {e}")
            return metadata_class(added_new_keyword=False)  # instantiate fallback

    def extractMetadata_query(self, metadata_class: Type[BaseModel], document: Document, known_keywords: dict) -> BaseModel:
        """Extract metadata from a *user query*, reusing the known keyword pool."""
        prompt = ChatPromptTemplate.from_messages([
            ("system", """You are an information extraction system.
            Extract only the required metadata from the user query using the existing known keywords.

            ⚠️ CRITICAL FORMATTING RULES:
            - ALL fields must be arrays/lists, even if there's only one value
            - For single values, wrap in brackets: "doc_id": ["single_value"]
            - For multiple values: "coverage_type": ["value1", "value2", "value3"]
            - For null/empty fields, use: null (not empty arrays)

            ⚠️ Content Rules:
            - For exclusions and obligations, DO NOT copy full sentences.
            - Instead, extract only concise normalized keywords (2–5 words max each).
            - Use existing keywords if they already exist in the provided list.
            - Prefer to reuse existing keywords if they are semantically the same.
            - If you find a new keyword that is a sub-type or more specific variant of an existing one, keep both:
              reuse the closest match from existing keywords, and also add the new one.
            - In that case, set added_new_keyword=true.
            - Do not include raw paragraphs in the output.

            Schema you must follow:
            {schema}

            Existing Keywords:
            {keywords}
            """),
            ("human", "Text:\n{document_content}")
        ])
        return self._invoke_extraction(metadata_class, document, known_keywords, prompt)

    def extractMetadata(self, metadata_class: Type[BaseModel], document: Document, known_keywords: dict) -> BaseModel:
        """Extract metadata from a *document page*, reusing the known keyword pool."""
        prompt = ChatPromptTemplate.from_messages([
            ("system", """You are an information extraction system.
            Extract only the required metadata from the text according to schema given below.

            ⚠️ CRITICAL FORMATTING RULES:
            - ALL fields must be arrays/lists, even if there's only one value
            - For single values, wrap in brackets: "doc_id": ["single_value"]
            - For multiple values: "coverage_type": ["value1", "value2", "value3"]
            - For null/empty fields, use: null (not empty arrays)

            ⚠️ Content Rules:
            - For exclusions and obligations, DO NOT copy full sentences.
            - Instead, extract only concise normalized keywords (2–5 words max each).
            - Use existing keywords if they already exist in the provided list.
            - Prefer to reuse existing keywords if they are semantically the same.
            - If you find a new keyword that is a **sub-type** or **more specific variant** of an existing one, keep both:
              *reuse the closest match from existing keywords*, and also add the new one.
            - In that case, set `added_new_keyword=true`.
            - Do not include raw paragraphs in the output.

            Schema you must follow:
            {schema}

            Existing Keywords:
            {keywords}
            """),
            ("human", "Text:\n{document_content}")
        ])
        return self._invoke_extraction(metadata_class, document, known_keywords, prompt)
app/prompts/__init__.py ADDED
File without changes
app/prompts/prompts.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
# System prompt for the query-parsing step: instructs the LLM to convert a
# free-form user question into a JSON spec (intent/entities/constraints/
# answer_type) matching QuerySpec.
# FIX: was an f-string with no placeholders — any literal brace added later
# would have raised at import time. The string value is unchanged.
PARSER_PROMPT = """You receive a user's question about an insurance/contract document. Produce a JSON with keys:
- intent (one of: coverage_check, definition, limit_query, waiting_period, exclusions, other)
- entities (map of entity_name -> canonical string)
- constraints (map: plan, time_window, eligible_person, numerical_constraints)
- answer_type (one of: yes_no, short_explain, detailed, clause_list)
Return ONLY the JSON.Make sure that nested fields like "entities" and "constraints" are JSON objects, not strings.
"""
app/reseasoning/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # This file is automatically created to mark the directory as a package.
app/reseasoning/descision_maker.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.schemas.request_models import QuerySpec, LogicResult
2
+
3
+
4
def evaluate_with_llm(raw_query: str, top_clauses: list, llm):
    """
    Use the LLM to analyze retrieved clauses and return a structured decision.

    Args:
        raw_query: The user's original question.
        top_clauses: Retrieved clause hits; each exposes .doc_id, .page, .text.
        llm: Chat model supporting ``with_structured_output``.

    Returns:
        LogicResult whose evidence entries are enriched with the full clause
        text when a matching retrieved clause is found.

    NOTE(review): LogicResult is imported from app.schemas.request_models,
    where its definition appears commented out — confirm the import resolves.
    """
    # Number the clauses so the model can cite them unambiguously.
    context_clauses = [
        f"{idx}) [source:{clause.doc_id} page:{clause.page}] {clause.text}"
        for idx, clause in enumerate(top_clauses, 1)
    ]
    # Join hoisted out of the f-string (was an inline ``chr(10).join`` hack).
    clause_block = "\n".join(context_clauses)
    print(clause_block)

    prompt = f"""
    You are an insurance policy analyst. Question: "{raw_query}"

    Provided clauses (numbered):
    {clause_block}

    Task:
    1) Decide: COVERED / NOT_COVERED / CONDITIONAL
    2) Summarize the exact clause(s) that justify your decision.
    3) List any conditions, waiting periods, sublimits, or exclusions relevant.
    4) Provide a concise final answer (1-2 sentences).

    Return JSON with these exact keys:
    {{
      "decision": "...",
      "evidence": [
        {{"doc_id": "...", "page": 0, "snippet": "...", "reason": "..."}}
      ],
      "confidence": 0.0,
      "rationale": "...",
      "answer": "..."
    }}
    """

    # Parse directly into LogicResult via structured output.
    structured_llm = llm.with_structured_output(LogicResult)
    result: LogicResult = structured_llm.invoke(prompt)

    # Attach the full retrieved text to each evidence item the LLM cited.
    enriched_evidence = []
    for ev in result.evidence:
        matched = next(
            (c for c in top_clauses
             if c.doc_id == ev.doc_id and str(c.page) == str(ev.page)),
            None,
        )
        if matched:
            ev.text = matched.text
        enriched_evidence.append(ev)

    result.evidence = enriched_evidence
    return result
app/reseasoning/query_parser.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.utils.model_loader import ModelLoader
2
+ from app.schemas.request_models import QuerySpec
3
+ from app.prompts.prompts import PARSER_PROMPT
4
+
5
def parsing_query(query: str, llm) -> QuerySpec:
    """Turn a raw user question into a structured QuerySpec using the LLM."""
    # Constrain the model's output to the QuerySpec schema.
    spec_model = llm.with_structured_output(QuerySpec)

    # Parsing instructions first, then the user's question.
    combined = f"{PARSER_PROMPT}\n{query}"

    # Invoke the model; the result is already parsed/validated as QuerySpec.
    parsed: QuerySpec = spec_model.invoke(combined)
    return parsed
app/retrieval/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # This file is automatically created to mark the directory as a package.
app/retrieval/reranker.py ADDED
File without changes
app/retrieval/retriever.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from app.schemas.request_models import ClauseHit
2
+
3
class Retriever:
    """Thin wrapper around a Pinecone index query for fetching matching chunks."""

    def __init__(self, pinecone_index, query=None, metadata=None, namespace=None):
        self.pinecone_index = pinecone_index  # Pinecone index object
        self.query = query                    # query embedding vector
        self.metadata = metadata              # optional metadata filter dict
        self.namespace = namespace            # optional Pinecone namespace

    def retrieval_from_pinecone_vectoreStore(self, top_k=3):
        """
        Retrieve the top matching chunks from Pinecone.

        Args:
            top_k: How many chunks to retrieve.

        Returns:
            The raw Pinecone query response (matches with metadata, no vectors).
        """
        query_kwargs = dict(
            vector=self.query,
            top_k=top_k,
            include_metadata=True,
            include_values=False,
            filter=self.metadata,
            namespace=self.namespace,
        )
        return self.pinecone_index.query(**query_kwargs)
55
+
app/schemas/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # This file is automatically created to mark the directory as a package.
app/schemas/metadata_schema.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, model_validator,field_validator, HttpUrl, Field
2
+ from typing import List, Dict, Any, Optional, Union, Literal
3
+
4
class CommonMetaData(BaseModel):
    """Metadata fields shared by every document domain.

    Per the extraction prompt's formatting rules, all fields are lists of
    strings and default to None when absent from the document.
    """

    # --- Common metadata (across all domains) ---
    doc_id: Optional[List[str]] = Field(None, description="Unique document identifier")
    doc_category: Optional[List[str]] = Field(None, description="General pool/category e.g. Insurance, HR, Legal")
    doc_type: Optional[List[str]] = Field(None, description="Specific type e.g. Policy doc, Contract, Handbook")
    jurisdiction: Optional[List[str]] = Field(
        default=None, description="Applicable jurisdictions/regions/countries"
    )
    effective_date: Optional[List[str]] = Field(None, description="Date from which the document is effective")
    expiry_date: Optional[List[str]] = Field(None, description="Date until which the document is valid")
    parties: Optional[List[str]] = Field(None, description="Involved parties (e.g., employer/employee, insurer/insured)")
    # obligations: Optional[List[str]] = Field(
    #     default=None,
    #     description="List of short, normalized obligation keywords (2–5 words each, no full sentences)"
    # )
    penalties: Optional[List[str]] = Field(None, description="Penalties/non-compliance consequences")
    notes: Optional[List[str]] = Field(None, description="Freeform additional metadata")
21
+
22
class InsuranceMetadata(CommonMetaData):
    """Insurance-specific metadata extracted per page.

    ``added_new_keyword`` is set by the extractor when it mints a keyword not
    present in the known-keywords pool (triggers a merge back into the JSON).
    """

    # --- Insurance ---
    policy_number: Optional[List[str]] = None
    coverage_type: Optional[List[str]] = Field(
        default=None,
        description="Type(s) of coverage. Short keywords (1–3 words each)."
    )
    premium_amount: Optional[List[str]] = None
    exclusions: Optional[List[str]] = Field(
        description="List of normalized keywords representing exclusions (short, 2-5 words each, not full paragraphs).", default=None
    )
    # Flag: did the LLM introduce a keyword outside the known pool?
    added_new_keyword: bool = False
35
+
36
class HRMetadata(CommonMetaData):
    """HR/Employment document metadata.

    NOTE(review): fields here are scalars, unlike the list-typed convention
    used in CommonMetaData/InsuranceMetadata — confirm intended.
    """

    # --- HR / Employment ---
    policy_type: Optional[str] = None
    applicable_roles: Optional[List[str]] = None
    notice_period: Optional[str] = None
41
+
42
class LegalMetadata(CommonMetaData):
    """Legal/Compliance document metadata (scalar fields)."""

    # --- Legal / Compliance ---
    clause_type: Optional[str] = None
    governing_law: Optional[str] = None
    duration: Optional[str] = None
48
+
49
class FinancialMetadata(CommonMetaData):
    """Financial/Regulatory document metadata (scalar fields)."""

    # --- Financial / Regulatory ---
    section: Optional[str] = None
    compliance_requirement: Optional[str] = None
    reporting_frequency: Optional[str] = None
55
+
56
class HealthcareMetadata(CommonMetaData):
    """Healthcare/Pharma document metadata (scalar fields)."""

    # --- Healthcare / Pharma ---
    disease: Optional[str] = None
    treatment_limit: Optional[str] = None
    validity_period: Optional[str] = None
62
+
63
class ProcurementMetadata(CommonMetaData):
    """Procurement/Vendor-management document metadata."""

    # --- Procurement / Vendor Management ---
    vendor_name: Optional[str] = None
    contract_value: Optional[str] = None
    payment_terms: Optional[str] = None
    sla_metrics: Optional[List[str]] = None
70
+
71
+
72
+
73
+
74
+
75
+
app/schemas/request_models.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, model_validator,field_validator, HttpUrl, Field
2
+ from typing import List, Dict, Any, Optional, Union, Literal
3
+ import json
4
class QueryRequest(BaseModel):
    """Request body for the query endpoint: a single free-form user question."""

    query: str  # raw natural-language question
6
+
7
+
8
+
9
+
10
+
11
+ # class QuerySpec(BaseModel):
12
+ # raw_query: str # query of the user
13
+ # intent: str # High-level purpose, e.g., "coverage_check" — helps routing aur rules.
14
+ # entities: Dict[str, Union[str, List[str]]] = Field(default_factory= dict) # Extracted items (policy number, dates, amounts) — structured
15
+ # constraints : Dict[str, Any] = Field(default_factory=dict) # filters like {"jurisdiction":"IN","incident_date":"2024-01-01"}
16
+ # answer_type: Optional[str] = "detailed"
17
+ # followups: Optional[List[str]] = Field(default_factory=list) # followups for user
18
+
19
+ # @model_validator(mode = "before")
20
+ # @classmethod
21
+ # def parse_nested_json(cls, values): # parsing nested json to load
22
+ # for field in ['entities', 'constraints']:
23
+ # val = values.get(field)
24
+ # if isinstance(val, str):
25
+ # try:
26
+ # values[field] = json.loads(val)
27
+ # except json.JSONDecodeError:
28
+ # pass
29
+ # return values
30
+
31
+ # class ClauseHit(BaseModel):
32
+ # doc_id : str # id of the document
33
+ # page: int # pdf page id
34
+ # chunk_id: str
35
+ # text: str # Evidence text used for answer.
36
+ # metadata: Dict[str, Any] = Field(default_factory=dict) # metadata
37
+ # score: float # Retrieval similarity score
38
+ # boost: Optional[float] = None
39
+ # combined_score: Optional[float] = None
40
+
41
+ # @field_validator("metadata", mode="before")
42
+ # def parse_metadata(cls, v):
43
+ # if isinstance(v, str):
44
+ # try:
45
+ # return json.loads(v) if v.strip() else {}
46
+ # except json.JSONDecodeError:
47
+ # return {}
48
+ # return v
49
+
50
+ # class LogicResult(BaseModel):
51
+ # answer: str
52
+ # decision: str # "covered"/"not_covered"/"conditional"
53
+ # confidence: float # 0..1 score for calibration/thresholding.
54
+ # evidence: List[ClauseHit] = Field(default_factory=list) # List of ClauseHit used to justify the answer.
55
+ # rationale: Optional[str] = None # Short human-readable reason (audit-friendly).
56
+
57
+ # class HackRxRunRequest(BaseModel):
58
+ # documents: HttpUrl = Field(
59
+ # ...,
60
+ # description="URL to the document (PDF, DOCX, or email blob)"
61
+ # )
62
+ # questions: List[str] = Field(
63
+ # ...,
64
+ # description="List of questions to query against the document"
65
+ # )
66
+
67
class DocumentTypeSchema(BaseModel):
    """Classification result: the single domain category of a document.

    NOTE(review): this category list differs from the one offered in
    FileLoader.detect_document_type's prompt (which includes "Healthcare" but
    not "Government/Public Policy" or "Technical/IT Policies") — confirm the
    two lists are meant to diverge, otherwise classification can fail validation.
    """

    document_types: Literal[
        "HR/Employment",
        "Insurance",
        "Legal/Compliance",
        "Financial/Regulatory",
        "Government/Public Policy",
        "Technical/IT Policies"
    ] = Field(..., description="The category/type of the document")
76
+
77
+
app/schemas/response_models.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel
2
+ from typing import List, Dict, Any, Optional
3
+
4
class SourceDocument(BaseModel):
    """One retrieved source chunk returned alongside an answer."""

    doc_id: str               # id of the source document
    page: int                 # page number the chunk came from
    text: str                 # chunk text used as evidence
    score: float              # retrieval similarity score
    metadata: Dict[str, Any]  # chunk metadata as stored in the vector store
10
+
11
class QueryResponse(BaseModel):
    """Response for a query request: answer plus optional provenance."""

    session_id: str
    query: str      # echo of the user's question
    answer: str     # LLM-generated answer text
    query_metadata: Optional[Dict[str, Any]] = None  # metadata extracted from the query
    sources: Optional[List[SourceDocument]] = None   # supporting chunks, if returned
    message: str    # human-readable status message
18
+
19
class SessionResponse(BaseModel):
    """Response for session create/close operations."""

    session_id: str
    message: str  # human-readable status message
22
+
23
class UploadResponse(BaseModel):
    """Response after a document upload + ingestion."""

    session_id: str
    filename: str
    document_type: str  # detected category (see DocumentTypeSchema)
    chunks_created: int # number of chunks produced by the splitter
    message: str        # human-readable status message
29
+
30
class ErrorResponse(BaseModel):
    """Standard error payload."""

    detail: str
    error_code: Optional[str] = None  # machine-readable code, when available
app/services/RAG_service.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ from app.utils.model_loader import ModelLoader
3
+ from app.ingestion.file_loader import FileLoader
4
+ from app.ingestion.text_splitter import splitting_text
5
+ from app.retrieval.retriever import Retriever
6
+ from app.embedding.embeder import QueryEmbedding
7
+ from app.embedding.vectore_store import VectorStore
8
+ from app.metadata_extraction.metadata_ext import MetadataExtractor
9
+ from app.utils.metadata_utils import MetadataService
10
+ # from app.utils.document_op import DocumentOperation
11
+ from langchain_core.documents import Document
12
+ import json
13
+ from typing import List, Optional
14
+ # ...existing imports...
15
+
16
# Global model instances (loaded once)
_embedding_model = None

def get_models():
    """Return the process-wide embedding model, loading it on first call."""
    global _embedding_model
    if _embedding_model is None:
        print("Loading models (one-time initialization)...")
        # ModelLoader exposes the embedding model through load_llm().
        loader = ModelLoader(model_provider="huggingface")
        _embedding_model = loader.load_llm()
    return _embedding_model
26
+
27
class RAGService:
    """End-to-end RAG pipeline: load and split a document, build a Pinecone
    vector store, embed queries with metadata filters, retrieve, and answer
    via the LLM.

    NOTE(review): instances hold per-document state (chunks, index,
    namespace), so one RAGService presumably serves one document/session at a
    time — confirm against the session manager.
    """

    def __init__(self):
        print("[RAGService] Initializing service...")
        self._init_models()
        # NOTE(review): ``Docuement_Type`` (typo) is never read elsewhere in
        # this class; only ``Document_Type`` below is used. Candidate removal.
        self.Docuement_Type = None
        self.Pinecone_index = None
        self.Document_path = None
        self.Document_Type = None       # pydantic metadata model for the detected genre
        self.DocumentTypeScheme = None  # DocumentTypeSchema classification result
        self.url = None
        self.chunks = None              # chunks produced by the splitter
        self.vector_store = None
        self.index = None               # Pinecone index object
        self.namespace = None           # Pinecone namespace holding this doc's vectors
        self.retriever = None
        self.metadataservice = MetadataService()
        print("[RAGService] Initialization complete.")

    def _init_models(self):
        """Initialize LLM and embedding Models"""
        print("[RAGService] Loading LLM model (gemini)...")
        self.model_loader = ModelLoader(model_provider="gemini")
        self.llm = self.model_loader.load_llm()
        print("[RAGService] LLM model loaded.")
        print("[RAGService] Loading embedding model (huggingface)...")
        # Shared, lazily-loaded global embedding model (see get_models()).
        self.embedding_model = get_models()
        print("[RAGService] Embedding model loaded.")

    def load_and_split_document(self, type: str, path: str = None, url: str = None):
        """Load and chunk document from local path or URL.

        Args:
            type: "pdf" or "word". NOTE(review): shadows the ``type`` builtin;
                renaming would break keyword callers, so left as-is.
            path: local file path (preferred when both are given).
            url: remote URL (PDF only).

        Raises:
            ValueError: on unsupported type or missing/unsupported source.

        Side effects: sets DocumentTypeScheme, Document_Type, splitter, chunks.
        """
        print(f"[RAGService] Loading document. Type: {type}, Path: {path}, URL: {url}")
        file_loader = FileLoader(llm = self.llm)
        if type == "pdf":
            if path:
                print(f"[RAGService] Loading PDF from path: {path}")
                doc = file_loader.load_pdf(path)
            elif url:
                print(f"[RAGService] Loading PDF from URL: {url}")
                doc = file_loader.load_documents_from_url(url)
            else:
                print("[RAGService] Error: Either path or url must be provided for PDF.")
                raise ValueError("Either path or url must be provided for PDF.")
        elif type == "word":
            if path:
                print(f"[RAGService] Loading Word document from path: {path}")
                doc = file_loader.load_word_document(path)
            elif url:
                print("[RAGService] Error: URL loading not supported for Word documents.")
                raise ValueError("URL loading not supported for Word documents.")
            else:
                print("[RAGService] Error: Path must be provided for Word document.")
                raise ValueError("Path must be provided for Word document.")
        else:
            print("[RAGService] Error: Unsupported document type.")
            raise ValueError("Unsupported document type. Use 'pdf' or 'word'.")

        # Classify the document genre from its first two pages, then pick the
        # matching pydantic metadata model for extraction.
        print("[RAGService] Detecting document type scheme...")
        self.DocumentTypeScheme = file_loader.detect_document_type(doc[0:2])
        print(f"[RAGService] Document type scheme detected: {self.DocumentTypeScheme}")
        self.Document_Type = self.metadataservice.Return_document_model(self.DocumentTypeScheme)
        print(f"[RAGService] Document type model: {self.Document_Type}")
        self.splitter = splitting_text(documentTypeSchema=self.Document_Type, llm=self.llm)
        print("[RAGService] Splitting document into chunks...")
        self.chunks = self.splitter.text_splitting(doc)
        print(f"[RAGService] Total chunks created: {len(self.chunks)}")

    def create_query_embedding(self, query: str):
        """Embed the query and extract a Pinecone-compatible metadata filter.

        Requires load_and_split_document() to have run first (uses the
        splitter's keyword file and the detected Document_Type).
        """
        print("[RAGService] Creating query embedding...")
        self.query_embedder = QueryEmbedding(query=query, embedding_model=self.embedding_model)
        self.query_embedding = self.query_embedder.get_embedding()
        print(f"[RAGService] Query embedding created: {self.query_embedding}")
        langchain_doc = Document(page_content=query)
        print("[RAGService] Extracting metadata for the query...")
        self.metadataExtractor = MetadataExtractor(llm=self.llm)
        # Reuse the keywords accumulated during ingestion so query metadata
        # matches the vocabulary stored with the chunks.
        with open(self.splitter.Keywordsfile_path, "r") as f:
            known_keywords = json.load(f)
        raw_metadata = self.metadataExtractor.extractMetadata_query(self.Document_Type,langchain_doc, known_keywords = known_keywords)
        print(f"[RAGService] Query metadata extracted: {raw_metadata}")
        # Convert to dictionary and format for Pinecone
        metadata_dict = raw_metadata.model_dump(exclude_none=True)
        formatted_metadata = self.metadataservice.format_metadata_for_pinecone(metadata_dict)

        # Remove problematic fields that cause serialization issues
        self.query_metadata = {
            k: v for k, v in formatted_metadata.items()
            if k not in ["obligations", "exclusions", "notes", "added_new_keyword"]
        }

        print(f"[RAGService] Query metadata type: {type(self.query_metadata)}")
        print(f"[RAGService] Query metadata: {self.query_metadata}")

    def create_vector_store(self):
        """Index the chunks into Pinecone; stores the index and namespace."""
        print("[RAGService] Creating vector store...")
        self.vector_store = VectorStore(self.chunks, self.embedding_model)
        self.index, self.namespace = self.vector_store.create_vectorestore()
        print(f"[RAGService] Vector store created. Index: {self.index}, Namespace: {self.namespace}")

    def retrive_documents(self):
        """Query Pinecone with the embedding + metadata filter; stores matches."""
        print("[RAGService] Retrieving documents from vector store...")
        self.retriever = Retriever(self.index,self.query_embedding,self.query_metadata, self.namespace)
        self.result = self.retriever.retrieval_from_pinecone_vectoreStore()
        print(f"[RAGService] Retrieval result: {self.result}")

    def answer_query(self, raw_query: str) -> str:
        """Answer user query using retrieved documents and LLM.

        Requires retrive_documents() to have populated self.result.
        Returns the LLM's answer as plain text.
        """
        print(f"[RAGService] Answering query: {raw_query}")
        top_clause = self.result['matches']
        top_clause_dicts = [r.to_dict() for r in top_clause]
        self.top_clauses = top_clause_dicts
        # Strip noisy/irrelevant metadata before handing clauses to the LLM.
        keys_to_remove = {"file_path", "source", "producer", "keywords", "subject", "added_new_keyword", "author", "chunk_id"}
        for r in top_clause_dicts:
            meta = r.get("metadata", {})
            for k in keys_to_remove:
                meta.pop(k, None)

        # Compact JSON keeps the prompt small.
        context_clauses = json.dumps(top_clause_dicts, separators=(",", ":"))

        print(f"context_clauses: {context_clauses}")

        prompt = f"""
        You are a legal/insurance domain expert and policy analyst.
        Use the following extracted clauses from policy documents to answer the question.
        If you can't find the answer, say "I don't know".
        Context clauses:
        {"".join(context_clauses)}
        Question: {raw_query}
        """
        print("[RAGService] Invoking LLM with prompt...")
        response = self.llm.invoke(prompt)
        print(f"[RAGService] LLM response: {response}")

        # Extract string content from response object
        # (chat models return a message object; raw strings pass through).
        if hasattr(response, 'content'):
            return response.content
        elif isinstance(response, str):
            return response
        else:
            return str(response)
app/services/__init__.py ADDED
File without changes
app/utils/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # This file is automatically created to mark the directory as a package.
app/utils/config_loader.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import os
3
+
4
def load_config(config_path: str = "app/config/config.yaml") -> dict:
    """Load and parse the application's YAML configuration file.

    Args:
        config_path: Path to the YAML config (defaults to the app config).

    Returns:
        The parsed configuration as a dict (``None`` if the file is empty,
        per ``yaml.safe_load`` semantics).
    """
    # Explicit encoding: without it, open() uses the platform default
    # (e.g. cp1252 on Windows) and non-ASCII config values would break.
    with open(config_path, "r", encoding="utf-8") as file:
        return yaml.safe_load(file)
app/utils/document_op.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
class DocumentOperation:
    """Utility operations on document files."""

    # Known extensions mapped to coarse document categories.
    _EXTENSION_TYPES = {
        ".txt": "text",
        ".pdf": "pdf",
        ".doc": "word",
        ".docx": "word",
    }

    @staticmethod
    def get_file_type_by_extension(filename):
        """Classify *filename* by extension.

        Returns one of "text", "pdf", "word", or "unknown" for anything else
        (including files with no extension). Matching is case-insensitive.
        """
        _, extension = os.path.splitext(filename)
        return DocumentOperation._EXTENSION_TYPES.get(extension.lower(), "unknown")
app/utils/embedding_manager.py ADDED
File without changes
app/utils/logger.py ADDED
File without changes
app/utils/metadata_utils.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.schemas.metadata_schema import InsuranceMetadata, CommonMetaData
2
+ from app.schemas.request_models import DocumentTypeSchema
3
class MetadataService:
    """Maps document types to metadata schemas and normalizes metadata
    dicts into Pinecone-compatible filter shapes."""

    def __init__(self):
        # Insurance has a dedicated schema; every other supported domain
        # falls back to the shared CommonMetaData model.
        self.metadata_models = {
            "Insurance": InsuranceMetadata,
            "HR/Employment": CommonMetaData,
            "Legal/Compliance": CommonMetaData,
            "Financial/Regulatory": CommonMetaData,
            "Government/Public Policy": CommonMetaData,
            "Technical/IT Policies": CommonMetaData,
        }

    @staticmethod
    def format_metadata_for_pinecone(metadata: dict) -> dict:
        """
        Convert list fields in metadata to Pinecone's valid filter format using $in.

        Non-list values pass through unchanged; keys whose value is an
        empty list are omitted entirely (no valid Pinecone filter for them).
        """
        formatted = {}
        for field, raw in metadata.items():
            if not isinstance(raw, list):
                formatted[field] = raw
            elif raw:
                formatted[field] = {"$in": raw}
        return formatted

    def Return_document_model(self, doc_type_schema: DocumentTypeSchema):
        """
        Returns appropriate metadata model based on document type.

        Args:
            doc_type_schema: DocumentTypeSchema object containing document type

        Returns:
            Appropriate Pydantic model class for the document type
            (CommonMetaData when the type is unrecognized).
        """
        return self.metadata_models.get(doc_type_schema.document_types, CommonMetaData)

    @staticmethod
    def normalize_dict_to_lists(metadata: dict) -> dict:
        """Convert dict values to lists if they aren't already.

        ``None`` becomes an empty list; existing lists pass through.
        """
        return {
            key: [] if value is None
            else (value if isinstance(value, list) else [value])
            for key, value in metadata.items()
        }
app/utils/model_loader.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from config_loader import
2
+ import os
3
+ from dotenv import load_dotenv
4
+ from pydantic import BaseModel, Field
5
+ from typing import Literal, Optional,Any
6
+ from app.utils.config_loader import load_config
7
+ from langchain_groq import ChatGroq
8
+ from langchain_google_genai import ChatGoogleGenerativeAI
9
+ from dotenv import load_dotenv
10
+ from langchain_huggingface import HuggingFaceEmbeddings
11
+ # from langchain_openai import OpenAIEmbeddings
12
+ from langchain_community.embeddings import OpenAIEmbeddings
13
class ConfigLoader:
    """Thin wrapper over the parsed YAML config with dict-style access."""

    def __init__(self):
        print(f"Loading config....")
        self.config = load_config()

    def __getitem__(self, key):
        """Allow dictionary-like access, e.g. ``loader["llm"]["gemini"]``."""
        return self.config[key]
21
+
22
class ModelLoader(BaseModel):
    """Factory that loads a chat/embedding model for the configured provider.

    The provider is chosen via ``model_provider``; model names come from the
    YAML config loaded into ``self.config``.
    """

    model_provider: Literal["groq", "gemini", "openai", "gemini_lite", "huggingface"] = "gemini"
    # Populated in model_post_init; excluded from pydantic serialization.
    config: Optional[ConfigLoader] = Field(default=None, exclude=True)

    def model_post_init(self, __context: Any) -> None:
        # Pydantic v2 hook, runs after model creation: guarantees the config
        # is loaded for every ModelLoader instance.
        self.config = ConfigLoader()

    class Config:
        # Allows ConfigLoader (a non-pydantic class) as a field type.
        arbitrary_types_allowed = True

    def load_llm(self):
        """
        Load and return the LLM (or embedding) model for ``model_provider``.

        Raises:
            ValueError: if ``model_provider`` is not supported.
        """
        print("LLM loading...")
        print("Loading model from provider: ")
        # Load .env once for every provider so API keys are in the environment.
        load_dotenv()

        if self.model_provider == "groq":
            print("Loading model from GROQ:")
            groq_api_key = os.getenv("GROQ_API_KEY")
            model_name = self.config["llm"]["groq"]["model_name"]
            llm = ChatGroq(model=model_name, api_key=groq_api_key)
        elif self.model_provider in ("gemini", "gemini_lite"):
            # The two Gemini variants differ only in the configured model name,
            # so the duplicated branches are collapsed into one.
            if self.model_provider == "gemini":
                print("Loading model from gemini:")
            else:
                print("Loading model from gemini-flash-lite:")
            gemini_api_key = os.getenv("GEMINI_API_KEY")
            model_name = self.config["llm"][self.model_provider]["model_name"]
            llm = ChatGoogleGenerativeAI(
                model=model_name,
                google_api_key=gemini_api_key,
            )
        elif self.model_provider == "openai":
            print("Loading model from openai:")
            api_key = os.getenv("OPENAI_API_KEY")
            # NOTE(review): this branch returns an *embedding* model (config
            # key "embedding_model"), mirroring the original behavior despite
            # the method name — confirm intent.
            model_name = self.config["embedding_model"]["openai"]["model_name"]
            llm = OpenAIEmbeddings(model=model_name, api_key=api_key)
        elif self.model_provider == "huggingface":
            print("Loading model from huggingface:")
            api_key = os.getenv("HF_TOKEN")
            # Security: never print the token itself. Also guard against a
            # missing token — os.environ assignment raises TypeError on None.
            if api_key:
                os.environ["HF_TOKEN"] = api_key
            model_name = self.config["embedding_model"]["huggingface"]["model_name"]
            llm = HuggingFaceEmbeddings(model=model_name)
        else:
            raise ValueError(f"Unsupported model provider: {self.model_provider}")
        return llm
+
81
+
82
+
83
+
experiments.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
main.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI, HTTPException, Depends, UploadFile, File, Form
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
import uvicorn
import os
import tempfile
from typing import Optional
import uuid
from datetime import datetime

from app.api.v1.routes import router as api_router
from app.core.session_manager import session_manager
from app.config.config import get_settings

# Initialize FastAPI app
app = FastAPI(
    title="ClariDoc API",
    description="Professional Document Analysis & RAG Platform API",
    version="1.0.0"
)

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Configure this properly for production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.on_event("startup")
async def startup_event():
    """Initialize database and other startup tasks."""
    try:
        # Test database connection
        session_manager.db.init_db()
        print("Database connection verified successfully")
    except Exception as e:
        # Deliberately non-fatal: the API still starts so /health can report
        # the database as unhealthy instead of the container crash-looping.
        print(f"Warning: Database initialization failed: {e}")

# Include API routes
app.include_router(api_router, prefix="/api/v1")

@app.get("/")
async def root():
    """Service banner endpoint."""
    return {
        "message": "ClariDoc API",
        "status": "running",
        "description": "Professional Document Analysis & RAG Platform"
    }

@app.get("/health")
async def health_check():
    """Health check endpoint for Docker and monitoring.

    Reports overall service status plus a separate database status so
    monitoring can distinguish app-up/db-down from total failure.
    """
    try:
        # Test database connection
        session_manager.db.init_db()
        db_status = "healthy"
    except Exception:
        db_status = "unhealthy"

    return {
        "status": "healthy",
        "service": "ClariDoc FastAPI Backend",
        "database": db_status,
        "timestamp": datetime.now()
    }

if __name__ == "__main__":
    # BUGFIX: the app object lives in this root-level main.py; the previous
    # target "app.main:app" pointed at app/main.py, which is empty, so the
    # server could never start.
    uvicorn.run(
        "main:app",
        host="0.0.0.0",
        port=int(os.getenv("PORT", 8000)),
        reload=False,  # Disable reload in production
        log_level="info"
    )
pyproject.toml ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "rag-app"
3
+ version = "0.1.0"
4
+ description = "Add your description here"
5
+ readme = "README.md"
6
+ requires-python = "==3.12.4"
7
+ dependencies = [
8
+ "acres==0.5.0",
9
+ "aiohappyeyeballs==2.6.1",
10
+ "aiohttp==3.12.15",
11
+ "aiohttp-retry==2.9.1",
12
+ "aiosignal==1.4.0",
13
+ "annotated-types==0.7.0",
14
+ "anyio==4.10.0",
15
+ "asttokens==3.0.0",
16
+ "attrs==25.3.0",
17
+ "beautifulsoup4==4.13.4",
18
+ "cachetools==5.5.2",
19
+ "certifi==2025.8.3",
20
+ "cffi==1.17.1",
21
+ "charset-normalizer==3.4.3",
22
+ "ci-info==0.3.0",
23
+ "click==8.2.1",
24
+ "colorama==0.4.6",
25
+ "colorclass==2.2.2",
26
+ "comm==0.2.3",
27
+ "compressed-rtf==1.0.7",
28
+ "configobj==5.0.9",
29
+ "configparser==7.2.0",
30
+ "cryptography==45.0.6",
31
+ "dataclasses-json==0.6.7",
32
+ "debugpy==1.8.16",
33
+ "decorator==5.2.1",
34
+ "distro==1.9.0",
35
+ "docx==0.2.4",
36
+ "docx2txt>=0.9",
37
+ "easygui==0.98.3",
38
+ "ebcdic==1.1.1",
39
+ "etelemetry==0.3.1",
40
+ "executing==2.2.0",
41
+ "extract-msg==0.55.0",
42
+ "fastapi==0.116.1",
43
+ "filelock==3.18.0",
44
+ "filetype==1.2.0",
45
+ "frozenlist==1.7.0",
46
+ "google-ai-generativelanguage==0.6.18",
47
+ "google-api-core==2.25.1",
48
+ "google-auth==2.40.3",
49
+ "googleapis-common-protos==1.70.0",
50
+ "greenlet==3.2.4",
51
+ "groq==0.31.0",
52
+ "grpcio==1.74.0",
53
+ "grpcio-status==1.74.0",
54
+ "h11==0.16.0",
55
+ "hf-xet>=1.1.8",
56
+ "httpcore==1.0.9",
57
+ "httplib2==0.22.0",
58
+ "httpx==0.28.1",
59
+ "httpx-sse==0.4.1",
60
+ "huggingface-hub[cli]>=0.34.4",
61
+ "idna==3.10",
62
+ "iniconfig==2.1.0",
63
+ "ipykernel==6.30.1",
64
+ "ipython==9.4.0",
65
+ "ipython-pygments-lexers==1.1.1",
66
+ "jedi==0.19.2",
67
+ "jiter==0.10.0",
68
+ "joblib==1.5.1",
69
+ "jsonpatch==1.33",
70
+ "jsonpointer==3.0.0",
71
+ "jupyter-client==8.6.3",
72
+ "jupyter-core==5.8.1",
73
+ "langchain==0.3.27",
74
+ "langchain-community==0.3.27",
75
+ "langchain-core==0.3.74",
76
+ "langchain-google-genai==2.1.9",
77
+ "langchain-groq==0.3.7",
78
+ "langchain-huggingface>=0.3.1",
79
+ "langchain-openai==0.3.29",
80
+ "langchain-pinecone==0.2.11",
81
+ "langchain-tests==0.3.20",
82
+ "langchain-text-splitters==0.3.9",
83
+ "langextract>=1.0.8",
84
+ "langsmith==0.4.13",
85
+ "lark==1.1.9",
86
+ "looseversion==1.3.0",
87
+ "lxml==6.0.0",
88
+ "markdown-it-py==4.0.0",
89
+ "marshmallow==3.26.1",
90
+ "matplotlib-inline==0.1.7",
91
+ "mdurl==0.1.2",
92
+ "msoffcrypto-tool==5.4.2",
93
+ "multidict==6.6.4",
94
+ "mypy-extensions==1.1.0",
95
+ "nest-asyncio==1.6.0",
96
+ "networkx==3.5",
97
+ "nibabel==5.3.2",
98
+ "nipype==1.10.0",
99
+ "numpy==2.3.2",
100
+ "olefile==0.47",
101
+ "oletools==0.60.2",
102
+ "openai==1.99.7",
103
+ "orjson==3.11.1",
104
+ "packaging==24.2",
105
+ "pandas==2.3.1",
106
+ "parso==0.8.4",
107
+ "pathlib==1.0.1",
108
+ "pcodedmp==1.2.6",
109
+ "pillow==11.3.0",
110
+ "pinecone==7.3.0",
111
+ "pinecone-plugin-assistant==1.7.0",
112
+ "pinecone-plugin-interface==0.0.7",
113
+ "platformdirs==4.3.8",
114
+ "pluggy==1.6.0",
115
+ "prompt-toolkit==3.0.51",
116
+ "propcache==0.3.2",
117
+ "proto-plus==1.26.1",
118
+ "protobuf==6.31.1",
119
+ "prov==2.1.1",
120
+ "psutil==7.0.0",
121
+ "pure-eval==0.2.3",
122
+ "puremagic==1.30",
123
+ "py-cpuinfo==9.0.0",
124
+ "pyasn1==0.6.1",
125
+ "pyasn1-modules==0.4.2",
126
+ "pycparser==2.22",
127
+ "pydantic==2.11.7",
128
+ "pydantic-core==2.33.2",
129
+ "pydantic-settings==2.10.1",
130
+ "pydot==4.0.1",
131
+ "pygments==2.19.2",
132
+ "pymupdf==1.26.3",
133
+ "pyparsing==3.2.3",
134
+ "pypdf>=6.0.0",
135
+ "pytest==8.4.1",
136
+ "pytest-asyncio==0.26.0",
137
+ "pytest-benchmark==5.1.0",
138
+ "pytest-codspeed==4.0.0",
139
+ "pytest-recording==0.13.4",
140
+ "pytest-socket==0.7.0",
141
+ "python-dateutil==2.9.0.post0",
142
+ "python-docx>=1.2.0",
143
+ "python-dotenv==1.1.1",
144
+ "python-magic>=0.4.27",
145
+ "python-magic-bin>=0.4.14",
146
+ "python-multipart>=0.0.20",
147
+ "pytz==2025.2",
148
+ "pyxnat==1.6.3",
149
+ "pyyaml==6.0.2",
150
+ "pyzmq==27.0.1",
151
+ "rdflib==7.1.4",
152
+ "red-black-tree-mod==1.22",
153
+ "regex==2025.7.34",
154
+ "requests==2.32.4",
155
+ "requests-toolbelt==1.0.0",
156
+ "rich==14.1.0",
157
+ "rsa==4.9.1",
158
+ "rtfde==0.1.2.1",
159
+ "scikit-learn==1.7.1",
160
+ "scipy==1.16.1",
161
+ "sentence-transformers>=5.1.0",
162
+ "simplejson==3.20.1",
163
+ "six==1.17.0",
164
+ "sniffio==1.3.1",
165
+ "soupsieve==2.7",
166
+ "sqlalchemy==2.0.42",
167
+ "stack-data==0.6.3",
168
+ "starlette==0.47.2",
169
+ "streamlit>=1.49.1",
170
+ "syrupy==4.9.1",
171
+ "tenacity==9.1.2",
172
+ "threadpoolctl==3.6.0",
173
+ "tiktoken==0.11.0",
174
+ "tornado==6.5.2",
175
+ "tqdm==4.67.1",
176
+ "traitlets==5.14.3",
177
+ "traits==7.0.2",
178
+ "typing-extensions==4.14.1",
179
+ "typing-inspect==0.9.0",
180
+ "typing-inspection==0.4.1",
181
+ "tzdata==2025.2",
182
+ "tzlocal==5.3.1",
183
+ "urllib3<2",
184
+ "uvicorn>=0.35.0",
185
+ "vcrpy==7.0.0",
186
+ "wcwidth==0.2.13",
187
+ "win-unicode-console==0.5",
188
+ "wrapt==1.17.2",
189
+ "yarl==1.20.1",
190
+ "zstandard==0.23.0",
191
+ ]