Kshitijk20 commited on
Commit
e5b884f
·
1 Parent(s): f60655e

code push

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +17 -0
  2. .python-version +1 -0
  3. Best.md +2 -0
  4. Dockerfile +30 -0
  5. app/__init__.py +1 -0
  6. app/api/__init__.py +0 -0
  7. app/api/deps.py +0 -0
  8. app/api/v1/__init__.py +0 -0
  9. app/api/v1/routes.py +255 -0
  10. app/config/__init__.py +1 -0
  11. app/config/config.py +37 -0
  12. app/config/config.yaml +20 -0
  13. app/core/__init__.py +0 -0
  14. app/core/session_manager.py +198 -0
  15. app/data/__init__.py +0 -0
  16. app/database/database.py +177 -0
  17. app/database/sessions.db +0 -0
  18. app/embedding/__init__.py +1 -0
  19. app/embedding/embeder.py +12 -0
  20. app/embedding/vectore_store.py +40 -0
  21. app/ingestion/__init__.py +1 -0
  22. app/ingestion/file_loader.py +80 -0
  23. app/ingestion/text_splitter.py +99 -0
  24. app/main.py +0 -0
  25. app/metadata_extraction/__init__.py +0 -0
  26. app/metadata_extraction/metadata_ext.py +114 -0
  27. app/prompts/__init__.py +0 -0
  28. app/prompts/prompts.py +7 -0
  29. app/reseasoning/__init__.py +1 -0
  30. app/reseasoning/descision_maker.py +55 -0
  31. app/reseasoning/query_parser.py +18 -0
  32. app/retrieval/__init__.py +1 -0
  33. app/retrieval/reranker.py +0 -0
  34. app/retrieval/retriever.py +55 -0
  35. app/schemas/__init__.py +1 -0
  36. app/schemas/metadata_schema.py +75 -0
  37. app/schemas/request_models.py +77 -0
  38. app/schemas/response_models.py +32 -0
  39. app/services/RAG_service.py +165 -0
  40. app/services/__init__.py +0 -0
  41. app/utils/__init__.py +1 -0
  42. app/utils/config_loader.py +8 -0
  43. app/utils/document_op.py +14 -0
  44. app/utils/embedding_manager.py +0 -0
  45. app/utils/logger.py +0 -0
  46. app/utils/metadata_utils.py +51 -0
  47. app/utils/model_loader.py +83 -0
  48. experiments.ipynb +0 -0
  49. main.py +76 -0
  50. pyproject.toml +191 -0
.gitignore ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+ .env
12
+ env
13
+ .env/
14
+ hackrx_rag_app
15
+ hackrx_rag_app/
16
+ .hackrx_rag_app
17
+ hackrx_rag_app/
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.12
Best.md ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ ## Text splitting techniques
2
+ 1.
Dockerfile ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.12-slim

# Set working directory
WORKDIR /app

# curl is required by the HEALTHCHECK below; python:3.12-slim does not include it,
# so without this install the healthcheck would always fail.
RUN apt-get update \
    && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for better caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the entire application
COPY . .

# Create directories the app writes to at runtime
RUN mkdir -p app/uploads app/data

# Set environment variables
ENV PYTHONPATH=/app
ENV PORT=7860

# Expose the port that Hugging Face Spaces expects
EXPOSE 7860

# Health check (uses the curl installed above)
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:7860/health || exit 1

# Command to run the FastAPI application
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
app/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # This file is automatically created to mark the directory as a package.
app/api/__init__.py ADDED
File without changes
app/api/deps.py ADDED
File without changes
app/api/v1/__init__.py ADDED
File without changes
app/api/v1/routes.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, HTTPException, Depends, UploadFile, File, Form
2
+ from fastapi.responses import JSONResponse
3
+ from typing import Optional
4
+ import os
5
+ import tempfile
6
+ from pathlib import Path
7
+ from app.core.session_manager import SessionManager, Session, session_manager
8
+ from app.services.RAG_service import RAGService
9
+ from app.schemas.request_models import QueryRequest
10
+ from app.schemas.response_models import SessionResponse, QueryResponse, UploadResponse, SourceDocument
11
+ from app.config.config import get_settings
12
+
13
+ router = APIRouter()
14
+
15
def get_session(session_id: str) -> Session:
    """FastAPI dependency: resolve *session_id* to a live Session.

    Raises:
        HTTPException: 404 when the session is unknown or has expired.
    """
    print(f"[DEBUG] Looking for session: {session_id}")
    print(f"[DEBUG] Available sessions: {list(session_manager.sessions.keys())}")
    session = session_manager.get_session(session_id=session_id)
    if session:
        print(f"[DEBUG] Session found: {session_id}")
        return session
    # Fall through: no live or restorable session under this id
    print(f"[DEBUG] Session not found: {session_id}")
    raise HTTPException(status_code=404, detail="Session not found or expired")
26
+
27
@router.post("/session", response_model=SessionResponse)
async def create_session(username: str = "anonymous"):
    """Create a new session for document processing and return its id."""
    session_id = session_manager.create_session(username)
    print(f"[DEBUG] Created session: {session_id} for user: {username}")
    print(f"[DEBUG] Total sessions now: {len(session_manager.sessions)}")
    response = SessionResponse(
        session_id=session_id,
        message="Session created successfully",
    )
    return response
37
+
38
@router.get("/sessions/{username}")
async def get_user_sessions(username: str):
    """List every persisted session belonging to *username*."""
    return {"sessions": session_manager.get_user_sessions(username)}
43
+
44
@router.post("/session/{session_id}/restore")
async def restore_session(session_id: str):
    """Re-hydrate a persisted session from the database.

    Raises:
        HTTPException: 404 when the session is missing or inactive.
    """
    if not session_manager.restore_session(session_id):
        raise HTTPException(status_code=404, detail="Session not found or inactive")
    return {"message": "Session restored successfully"}
52
+
53
@router.delete("/session/{session_id}")
async def delete_session(session_id: str):
    """Drop a session from memory and mark it inactive in the database."""
    session_manager.delete_session(session_id)
    return {"message": "Session deleted successfully"}
58
+
59
def _cleanup_temp_file(path) -> None:
    """Best-effort removal of a staged temporary upload file; never raises."""
    if path:
        try:
            os.unlink(path)
        except OSError:
            pass


@router.post("/upload/{session_id}", response_model=UploadResponse)
async def upload_document(
    session_id: str,
    file: UploadFile = File(None),
    url: str = Form(None),
    doc_type: str = Form(None),
    session: Session = Depends(get_session)
):
    """Upload and process a document from an uploaded file or a URL.

    Exactly one of *file* / *url* must be supplied. The document is loaded,
    split into chunks, embedded into a vector store, and the session record
    is updated with the resulting document metadata.

    Raises:
        HTTPException: 400 for input validation failures, 500 for pipeline errors.
    """
    settings = get_settings()

    # Validate input - either file or URL must be provided, but not both
    if not file and not url:
        raise HTTPException(status_code=400, detail="Either file or URL must be provided")
    if file and url:
        raise HTTPException(status_code=400, detail="Provide either file or URL, not both")

    document_path = None
    try:
        # Each upload gets a fresh RAG pipeline bound to this session
        session.rag_service = RAGService()

        document_name = ""
        document_url = None
        content = b""

        if file:
            # Handle file upload
            file_extension = Path(file.filename).suffix.lower()
            if file_extension not in settings.allowed_file_types:
                raise HTTPException(
                    status_code=400,
                    detail=f"File type {file_extension} not allowed. Allowed types: {settings.allowed_file_types}"
                )

            # Auto-detect document type if not provided
            if not doc_type:
                doc_type = "pdf" if file_extension == ".pdf" else "word"

            # Validate file size
            content = await file.read()
            if len(content) > settings.max_file_size:
                raise HTTPException(
                    status_code=400,
                    detail=f"File size too large. Maximum size: {settings.max_file_size} bytes"
                )

            # Ensure the upload directory exists (the document itself is staged
            # through a NamedTemporaryFile below and removed after processing)
            Path(settings.upload_dir).mkdir(exist_ok=True)

            # Save file temporarily
            with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as tmp_file:
                tmp_file.write(content)
                document_path = tmp_file.name

            document_name = file.filename

            # Load and split document from file
            session.rag_service.load_and_split_document(
                type=doc_type,
                path=document_path
            )
        else:
            # Handle URL upload; URLs default to PDF when no type was given
            if not doc_type:
                doc_type = "pdf"

            document_name = url.split('/')[-1] or "URL Document"
            document_url = url

            # Load and split document from URL
            session.rag_service.load_and_split_document(
                type=doc_type,
                url=url
            )

        # Create vector store
        session.rag_service.create_vector_store()

        # Update in-memory session state
        session.document_uploaded = True
        session.vector_store_created = True
        session.document_info = {
            "filename": document_name,
            "type": doc_type,
            "size": len(content) if file else 0,
            "chunks_count": len(session.rag_service.chunks)
        }

        # Persist document metadata so the session can be restored later
        session_manager.update_session_document(
            session_id=session_id,
            document_name=document_name,
            document_type=doc_type,
            chunks_count=len(session.rag_service.chunks),
            pinecone_index=str(session.rag_service.index),
            pinecone_namespace=session.rag_service.namespace,
            document_path=document_path,
            document_url=document_url
        )

        _cleanup_temp_file(document_path)

        return UploadResponse(
            session_id=session_id,
            filename=document_name,
            document_type=doc_type,
            chunks_created=len(session.rag_service.chunks),
            message="Document uploaded and processed successfully"
        )

    except HTTPException:
        # Do not mask deliberate 4xx validation errors as a 500
        _cleanup_temp_file(document_path)
        raise
    except Exception as e:
        _cleanup_temp_file(document_path)
        raise HTTPException(status_code=500, detail=f"Error processing document: {str(e)}")
186
+
187
@router.post("/query/{session_id}", response_model=QueryResponse)
async def query_document(
    session_id: str,
    query_request: QueryRequest,
    session: Session = Depends(get_session)
):
    """Answer a query against the document uploaded to this session.

    Returns the generated answer, any query metadata extracted by the RAG
    service, and up to three supporting source chunks for the UI.

    Raises:
        HTTPException: 400 when no document has been processed yet,
            500 on RAG pipeline errors.
    """
    if not session.document_uploaded or not session.vector_store_created:
        raise HTTPException(
            status_code=400,
            # fixed typo: "docuement" -> "document"
            detail="No document uploaded or processed for this session"
        )
    try:
        # Embed the query, retrieve matching chunks, then generate the answer
        session.rag_service.create_query_embedding(query_request.query)
        session.rag_service.retrive_documents()
        answer = session.rag_service.answer_query(query_request.query)

        # Optional metadata the RAG service may have extracted from the query
        query_metadata = getattr(session.rag_service, 'query_metadata', {})

        # Surface the top-3 retrieved chunks as source documents for the UI
        sources = []
        if hasattr(session.rag_service, 'result') and session.rag_service.result:
            for match in session.rag_service.result.get('matches', [])[:3]:
                metadata = match.get('metadata', {})
                sources.append(SourceDocument(
                    doc_id=metadata.get('doc_id', match.get('id', '')),
                    page=metadata.get('page_no', metadata.get('page', 0)),
                    text=metadata.get('text', ''),
                    score=match.get('score', 0.0),
                    metadata=metadata
                ))

        return QueryResponse(
            session_id=session_id,
            query=query_request.query,
            answer=answer,
            query_metadata=query_metadata,
            sources=sources,
            message="Query processed successfully"
        )

    except HTTPException:
        # Preserve any deliberate HTTP errors instead of wrapping them as 500
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing query: {str(e)}")
237
+
238
@router.get("/session/{session_id}/status")
async def get_session_status(
    session_id: str,
    session: Session = Depends(get_session)
):
    """Report lifecycle and document state for a session."""
    status = {
        "session_id": session_id,
        "created_at": session.created_at,
        "last_activity": session.last_activity,
        "document_uploaded": session.document_uploaded,
        "vector_store_created": session.vector_store_created,
        "document_info": session.document_info,
    }
    return status
252
+
253
+
254
+
255
+
app/config/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # This file is automatically created to mark the directory as a package.
app/config/config.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic_settings import BaseSettings
2
+ from typing import Optional
3
+ import os
4
+ from dotenv import load_dotenv
5
+ load_dotenv()
6
class Settings(BaseSettings):
    """Application settings; pydantic-settings fills fields from env vars / .env."""

    # API Settings
    api_title: str = "RAG Document Analysis API"
    api_version: str = "1.0.0"

    # File Upload Settings
    max_file_size: int = 50 * 1024 * 1024  # 50MB
    allowed_file_types: list = [".pdf", ".docx", ".doc"]
    upload_dir: str = "app/uploads"

    # Session Settings
    session_timeout_minutes: int = 60

    # SQLite location; DATABASE_PATH env var overrides for read-only deployments
    database_path: str = os.getenv("DATABASE_PATH", "/tmp/claridoc_data/sessions.db")


    # API Keys (all optional; expected to come from the environment)
    gemini_api_key: Optional[str] = None
    pinecone_api_key: Optional[str] = None
    openai_api_key: Optional[str] = None
    groq_api_key: Optional[str] = None
    hf_token: Optional[str] = None

    # Environment
    environment: str = "development"
    debug: bool = True

    class Config:
        # also read values from a local .env file
        env_file = ".env"

def get_settings() -> Settings:
    """Return a fresh Settings instance.

    NOTE(review): not cached — every call re-reads the environment; consider
    functools.lru_cache if construction cost ever matters.
    """
    return Settings()
app/config/config.yaml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ llm:
2
+ groq:
3
+ provider: "groq"
4
+ model_name: "openai/gpt-oss-20b"
5
+ gemini:
6
+ provider: "gemini"
7
+ model_name: "gemini-2.5-flash"
8
+ gemini_lite:
9
+ provider: "gemini_lite"
10
+ model_name: "gemini-2.5-flash-lite"
11
+
12
+ embedding_model:
13
+ openai:
14
+ provider: "openai"
15
+ model_name: "text-embedding-3-small"
16
+ huggingface:
17
+ provider: "huggingface"
18
+ model_name: "mixedbread-ai/mxbai-embed-large-v1"
19
+
20
+
app/core/__init__.py ADDED
File without changes
app/core/session_manager.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import uuid
2
+ from typing import Dict, Optional
3
+ from datetime import datetime, timedelta
4
+ from app.services.RAG_service import RAGService
5
+ from app.database.database import SessionDatabase
6
+ from app.schemas.request_models import DocumentTypeSchema
7
+
8
class Session:
    """In-memory state for one user session: document flags plus RAG pipeline."""

    def __init__(self, session_id: str):
        self.session_id = session_id
        self.created_at = datetime.now()
        self.last_activity = datetime.now()
        # RAG pipeline is attached lazily, on first document upload/restore
        self.rag_service: Optional[RAGService] = None
        self.document_uploaded = False
        self.vector_store_created = False
        self.document_info = {}
        self.username = None

    def update_activity(self):
        """Record that the session was just used."""
        self.last_activity = datetime.now()

    def is_expired(self, timeout_minutes: int = 60) -> bool:
        """Return True when no activity was seen within *timeout_minutes*."""
        idle_for = datetime.now() - self.last_activity
        return idle_for > timedelta(minutes=timeout_minutes)
24
+
25
class SessionManager:
    """Owns the in-memory session cache and its SQLite persistence layer."""

    def __init__(self):
        self.sessions: Dict[str, Session] = {}
        self.db = SessionDatabase()

    def create_session(self, username: str = "anonymous") -> str:
        """Create a new session, registering it both in memory and in the DB."""
        session_id = str(uuid.uuid4())
        session = Session(session_id)
        session.username = username
        self.sessions[session_id] = session

        # Save to database
        self.db.create_session(session_id, username)
        return session_id

    def _restore_from_db(self, session_id: str) -> Optional[Session]:
        """Rebuild a Session (and its RAG service) from the database record.

        Shared by get_session() and restore_session() so the two restore
        paths cannot drift (they were previously duplicated line-for-line).
        Returns None when the session is unknown or inactive.
        """
        db_session = self.db.get_session(session_id)
        if not (db_session and db_session['is_active']):
            return None

        session = Session(session_id)
        session.username = db_session['username']
        session.document_uploaded = db_session['chunks_count'] > 0
        session.vector_store_created = db_session['pinecone_index'] is not None
        session.document_info = {
            'filename': db_session['document_name'],
            'type': db_session['document_type'],
            'chunks_count': db_session['chunks_count']
        }

        # Re-attach the RAG pipeline only when a vector store was persisted
        if session.vector_store_created:
            print(f"[SessionManager] Restoring RAG service for session {session_id}")
            session.rag_service = RAGService()

            # Restore the persisted vector-store coordinates
            session.rag_service.index = db_session['pinecone_index']
            session.rag_service.namespace = db_session['pinecone_namespace']
            session.rag_service.Document_path = db_session['document_path']
            session.rag_service.url = db_session['document_url']

            # Local imports to avoid import cycles at module load time
            from app.ingestion.text_splitter import splitting_text
            from app.utils.metadata_utils import MetadataService

            # Recreate the document type info; "Insurance" is the assumed
            # default for restored sessions (the real type is not persisted)
            metadataservice = MetadataService()
            mock_scheme = DocumentTypeSchema(document_types="Insurance")
            document_type = metadataservice.Return_document_model(mock_scheme)
            session.rag_service.DocumentTypeScheme = mock_scheme
            session.rag_service.Document_Type = document_type

            # Splitter instance carries the keywords-file path used downstream
            session.rag_service.splitter = splitting_text(
                documentTypeSchema=document_type, llm=session.rag_service.llm
            )

            # Reconstruct the expected keywords file path from the document name
            import os
            document_name = db_session['document_name'] or 'unknown'
            keywords_filename = document_name.replace(".", "").replace("\\", "").replace("/", "") + ".json"
            session.rag_service.splitter.Keywordsfile_path = os.path.join("app/data/", keywords_filename)

            print(f"[SessionManager] RAG service restored with index: {session.rag_service.index}")

        self.sessions[session_id] = session
        return session

    def get_session(self, session_id: str) -> Optional[Session]:
        """Return a live session; fall back to restoring it from the database."""
        if session_id in self.sessions:
            session = self.sessions[session_id]
            if not session.is_expired():
                session.update_activity()
                return session
            # Expired: evict and try a fresh restore below
            del self.sessions[session_id]
        return self._restore_from_db(session_id)

    def update_session_document(self, session_id: str, document_name: str,
                                document_type: str, chunks_count: int,
                                pinecone_index: str = None, pinecone_namespace: str = None,
                                document_path: str = None, document_url: str = None):
        """Persist document information and mirror it into the in-memory session."""
        self.db.update_session(
            session_id,
            document_name=document_name,
            document_type=document_type,
            chunks_count=chunks_count,
            pinecone_index=pinecone_index,
            pinecone_namespace=pinecone_namespace,
            document_path=document_path,
            document_url=document_url
        )

        # Update in-memory session if exists
        if session_id in self.sessions:
            session = self.sessions[session_id]
            session.document_uploaded = True
            session.vector_store_created = pinecone_index is not None
            session.document_info = {
                'filename': document_name,
                'type': document_type,
                'chunks_count': chunks_count
            }

    def get_user_sessions(self, username: str):
        """Get all sessions for a user."""
        return self.db.get_user_sessions(username)

    def restore_session(self, session_id: str) -> bool:
        """Restore a session from database; True on success."""
        return self._restore_from_db(session_id) is not None

    def delete_session(self, session_id: str):
        """Evict a session from memory and mark it inactive in the database."""
        if session_id in self.sessions:
            del self.sessions[session_id]
        self.db.deactivate_session(session_id)

    def cleanup_expired_sessions(self):
        """Drop every in-memory session whose inactivity timeout has elapsed."""
        expired_sessions = [
            sid for sid, session in self.sessions.items()
            if session.is_expired()
        ]
        for sid in expired_sessions:
            del self.sessions[sid]

session_manager = SessionManager()
198
+
app/data/__init__.py ADDED
File without changes
app/database/database.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlite3
2
+ import json
3
+ from datetime import datetime
4
+ from typing import List, Dict, Optional
5
+ import os
6
+ from pathlib import Path
7
+ from app.config.config import get_settings
8
+
9
class SessionDatabase:
    """SQLite persistence for sessions and users.

    The callers in SessionManager read flat keys (is_active, chunks_count,
    pinecone_index, pinecone_namespace, document_path, document_url,
    document_name, document_type) from get_session() and call
    deactivate_session() — the original schema/API provided none of these,
    so every restore path raised. This version adds the missing columns,
    exposes them in get_session(), implements deactivate_session(), and
    whitelists update_session() field names so SQL identifiers can never be
    injected through kwargs.
    """

    # Flat document columns consumed by SessionManager when restoring a session.
    _DOCUMENT_COLUMNS = (
        "document_name", "document_type", "chunks_count",
        "pinecone_index", "pinecone_namespace",
        "document_path", "document_url",
    )

    # Full set of columns update_session() is allowed to touch.
    _UPDATABLE = frozenset(_DOCUMENT_COLUMNS) | {
        "last_accessed", "document_uploaded", "vector_store_created",
        "document_info", "rag_service_data", "is_active",
    }

    def __init__(self):
        settings = get_settings()
        self.db_path = settings.database_path

        # Ensure the directory exists and is writable
        db_dir = Path(self.db_path).parent
        try:
            db_dir.mkdir(parents=True, exist_ok=True)
            # Test if directory is writable
            test_file = db_dir / "test_write.tmp"
            test_file.touch()
            test_file.unlink()
        except Exception:
            # Fallback to a guaranteed-writable temp location
            import tempfile
            fallback_dir = Path(tempfile.gettempdir()) / "claridoc"
            fallback_dir.mkdir(exist_ok=True)
            self.db_path = str(fallback_dir / "sessions.db")
            print(f"Database path not writable, using fallback: {self.db_path}")

        self.init_db()

    def init_db(self):
        """Initialize the database with required tables.

        NOTE: CREATE TABLE IF NOT EXISTS does not migrate pre-existing
        databases created with the old schema; delete the old file or add a
        migration if one exists.
        """
        try:
            with sqlite3.connect(self.db_path) as conn:
                conn.execute("""
                    CREATE TABLE IF NOT EXISTS sessions (
                        session_id TEXT PRIMARY KEY,
                        username TEXT NOT NULL,
                        created_at TEXT NOT NULL,
                        last_accessed TEXT NOT NULL,
                        document_uploaded BOOLEAN DEFAULT FALSE,
                        vector_store_created BOOLEAN DEFAULT FALSE,
                        document_info TEXT DEFAULT '{}',
                        rag_service_data TEXT DEFAULT '{}',
                        document_name TEXT,
                        document_type TEXT,
                        chunks_count INTEGER DEFAULT 0,
                        pinecone_index TEXT,
                        pinecone_namespace TEXT,
                        document_path TEXT,
                        document_url TEXT,
                        is_active BOOLEAN DEFAULT TRUE
                    )
                """)

                conn.execute("""
                    CREATE TABLE IF NOT EXISTS users (
                        username TEXT PRIMARY KEY,
                        created_at TEXT NOT NULL,
                        last_login TEXT NOT NULL
                    )
                """)
                conn.commit()
                print(f"Database initialized successfully at: {self.db_path}")
        except Exception as e:
            print(f"Failed to initialize database: {e}")
            raise

    def create_session(self, session_id: str, username: str):
        """Create a new session row, upserting the owning user record."""
        try:
            with sqlite3.connect(self.db_path) as conn:
                now = datetime.now().isoformat()

                # Insert or update user, preserving the original created_at
                conn.execute("""
                    INSERT OR REPLACE INTO users (username, created_at, last_login)
                    VALUES (?, COALESCE((SELECT created_at FROM users WHERE username = ?), ?), ?)
                """, (username, username, now, now))

                # Insert session
                conn.execute("""
                    INSERT INTO sessions (session_id, username, created_at, last_accessed)
                    VALUES (?, ?, ?, ?)
                """, (session_id, username, now, now))

                conn.commit()
        except Exception as e:
            print(f"Failed to create session: {e}")
            raise

    def get_user_sessions(self, username: str) -> List[Dict]:
        """Get all sessions for a user, most recently accessed first."""
        try:
            with sqlite3.connect(self.db_path) as conn:
                conn.row_factory = sqlite3.Row
                cursor = conn.execute("""
                    SELECT * FROM sessions
                    WHERE username = ?
                    ORDER BY last_accessed DESC
                """, (username,))

                sessions = []
                for row in cursor.fetchall():
                    document_info = json.loads(row['document_info'] or '{}')
                    sessions.append({
                        'session_id': row['session_id'],
                        'created_at': row['created_at'],
                        'last_accessed': row['last_accessed'],
                        'document_uploaded': bool(row['document_uploaded']),
                        'vector_store_created': bool(row['vector_store_created']),
                        'document_name': document_info.get('filename', 'Untitled Document'),
                        'document_type': document_info.get('type', 'Unknown'),
                        'chunks_count': document_info.get('chunks_count', 0)
                    })

                return sessions
        except Exception as e:
            print(f"Failed to get user sessions: {e}")
            return []

    def get_session(self, session_id: str) -> Optional[Dict]:
        """Get session data by ID, or None when missing or unreadable.

        The returned dict includes the flat document columns SessionManager
        needs for restore, plus the parsed JSON blobs.
        """
        try:
            with sqlite3.connect(self.db_path) as conn:
                conn.row_factory = sqlite3.Row
                cursor = conn.execute(
                    "SELECT * FROM sessions WHERE session_id = ?", (session_id,)
                )

                row = cursor.fetchone()
                if row is None:
                    return None

                data = {
                    'session_id': row['session_id'],
                    'username': row['username'],
                    'created_at': row['created_at'],
                    'last_accessed': row['last_accessed'],
                    'document_uploaded': bool(row['document_uploaded']),
                    'vector_store_created': bool(row['vector_store_created']),
                    'document_info': json.loads(row['document_info'] or '{}'),
                    'rag_service_data': json.loads(row['rag_service_data'] or '{}'),
                    'is_active': bool(row['is_active']),
                }
                for col in self._DOCUMENT_COLUMNS:
                    data[col] = row[col]
                # Callers compare chunks_count numerically; never return None
                if data['chunks_count'] is None:
                    data['chunks_count'] = 0
                return data
        except Exception as e:
            print(f"Failed to get session: {e}")
            return None

    def update_session(self, session_id: str, **kwargs):
        """Update session fields; unknown field names are rejected.

        Raises:
            ValueError: when kwargs contain a name outside _UPDATABLE (this
                also blocks SQL-identifier injection through field names).
        """
        try:
            # Always update last_accessed
            kwargs['last_accessed'] = datetime.now().isoformat()

            # Convert dictionaries to JSON strings
            if 'document_info' in kwargs:
                kwargs['document_info'] = json.dumps(kwargs['document_info'])
            if 'rag_service_data' in kwargs:
                kwargs['rag_service_data'] = json.dumps(kwargs['rag_service_data'])

            unknown = set(kwargs) - self._UPDATABLE
            if unknown:
                raise ValueError(f"Unknown session fields: {sorted(unknown)}")

            # Build dynamic query from the validated field names only
            set_clause = ", ".join(f"{key} = ?" for key in kwargs)
            values = list(kwargs.values()) + [session_id]

            with sqlite3.connect(self.db_path) as conn:
                conn.execute(f"""
                    UPDATE sessions
                    SET {set_clause}
                    WHERE session_id = ?
                """, values)
                conn.commit()
        except Exception as e:
            print(f"Failed to update session: {e}")
            raise

    def deactivate_session(self, session_id: str):
        """Soft-delete: mark the session inactive but keep its row."""
        try:
            with sqlite3.connect(self.db_path) as conn:
                conn.execute(
                    "UPDATE sessions SET is_active = 0 WHERE session_id = ?",
                    (session_id,),
                )
                conn.commit()
        except Exception as e:
            print(f"Failed to deactivate session: {e}")
            raise

    def delete_session(self, session_id: str):
        """Hard-delete a session row."""
        try:
            with sqlite3.connect(self.db_path) as conn:
                conn.execute("DELETE FROM sessions WHERE session_id = ?", (session_id,))
                conn.commit()
        except Exception as e:
            print(f"Failed to delete session: {e}")
            raise
app/database/sessions.db ADDED
Binary file (32.8 kB). View file
 
app/embedding/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # This file is automatically created to mark the directory as a package.
app/embedding/embeder.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from typing import Union, List
4
+
5
class QueryEmbedding:
    """Pairs a single query string with the model that will embed it."""

    def __init__(self, query: str, embedding_model):
        self.query = query
        self.embedding_model = embedding_model

    def get_embedding(self):
        """Return the embedding vector for the stored query."""
        return self.embedding_model.embed_query(self.query)
app/embedding/vectore_store.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from pinecone import Pinecone
4
+ from pinecone import ServerlessSpec
5
+ from langchain_pinecone import PineconeVectorStore
6
+ from datetime import datetime
7
+ from uuid import uuid4
8
+
9
class VectorStore:
    """Pushes pre-split document chunks into a Pinecone serverless index."""

    def __init__(self, text_chunks, embedding_model):
        self.text_chunks = text_chunks
        # Stamped once at construction; used to derive a unique namespace below
        self.current_time = datetime.now()
        self.embedding_model = embedding_model

    def create_vectorestore(self):
        """Create (or reuse) the Pinecone index and upsert all chunks.

        Returns:
            (index, namespace): the Pinecone index handle and the
            time-stamped namespace the chunks were written under.
        """
        load_dotenv()
        pinecone_key = os.getenv("PINECONE_API_KEY")
        pc = Pinecone(api_key=pinecone_key)

        # Single shared index; per-upload isolation comes from the namespace
        index_name = "rag-project"
        if not pc.has_index(index_name):
            # dimension 1536 — presumably matches OpenAI text-embedding-3-small
            # (see config.yaml); confirm if the embedding model changes
            pc.create_index(
                name=index_name,
                dimension=1536,
                metric="cosine",
                spec=ServerlessSpec(cloud="aws", region="us-east-1")
            )

        index = pc.Index(index_name)
        uuids = [str(uuid4()) for _ in range(len(self.text_chunks))]
        vector_store = PineconeVectorStore(index=index, embedding=self.embedding_model)
        time_string = self.current_time.strftime("%Y-%m-%d-%H-%M")
        name_space = f"hackrx-index{time_string}"
        vector_store.add_documents(documents=self.text_chunks, ids=uuids, namespace=name_space)

        return index, name_space
app/ingestion/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # This file is automatically created to mark the directory as a package.
app/ingestion/file_loader.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from langchain_community.document_loaders import PyMuPDFLoader, Docx2txtLoader
3
+ import os
4
+ import tempfile
5
+ from app.schemas.request_models import DocumentTypeSchema
6
+ from langchain_core.documents import Document
7
+ from typing import List
8
+ from langchain_core.prompts import ChatPromptTemplate
9
+ from langchain_core.output_parsers import PydanticOutputParser
10
+ from app.schemas.request_models import DocumentTypeSchema
11
+
12
class FileLoader:
    """Load PDF/Word documents (from disk or URL) and classify their domain."""

    def __init__(self, llm=None):
        # The LLM is only needed by detect_document_type; may stay None otherwise.
        self.llm = llm

    def detect_document_type(self, documents: List[Document]) -> DocumentTypeSchema:
        """Detect the genre of document by reading first 2 page content by llm."""
        document_content = " ".join([doc.page_content for doc in documents])
        parser = PydanticOutputParser(pydantic_object=DocumentTypeSchema)
        prompt = ChatPromptTemplate.from_messages([
            ("system", "You are a legal/HR/financial document classifier."),
            ("human", """
            You will be given the first 2 pages of a document.
            Classify it into one of the following categories:
            - HR/Employment
            - Insurance
            - Legal/Compliance
            - Financial/Regulatory
            - Healthcare

            Respond strictly in JSON that matches the schema.

            {format_instructions}

            Document content:
            {document_content}
            """),
        ])
        chain = prompt | self.llm | parser
        result: DocumentTypeSchema = chain.invoke({
            "document_content": document_content,
            "format_instructions": parser.get_format_instructions()
        })
        return result

    def load_documents_from_url(self, url: str) -> List[Document]:
        """Download a PDF from *url* into a temp file and load it.

        Raises:
            requests.HTTPError: if the download fails.
            ValueError: if the response is not a PDF.
        """
        response = requests.get(url)
        response.raise_for_status()
        content_type = response.headers.get('Content-Type', '')
        # FIX: servers often send parameters ("application/pdf; charset=..."),
        # so a strict equality check wrongly rejected valid PDFs.
        if 'application/pdf' in content_type:
            tmp_file_path = self._save_temp_file(response.content, ".pdf")
            return self.load_pdf(tmp_file_path)
        else:
            raise ValueError("File type not supported, expected a PDF.")

    def load_pdf(self, path: str) -> List[Document]:
        """Load PDF from a local path and return its content."""
        self._validate_file_exists(path)
        loader = PyMuPDFLoader(path)
        return loader.load()

    def load_word_document(self, path: str) -> List[Document]:
        """Load Word document from a local path and return its content.

        Best-effort: loader failures are logged and an empty list is returned.
        """
        self._validate_file_exists(path)
        try:
            docx_loader = Docx2txtLoader(path)
            return docx_loader.load()
        except Exception as e:
            # Deliberately swallow so one bad file does not crash the pipeline.
            print(e)
            return []

    def _save_temp_file(self, content: bytes, suffix: str) -> str:
        """Write *content* to a NamedTemporaryFile and return its path.

        The file is created with delete=False; the caller owns cleanup.
        """
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp_file:
            tmp_file.write(content)
            return tmp_file.name

    def _validate_file_exists(self, path: str):
        """Raise FileNotFoundError when *path* does not exist on disk."""
        if not os.path.exists(path):
            raise FileNotFoundError(f"The file {path} does not exist.")
app/ingestion/text_splitter.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_core.documents import Document
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from uuid import uuid4
4
+ from typing import List, Dict
5
+ import os
6
+ import json
7
+ from app.utils.metadata_utils import MetadataService
8
+ from app.metadata_extraction.metadata_ext import MetadataExtractor
9
+ from pydantic import BaseModel
10
+ from typing import Type
11
+ from app.utils.metadata_utils import MetadataService
12
class splitting_text:
    """Split a document's pages into chunks enriched with LLM-extracted metadata.

    The first page seeds a per-document "known keywords" JSON file under
    app/data/; later pages load that file so extraction stays consistent, and
    any newly minted keywords are merged back into it.
    """

    def __init__(self, documentTypeSchema: Type[BaseModel], llm=None):
        self.llm = llm
        self.metadata_extractor = MetadataExtractor(llm=self.llm)
        self.metadata_services = MetadataService()
        # Pydantic model describing the metadata fields for this document genre.
        self.documentTypeSchema = documentTypeSchema
        # Path of the per-document keyword JSON; set while processing page 0.
        self.Keywordsfile_path = None

    def _clean_text(self, text: str) -> str:
        """Clean extracted page content"""
        # remove excessive whitespace
        text = " ".join(text.split())
        return text

    def text_splitting(self, doc: List[Document]) -> List[Document]:
        """Split document into chunks for processing.

        Args:
            doc: One Document per page (e.g. from PyMuPDFLoader).

        Returns:
            All chunks across pages, each carrying page + extracted metadata.
        """
        all_chunks = []
        splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
        for i, page in enumerate(doc):
            # Langchain Documents only expose .page_content; raw PyMuPDF pages
            # expose .get_text(). Narrowed from a bare ``except:`` so unrelated
            # errors are no longer silently swallowed.
            try:
                text = page.get_text()
            except AttributeError:
                text = page.page_content

            if i == 0:
                # First page → extract metadata and create the keyword JSON.
                output_folder = "app/data/"
                filename = page.metadata['source'].replace(".", "").replace("\\", "") + ".json"
                output_path = os.path.join(output_folder, filename)
                self.Keywordsfile_path = output_path
                Document_metadata = self.metadata_extractor.extractMetadata(
                    document=page, known_keywords={}, metadata_class=self.documentTypeSchema
                )
                extracted = Document_metadata.model_dump()
                normalized = MetadataService.normalize_dict_to_lists(metadata=extracted)

                with open(output_path, "w") as f:
                    json.dump(normalized, f, indent=4)
                known_keywords = normalized

            else:
                # Next pages → load JSON and enforce consistency.
                with open(self.Keywordsfile_path, "r") as f:
                    known_keywords = json.load(f)

                Document_metadata = self.metadata_extractor.extractMetadata(
                    document=page, known_keywords=known_keywords, metadata_class=self.documentTypeSchema
                )

                # If the extractor minted a new keyword, normalise its output
                # and merge the new values into the stored keyword lists.
                if Document_metadata.added_new_keyword:
                    new_data = self.metadata_services.normalize_dict_to_lists(
                        Document_metadata.model_dump(exclude_none=True)
                    )
                    for key, vals in new_data.items():
                        if isinstance(vals, list):
                            # Union of existing + new values, de-duplicated.
                            known_keywords[key] = list(set(known_keywords.get(key, []) + vals))
                    with open(self.Keywordsfile_path, "w") as f:
                        json.dump(known_keywords, f, indent=4)

            extracted_metadata = Document_metadata.model_dump(exclude_none=True)
            print(f"doc number: {i}")

            # Skip blank pages entirely (no chunk emitted).
            if text.strip():
                uuid = str(uuid4())
                temp_doc = Document(
                    page_content=text,
                    metadata={
                        **page.metadata,
                        **extracted_metadata,
                        "page_no": i,
                        "doc_id": uuid,
                        "chunk_id": f"{uuid}_p{i}",
                        "type": "text"
                    }
                )
                chunks = splitter.split_documents([temp_doc])
                all_chunks.extend(chunks)

        return all_chunks
98
+
99
+
app/main.py ADDED
File without changes
app/metadata_extraction/__init__.py ADDED
File without changes
app/metadata_extraction/metadata_ext.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from langchain_core.exceptions import OutputParserException
3
+ from langchain_core.documents import Document
4
+ from langchain_core.output_parsers import PydanticOutputParser
5
+ from langchain_core.prompts import ChatPromptTemplate
6
+ from typing import Type
7
+ from pydantic import BaseModel
8
+ # wrap parser with fixer once
9
+ # pydantic_parser = PydanticOutputParser(pydantic_object=InsuranceMetadata)
10
+ # fixing_parser = OutputFixingParser.from_llm(llm=llm, parser=pydantic_parser)
11
+
12
+
13
class MetadataExtractor:
    """LLM-backed extraction of structured metadata from documents and queries.

    Both public methods share one pipeline (_invoke_extraction); they differ
    only in the system-prompt wording.
    """

    def __init__(self, llm=None):
        self.llm = llm

    def _invoke_extraction(self, metadata_class: Type[BaseModel], document: Document,
                           known_keywords: dict, prompt: ChatPromptTemplate) -> BaseModel:
        """Run prompt | llm | parser; fall back to an empty model on parse failure.

        Returns a populated ``metadata_class`` instance, or one with only
        ``added_new_keyword=False`` set if the LLM output could not be parsed.
        """
        parser = PydanticOutputParser(pydantic_object=metadata_class)
        chain = prompt | self.llm | parser
        try:
            return chain.invoke({
                "schema": json.dumps(metadata_class.model_json_schema(), indent=2),
                "keywords": json.dumps(known_keywords, indent=2),
                "document_content": document.page_content
            })
        except OutputParserException as e:
            print(f"⚠️ Parser failed on doc {document.metadata.get('source')} | error: {e}")
            return metadata_class(added_new_keyword=False)  # instantiate fallback

    def extractMetadata_query(self, metadata_class: Type[BaseModel], document: Document, known_keywords: dict) -> BaseModel:
        """Extract metadata from a *user query*, reusing the known keyword pool."""
        prompt = ChatPromptTemplate.from_messages([
            ("system", """You are an information extraction system.
            Extract only the required metadata from the user query using the existing known keywords.

            ⚠️ CRITICAL FORMATTING RULES:
            - ALL fields must be arrays/lists, even if there's only one value
            - For single values, wrap in brackets: "doc_id": ["single_value"]
            - For multiple values: "coverage_type": ["value1", "value2", "value3"]
            - For null/empty fields, use: null (not empty arrays)

            ⚠️ Content Rules:
            - For exclusions and obligations, DO NOT copy full sentences.
            - Instead, extract only concise normalized keywords (2–5 words max each).
            - Use existing keywords if they already exist in the provided list.
            - Prefer to reuse existing keywords if they are semantically the same.
            - If you find a new keyword that is a sub-type or more specific variant of an existing one, keep both:
              reuse the closest match from existing keywords, and also add the new one.
            - In that case, set added_new_keyword=true.
            - Do not include raw paragraphs in the output.

            Schema you must follow:
            {schema}

            Existing Keywords:
            {keywords}
            """),
            ("human", "Text:\n{document_content}")
        ])
        return self._invoke_extraction(metadata_class, document, known_keywords, prompt)

    def extractMetadata(self, metadata_class: Type[BaseModel], document: Document, known_keywords: dict) -> BaseModel:
        """Extract metadata from a *document page*, reusing the known keyword pool."""
        prompt = ChatPromptTemplate.from_messages([
            ("system", """You are an information extraction system.
            Extract only the required metadata from the text according to schema given below.

            ⚠️ CRITICAL FORMATTING RULES:
            - ALL fields must be arrays/lists, even if there's only one value
            - For single values, wrap in brackets: "doc_id": ["single_value"]
            - For multiple values: "coverage_type": ["value1", "value2", "value3"]
            - For null/empty fields, use: null (not empty arrays)

            ⚠️ Content Rules:
            - For exclusions and obligations, DO NOT copy full sentences.
            - Instead, extract only concise normalized keywords (2–5 words max each).
            - Use existing keywords if they already exist in the provided list.
            - Prefer to reuse existing keywords if they are semantically the same.
            - If you find a new keyword that is a **sub-type** or **more specific variant** of an existing one, keep both:
              *reuse the closest match from existing keywords*, and also add the new one.
            - In that case, set `added_new_keyword=true`.
            - Do not include raw paragraphs in the output.

            Schema you must follow:
            {schema}

            Existing Keywords:
            {keywords}
            """),
            ("human", "Text:\n{document_content}")
        ])
        return self._invoke_extraction(metadata_class, document, known_keywords, prompt)
app/prompts/__init__.py ADDED
File without changes
app/prompts/prompts.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
# System prompt for the query-parsing step: instructs the LLM to convert a
# free-form user question into a JSON spec (intent/entities/constraints/
# answer_type) matching QuerySpec.
# FIX: was an f-string with no placeholders — any literal brace added later
# would have raised at import time. The string value is unchanged.
PARSER_PROMPT = """You receive a user's question about an insurance/contract document. Produce a JSON with keys:
- intent (one of: coverage_check, definition, limit_query, waiting_period, exclusions, other)
- entities (map of entity_name -> canonical string)
- constraints (map: plan, time_window, eligible_person, numerical_constraints)
- answer_type (one of: yes_no, short_explain, detailed, clause_list)
Return ONLY the JSON.Make sure that nested fields like "entities" and "constraints" are JSON objects, not strings.
"""
app/reseasoning/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # This file is automatically created to mark the directory as a package.
app/reseasoning/descision_maker.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.schemas.request_models import QuerySpec, LogicResult
2
+
3
+
4
def evaluate_with_llm(raw_query: str, top_clauses: list, llm):
    """
    Use the LLM to analyze retrieved clauses and return a structured decision.

    Args:
        raw_query: The user's original question.
        top_clauses: Retrieved clause hits; each exposes .doc_id, .page, .text.
        llm: Chat model supporting ``with_structured_output``.

    Returns:
        LogicResult whose evidence entries are enriched with the full clause
        text when a matching retrieved clause is found.

    NOTE(review): LogicResult is imported from app.schemas.request_models,
    where its definition appears commented out — confirm the import resolves.
    """
    # Number the clauses so the model can cite them unambiguously.
    context_clauses = [
        f"{idx}) [source:{clause.doc_id} page:{clause.page}] {clause.text}"
        for idx, clause in enumerate(top_clauses, 1)
    ]
    # Join hoisted out of the f-string (was an inline ``chr(10).join`` hack).
    clause_block = "\n".join(context_clauses)
    print(clause_block)

    prompt = f"""
    You are an insurance policy analyst. Question: "{raw_query}"

    Provided clauses (numbered):
    {clause_block}

    Task:
    1) Decide: COVERED / NOT_COVERED / CONDITIONAL
    2) Summarize the exact clause(s) that justify your decision.
    3) List any conditions, waiting periods, sublimits, or exclusions relevant.
    4) Provide a concise final answer (1-2 sentences).

    Return JSON with these exact keys:
    {{
      "decision": "...",
      "evidence": [
        {{"doc_id": "...", "page": 0, "snippet": "...", "reason": "..."}}
      ],
      "confidence": 0.0,
      "rationale": "...",
      "answer": "..."
    }}
    """

    # Parse directly into LogicResult via structured output.
    structured_llm = llm.with_structured_output(LogicResult)
    result: LogicResult = structured_llm.invoke(prompt)

    # Attach the full retrieved text to each evidence item the LLM cited.
    enriched_evidence = []
    for ev in result.evidence:
        matched = next(
            (c for c in top_clauses
             if c.doc_id == ev.doc_id and str(c.page) == str(ev.page)),
            None,
        )
        if matched:
            ev.text = matched.text
        enriched_evidence.append(ev)

    result.evidence = enriched_evidence
    return result
app/reseasoning/query_parser.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.utils.model_loader import ModelLoader
2
+ from app.schemas.request_models import QuerySpec
3
+ from app.prompts.prompts import PARSER_PROMPT
4
+
5
def parsing_query(query: str, llm) -> QuerySpec:
    """Turn a raw user question into a structured QuerySpec using the LLM."""
    # Constrain the model's output to the QuerySpec schema.
    spec_model = llm.with_structured_output(QuerySpec)

    # Parsing instructions first, then the user's question.
    combined = f"{PARSER_PROMPT}\n{query}"

    # Invoke the model; the result is already parsed/validated as QuerySpec.
    parsed: QuerySpec = spec_model.invoke(combined)
    return parsed
app/retrieval/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # This file is automatically created to mark the directory as a package.
app/retrieval/reranker.py ADDED
File without changes
app/retrieval/retriever.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from app.schemas.request_models import ClauseHit
2
+
3
class Retriever:
    """Thin wrapper around a Pinecone index query for fetching matching chunks."""

    def __init__(self, pinecone_index, query=None, metadata=None, namespace=None):
        self.pinecone_index = pinecone_index  # Pinecone index object
        self.query = query                    # query embedding vector
        self.metadata = metadata              # optional metadata filter dict
        self.namespace = namespace            # optional Pinecone namespace

    def retrieval_from_pinecone_vectoreStore(self, top_k=3):
        """
        Retrieve the top matching chunks from Pinecone.

        Args:
            top_k: How many chunks to retrieve.

        Returns:
            The raw Pinecone query response (matches with metadata, no vectors).
        """
        query_kwargs = dict(
            vector=self.query,
            top_k=top_k,
            include_metadata=True,
            include_values=False,
            filter=self.metadata,
            namespace=self.namespace,
        )
        return self.pinecone_index.query(**query_kwargs)
55
+
app/schemas/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # This file is automatically created to mark the directory as a package.
app/schemas/metadata_schema.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, model_validator,field_validator, HttpUrl, Field
2
+ from typing import List, Dict, Any, Optional, Union, Literal
3
+
4
class CommonMetaData(BaseModel):
    """Metadata fields shared by every document domain.

    Per the extraction prompt's formatting rules, all fields are lists of
    strings and default to None when absent from the document.
    """

    # --- Common metadata (across all domains) ---
    doc_id: Optional[List[str]] = Field(None, description="Unique document identifier")
    doc_category: Optional[List[str]] = Field(None, description="General pool/category e.g. Insurance, HR, Legal")
    doc_type: Optional[List[str]] = Field(None, description="Specific type e.g. Policy doc, Contract, Handbook")
    jurisdiction: Optional[List[str]] = Field(
        default=None, description="Applicable jurisdictions/regions/countries"
    )
    effective_date: Optional[List[str]] = Field(None, description="Date from which the document is effective")
    expiry_date: Optional[List[str]] = Field(None, description="Date until which the document is valid")
    parties: Optional[List[str]] = Field(None, description="Involved parties (e.g., employer/employee, insurer/insured)")
    # obligations: Optional[List[str]] = Field(
    #     default=None,
    #     description="List of short, normalized obligation keywords (2–5 words each, no full sentences)"
    # )
    penalties: Optional[List[str]] = Field(None, description="Penalties/non-compliance consequences")
    notes: Optional[List[str]] = Field(None, description="Freeform additional metadata")
21
+
22
class InsuranceMetadata(CommonMetaData):
    """Insurance-specific metadata extracted per page.

    ``added_new_keyword`` is set by the extractor when it mints a keyword not
    present in the known-keywords pool (triggers a merge back into the JSON).
    """

    # --- Insurance ---
    policy_number: Optional[List[str]] = None
    coverage_type: Optional[List[str]] = Field(
        default=None,
        description="Type(s) of coverage. Short keywords (1–3 words each)."
    )
    premium_amount: Optional[List[str]] = None
    exclusions: Optional[List[str]] = Field(
        description="List of normalized keywords representing exclusions (short, 2-5 words each, not full paragraphs).", default=None
    )
    # Flag: did the LLM introduce a keyword outside the known pool?
    added_new_keyword: bool = False
35
+
36
class HRMetadata(CommonMetaData):
    """HR/Employment document metadata.

    NOTE(review): fields here are scalars, unlike the list-typed convention
    used in CommonMetaData/InsuranceMetadata — confirm intended.
    """

    # --- HR / Employment ---
    policy_type: Optional[str] = None
    applicable_roles: Optional[List[str]] = None
    notice_period: Optional[str] = None
41
+
42
class LegalMetadata(CommonMetaData):
    """Legal/Compliance document metadata (scalar fields)."""

    # --- Legal / Compliance ---
    clause_type: Optional[str] = None
    governing_law: Optional[str] = None
    duration: Optional[str] = None
48
+
49
class FinancialMetadata(CommonMetaData):
    """Financial/Regulatory document metadata (scalar fields)."""

    # --- Financial / Regulatory ---
    section: Optional[str] = None
    compliance_requirement: Optional[str] = None
    reporting_frequency: Optional[str] = None
55
+
56
class HealthcareMetadata(CommonMetaData):
    """Healthcare/Pharma document metadata (scalar fields)."""

    # --- Healthcare / Pharma ---
    disease: Optional[str] = None
    treatment_limit: Optional[str] = None
    validity_period: Optional[str] = None
62
+
63
class ProcurementMetadata(CommonMetaData):
    """Procurement/Vendor-management document metadata."""

    # --- Procurement / Vendor Management ---
    vendor_name: Optional[str] = None
    contract_value: Optional[str] = None
    payment_terms: Optional[str] = None
    sla_metrics: Optional[List[str]] = None
70
+
71
+
72
+
73
+
74
+
75
+
app/schemas/request_models.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, model_validator,field_validator, HttpUrl, Field
2
+ from typing import List, Dict, Any, Optional, Union, Literal
3
+ import json
4
class QueryRequest(BaseModel):
    """Request body for the query endpoint: a single free-form user question."""

    query: str  # raw natural-language question
6
+
7
+
8
+
9
+
10
+
11
+ # class QuerySpec(BaseModel):
12
+ # raw_query: str # query of the user
13
+ # intent: str # High-level purpose, e.g., "coverage_check" — helps routing aur rules.
14
+ # entities: Dict[str, Union[str, List[str]]] = Field(default_factory= dict) # Extracted items (policy number, dates, amounts) — structured
15
+ # constraints : Dict[str, Any] = Field(default_factory=dict) # filters like {"jurisdiction":"IN","incident_date":"2024-01-01"}
16
+ # answer_type: Optional[str] = "detailed"
17
+ # followups: Optional[List[str]] = Field(default_factory=list) # followups for user
18
+
19
+ # @model_validator(mode = "before")
20
+ # @classmethod
21
+ # def parse_nested_json(cls, values): # parsing nested json to load
22
+ # for field in ['entities', 'constraints']:
23
+ # val = values.get(field)
24
+ # if isinstance(val, str):
25
+ # try:
26
+ # values[field] = json.loads(val)
27
+ # except json.JSONDecodeError:
28
+ # pass
29
+ # return values
30
+
31
+ # class ClauseHit(BaseModel):
32
+ # doc_id : str # id of the document
33
+ # page: int # pdf page id
34
+ # chunk_id: str
35
+ # text: str # Evidence text used for answer.
36
+ # metadata: Dict[str, Any] = Field(default_factory=dict) # metadata
37
+ # score: float # Retrieval similarity score
38
+ # boost: Optional[float] = None
39
+ # combined_score: Optional[float] = None
40
+
41
+ # @field_validator("metadata", mode="before")
42
+ # def parse_metadata(cls, v):
43
+ # if isinstance(v, str):
44
+ # try:
45
+ # return json.loads(v) if v.strip() else {}
46
+ # except json.JSONDecodeError:
47
+ # return {}
48
+ # return v
49
+
50
+ # class LogicResult(BaseModel):
51
+ # answer: str
52
+ # decision: str # "covered"/"not_covered"/"conditional"
53
+ # confidence: float # 0..1 score for calibration/thresholding.
54
+ # evidence: List[ClauseHit] = Field(default_factory=list) # List of ClauseHit used to justify the answer.
55
+ # rationale: Optional[str] = None # Short human-readable reason (audit-friendly).
56
+
57
+ # class HackRxRunRequest(BaseModel):
58
+ # documents: HttpUrl = Field(
59
+ # ...,
60
+ # description="URL to the document (PDF, DOCX, or email blob)"
61
+ # )
62
+ # questions: List[str] = Field(
63
+ # ...,
64
+ # description="List of questions to query against the document"
65
+ # )
66
+
67
class DocumentTypeSchema(BaseModel):
    """Classification result: the single domain category of a document.

    NOTE(review): this category list differs from the one offered in
    FileLoader.detect_document_type's prompt (which includes "Healthcare" but
    not "Government/Public Policy" or "Technical/IT Policies") — confirm the
    two lists are meant to diverge, otherwise classification can fail validation.
    """

    document_types: Literal[
        "HR/Employment",
        "Insurance",
        "Legal/Compliance",
        "Financial/Regulatory",
        "Government/Public Policy",
        "Technical/IT Policies"
    ] = Field(..., description="The category/type of the document")
76
+
77
+
app/schemas/response_models.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel
2
+ from typing import List, Dict, Any, Optional
3
+
4
class SourceDocument(BaseModel):
    """One retrieved source chunk returned alongside an answer."""

    doc_id: str               # id of the source document
    page: int                 # page number the chunk came from
    text: str                 # chunk text used as evidence
    score: float              # retrieval similarity score
    metadata: Dict[str, Any]  # chunk metadata as stored in the vector store
10
+
11
class QueryResponse(BaseModel):
    """Response for a query request: answer plus optional provenance."""

    session_id: str
    query: str      # echo of the user's question
    answer: str     # LLM-generated answer text
    query_metadata: Optional[Dict[str, Any]] = None  # metadata extracted from the query
    sources: Optional[List[SourceDocument]] = None   # supporting chunks, if returned
    message: str    # human-readable status message
18
+
19
class SessionResponse(BaseModel):
    """Response for session create/close operations."""

    session_id: str
    message: str  # human-readable status message
22
+
23
class UploadResponse(BaseModel):
    """Response after a document upload + ingestion."""

    session_id: str
    filename: str
    document_type: str  # detected category (see DocumentTypeSchema)
    chunks_created: int # number of chunks produced by the splitter
    message: str        # human-readable status message
29
+
30
class ErrorResponse(BaseModel):
    """Standard error payload."""

    detail: str
    error_code: Optional[str] = None  # machine-readable code, when available
app/services/RAG_service.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ from app.utils.model_loader import ModelLoader
3
+ from app.ingestion.file_loader import FileLoader
4
+ from app.ingestion.text_splitter import splitting_text
5
+ from app.retrieval.retriever import Retriever
6
+ from app.embedding.embeder import QueryEmbedding
7
+ from app.embedding.vectore_store import VectorStore
8
+ from app.metadata_extraction.metadata_ext import MetadataExtractor
9
+ from app.utils.metadata_utils import MetadataService
10
+ # from app.utils.document_op import DocumentOperation
11
+ from langchain_core.documents import Document
12
+ import json
13
+ from typing import List, Optional
14
+ # ...existing imports...
15
+
16
# Global model instances (loaded once)
_embedding_model = None

def get_models():
    """Return the process-wide embedding model, loading it on first call."""
    global _embedding_model
    if _embedding_model is None:
        print("Loading models (one-time initialization)...")
        # ModelLoader exposes the embedding model through load_llm().
        loader = ModelLoader(model_provider="huggingface")
        _embedding_model = loader.load_llm()
    return _embedding_model
26
+
27
class RAGService:
    """End-to-end RAG pipeline: load and split a document, build a Pinecone
    vector store, embed queries with metadata filters, retrieve, and answer
    via the LLM.

    NOTE(review): instances hold per-document state (chunks, index,
    namespace), so one RAGService presumably serves one document/session at a
    time — confirm against the session manager.
    """

    def __init__(self):
        print("[RAGService] Initializing service...")
        self._init_models()
        # NOTE(review): ``Docuement_Type`` (typo) is never read elsewhere in
        # this class; only ``Document_Type`` below is used. Candidate removal.
        self.Docuement_Type = None
        self.Pinecone_index = None
        self.Document_path = None
        self.Document_Type = None       # pydantic metadata model for the detected genre
        self.DocumentTypeScheme = None  # DocumentTypeSchema classification result
        self.url = None
        self.chunks = None              # chunks produced by the splitter
        self.vector_store = None
        self.index = None               # Pinecone index object
        self.namespace = None           # Pinecone namespace holding this doc's vectors
        self.retriever = None
        self.metadataservice = MetadataService()
        print("[RAGService] Initialization complete.")

    def _init_models(self):
        """Initialize LLM and embedding Models"""
        print("[RAGService] Loading LLM model (gemini)...")
        self.model_loader = ModelLoader(model_provider="gemini")
        self.llm = self.model_loader.load_llm()
        print("[RAGService] LLM model loaded.")
        print("[RAGService] Loading embedding model (huggingface)...")
        # Shared, lazily-loaded global embedding model (see get_models()).
        self.embedding_model = get_models()
        print("[RAGService] Embedding model loaded.")

    def load_and_split_document(self, type: str, path: str = None, url: str = None):
        """Load and chunk document from local path or URL.

        Args:
            type: "pdf" or "word". NOTE(review): shadows the ``type`` builtin;
                renaming would break keyword callers, so left as-is.
            path: local file path (preferred when both are given).
            url: remote URL (PDF only).

        Raises:
            ValueError: on unsupported type or missing/unsupported source.

        Side effects: sets DocumentTypeScheme, Document_Type, splitter, chunks.
        """
        print(f"[RAGService] Loading document. Type: {type}, Path: {path}, URL: {url}")
        file_loader = FileLoader(llm = self.llm)
        if type == "pdf":
            if path:
                print(f"[RAGService] Loading PDF from path: {path}")
                doc = file_loader.load_pdf(path)
            elif url:
                print(f"[RAGService] Loading PDF from URL: {url}")
                doc = file_loader.load_documents_from_url(url)
            else:
                print("[RAGService] Error: Either path or url must be provided for PDF.")
                raise ValueError("Either path or url must be provided for PDF.")
        elif type == "word":
            if path:
                print(f"[RAGService] Loading Word document from path: {path}")
                doc = file_loader.load_word_document(path)
            elif url:
                print("[RAGService] Error: URL loading not supported for Word documents.")
                raise ValueError("URL loading not supported for Word documents.")
            else:
                print("[RAGService] Error: Path must be provided for Word document.")
                raise ValueError("Path must be provided for Word document.")
        else:
            print("[RAGService] Error: Unsupported document type.")
            raise ValueError("Unsupported document type. Use 'pdf' or 'word'.")

        # Classify the document genre from its first two pages, then pick the
        # matching pydantic metadata model for extraction.
        print("[RAGService] Detecting document type scheme...")
        self.DocumentTypeScheme = file_loader.detect_document_type(doc[0:2])
        print(f"[RAGService] Document type scheme detected: {self.DocumentTypeScheme}")
        self.Document_Type = self.metadataservice.Return_document_model(self.DocumentTypeScheme)
        print(f"[RAGService] Document type model: {self.Document_Type}")
        self.splitter = splitting_text(documentTypeSchema=self.Document_Type, llm=self.llm)
        print("[RAGService] Splitting document into chunks...")
        self.chunks = self.splitter.text_splitting(doc)
        print(f"[RAGService] Total chunks created: {len(self.chunks)}")

    def create_query_embedding(self, query: str):
        """Embed the query and extract a Pinecone-compatible metadata filter.

        Requires load_and_split_document() to have run first (uses the
        splitter's keyword file and the detected Document_Type).
        """
        print("[RAGService] Creating query embedding...")
        self.query_embedder = QueryEmbedding(query=query, embedding_model=self.embedding_model)
        self.query_embedding = self.query_embedder.get_embedding()
        print(f"[RAGService] Query embedding created: {self.query_embedding}")
        langchain_doc = Document(page_content=query)
        print("[RAGService] Extracting metadata for the query...")
        self.metadataExtractor = MetadataExtractor(llm=self.llm)
        # Reuse the keywords accumulated during ingestion so query metadata
        # matches the vocabulary stored with the chunks.
        with open(self.splitter.Keywordsfile_path, "r") as f:
            known_keywords = json.load(f)
        raw_metadata = self.metadataExtractor.extractMetadata_query(self.Document_Type,langchain_doc, known_keywords = known_keywords)
        print(f"[RAGService] Query metadata extracted: {raw_metadata}")
        # Convert to dictionary and format for Pinecone
        metadata_dict = raw_metadata.model_dump(exclude_none=True)
        formatted_metadata = self.metadataservice.format_metadata_for_pinecone(metadata_dict)

        # Remove problematic fields that cause serialization issues
        self.query_metadata = {
            k: v for k, v in formatted_metadata.items()
            if k not in ["obligations", "exclusions", "notes", "added_new_keyword"]
        }

        print(f"[RAGService] Query metadata type: {type(self.query_metadata)}")
        print(f"[RAGService] Query metadata: {self.query_metadata}")

    def create_vector_store(self):
        """Index the chunks into Pinecone; stores the index and namespace."""
        print("[RAGService] Creating vector store...")
        self.vector_store = VectorStore(self.chunks, self.embedding_model)
        self.index, self.namespace = self.vector_store.create_vectorestore()
        print(f"[RAGService] Vector store created. Index: {self.index}, Namespace: {self.namespace}")

    def retrive_documents(self):
        """Query Pinecone with the embedding + metadata filter; stores matches."""
        print("[RAGService] Retrieving documents from vector store...")
        self.retriever = Retriever(self.index,self.query_embedding,self.query_metadata, self.namespace)
        self.result = self.retriever.retrieval_from_pinecone_vectoreStore()
        print(f"[RAGService] Retrieval result: {self.result}")

    def answer_query(self, raw_query: str) -> str:
        """Answer user query using retrieved documents and LLM.

        Requires retrive_documents() to have populated self.result.
        Returns the LLM's answer as plain text.
        """
        print(f"[RAGService] Answering query: {raw_query}")
        top_clause = self.result['matches']
        top_clause_dicts = [r.to_dict() for r in top_clause]
        self.top_clauses = top_clause_dicts
        # Strip noisy/irrelevant metadata before handing clauses to the LLM.
        keys_to_remove = {"file_path", "source", "producer", "keywords", "subject", "added_new_keyword", "author", "chunk_id"}
        for r in top_clause_dicts:
            meta = r.get("metadata", {})
            for k in keys_to_remove:
                meta.pop(k, None)

        # Compact JSON keeps the prompt small.
        context_clauses = json.dumps(top_clause_dicts, separators=(",", ":"))

        print(f"context_clauses: {context_clauses}")

        prompt = f"""
        You are a legal/insurance domain expert and policy analyst.
        Use the following extracted clauses from policy documents to answer the question.
        If you can't find the answer, say "I don't know".
        Context clauses:
        {"".join(context_clauses)}
        Question: {raw_query}
        """
        print("[RAGService] Invoking LLM with prompt...")
        response = self.llm.invoke(prompt)
        print(f"[RAGService] LLM response: {response}")

        # Extract string content from response object
        # (chat models return a message object; raw strings pass through).
        if hasattr(response, 'content'):
            return response.content
        elif isinstance(response, str):
            return response
        else:
            return str(response)
app/services/__init__.py ADDED
File without changes
app/utils/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # This file is automatically created to mark the directory as a package.
app/utils/config_loader.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import os
3
+
4
def load_config(config_path: str = "app/config/config.yaml") -> dict:
    """Load and parse the application's YAML configuration file.

    Args:
        config_path: Path to the YAML config (defaults to the app config).

    Returns:
        The parsed configuration as a dict (``None`` if the file is empty,
        per ``yaml.safe_load`` semantics).
    """
    # Explicit encoding: without it, open() uses the platform default
    # (e.g. cp1252 on Windows) and non-ASCII config values would break.
    with open(config_path, "r", encoding="utf-8") as file:
        return yaml.safe_load(file)
app/utils/document_op.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
class DocumentOperation:
    """Utility operations on document files."""

    # Known extensions mapped to coarse document categories.
    _EXTENSION_TYPES = {
        ".txt": "text",
        ".pdf": "pdf",
        ".doc": "word",
        ".docx": "word",
    }

    @staticmethod
    def get_file_type_by_extension(filename):
        """Classify *filename* by extension.

        Returns one of "text", "pdf", "word", or "unknown" for anything else
        (including files with no extension). Matching is case-insensitive.
        """
        _, extension = os.path.splitext(filename)
        return DocumentOperation._EXTENSION_TYPES.get(extension.lower(), "unknown")
app/utils/embedding_manager.py ADDED
File without changes
app/utils/logger.py ADDED
File without changes
app/utils/metadata_utils.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.schemas.metadata_schema import InsuranceMetadata, CommonMetaData
2
+ from app.schemas.request_models import DocumentTypeSchema
3
class MetadataService:
    """Maps document types to metadata schemas and normalizes metadata
    dicts into Pinecone-compatible filter shapes."""

    def __init__(self):
        # Insurance has a dedicated schema; every other supported domain
        # falls back to the shared CommonMetaData model.
        self.metadata_models = {
            "Insurance": InsuranceMetadata,
            "HR/Employment": CommonMetaData,
            "Legal/Compliance": CommonMetaData,
            "Financial/Regulatory": CommonMetaData,
            "Government/Public Policy": CommonMetaData,
            "Technical/IT Policies": CommonMetaData,
        }

    @staticmethod
    def format_metadata_for_pinecone(metadata: dict) -> dict:
        """
        Convert list fields in metadata to Pinecone's valid filter format using $in.

        Non-list values pass through unchanged; keys whose value is an
        empty list are omitted entirely (no valid Pinecone filter for them).
        """
        formatted = {}
        for field, raw in metadata.items():
            if not isinstance(raw, list):
                formatted[field] = raw
            elif raw:
                formatted[field] = {"$in": raw}
        return formatted

    def Return_document_model(self, doc_type_schema: DocumentTypeSchema):
        """
        Returns appropriate metadata model based on document type.

        Args:
            doc_type_schema: DocumentTypeSchema object containing document type

        Returns:
            Appropriate Pydantic model class for the document type
            (CommonMetaData when the type is unrecognized).
        """
        return self.metadata_models.get(doc_type_schema.document_types, CommonMetaData)

    @staticmethod
    def normalize_dict_to_lists(metadata: dict) -> dict:
        """Convert dict values to lists if they aren't already.

        ``None`` becomes an empty list; existing lists pass through.
        """
        return {
            key: [] if value is None
            else (value if isinstance(value, list) else [value])
            for key, value in metadata.items()
        }
app/utils/model_loader.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from config_loader import
2
+ import os
3
+ from dotenv import load_dotenv
4
+ from pydantic import BaseModel, Field
5
+ from typing import Literal, Optional,Any
6
+ from app.utils.config_loader import load_config
7
+ from langchain_groq import ChatGroq
8
+ from langchain_google_genai import ChatGoogleGenerativeAI
9
+ from dotenv import load_dotenv
10
+ from langchain_huggingface import HuggingFaceEmbeddings
11
+ # from langchain_openai import OpenAIEmbeddings
12
+ from langchain_community.embeddings import OpenAIEmbeddings
13
class ConfigLoader:
    """Thin wrapper over the parsed YAML config with dict-style access."""

    def __init__(self):
        print(f"Loading config....")
        self.config = load_config()

    def __getitem__(self, key):
        """Allow dictionary-like access, e.g. ``loader["llm"]["gemini"]``."""
        return self.config[key]
21
+
22
class ModelLoader(BaseModel):
    """Factory that loads a chat/embedding model for the configured provider.

    The provider is chosen via ``model_provider``; model names come from the
    YAML config loaded into ``self.config``.
    """

    model_provider: Literal["groq", "gemini", "openai", "gemini_lite", "huggingface"] = "gemini"
    # Populated in model_post_init; excluded from pydantic serialization.
    config: Optional[ConfigLoader] = Field(default=None, exclude=True)

    def model_post_init(self, __context: Any) -> None:
        # Pydantic v2 hook, runs after model creation: guarantees the config
        # is loaded for every ModelLoader instance.
        self.config = ConfigLoader()

    class Config:
        # Allows ConfigLoader (a non-pydantic class) as a field type.
        arbitrary_types_allowed = True

    def load_llm(self):
        """
        Load and return the LLM (or embedding) model for ``model_provider``.

        Raises:
            ValueError: if ``model_provider`` is not supported.
        """
        print("LLM loading...")
        print("Loading model from provider: ")
        # Load .env once for every provider so API keys are in the environment.
        load_dotenv()

        if self.model_provider == "groq":
            print("Loading model from GROQ:")
            groq_api_key = os.getenv("GROQ_API_KEY")
            model_name = self.config["llm"]["groq"]["model_name"]
            llm = ChatGroq(model=model_name, api_key=groq_api_key)
        elif self.model_provider in ("gemini", "gemini_lite"):
            # The two Gemini variants differ only in the configured model name,
            # so the duplicated branches are collapsed into one.
            if self.model_provider == "gemini":
                print("Loading model from gemini:")
            else:
                print("Loading model from gemini-flash-lite:")
            gemini_api_key = os.getenv("GEMINI_API_KEY")
            model_name = self.config["llm"][self.model_provider]["model_name"]
            llm = ChatGoogleGenerativeAI(
                model=model_name,
                google_api_key=gemini_api_key,
            )
        elif self.model_provider == "openai":
            print("Loading model from openai:")
            api_key = os.getenv("OPENAI_API_KEY")
            # NOTE(review): this branch returns an *embedding* model (config
            # key "embedding_model"), mirroring the original behavior despite
            # the method name — confirm intent.
            model_name = self.config["embedding_model"]["openai"]["model_name"]
            llm = OpenAIEmbeddings(model=model_name, api_key=api_key)
        elif self.model_provider == "huggingface":
            print("Loading model from huggingface:")
            api_key = os.getenv("HF_TOKEN")
            # Security: never print the token itself. Also guard against a
            # missing token — os.environ assignment raises TypeError on None.
            if api_key:
                os.environ["HF_TOKEN"] = api_key
            model_name = self.config["embedding_model"]["huggingface"]["model_name"]
            llm = HuggingFaceEmbeddings(model=model_name)
        else:
            raise ValueError(f"Unsupported model provider: {self.model_provider}")
        return llm
+
81
+
82
+
83
+
experiments.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
main.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI, HTTPException, Depends, UploadFile, File, Form
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
import uvicorn
import os
import tempfile
from typing import Optional
import uuid
from datetime import datetime

from app.api.v1.routes import router as api_router
from app.core.session_manager import session_manager
from app.config.config import get_settings

# Initialize FastAPI app
app = FastAPI(
    title="ClariDoc API",
    description="Professional Document Analysis & RAG Platform API",
    version="1.0.0"
)

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Configure this properly for production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.on_event("startup")
async def startup_event():
    """Initialize database and other startup tasks."""
    try:
        # Test database connection
        session_manager.db.init_db()
        print("Database connection verified successfully")
    except Exception as e:
        # Deliberately non-fatal: the API still starts so /health can report
        # the database as unhealthy instead of the container crash-looping.
        print(f"Warning: Database initialization failed: {e}")

# Include API routes
app.include_router(api_router, prefix="/api/v1")

@app.get("/")
async def root():
    """Service banner endpoint."""
    return {
        "message": "ClariDoc API",
        "status": "running",
        "description": "Professional Document Analysis & RAG Platform"
    }

@app.get("/health")
async def health_check():
    """Health check endpoint for Docker and monitoring.

    Reports overall service status plus a separate database status so
    monitoring can distinguish app-up/db-down from total failure.
    """
    try:
        # Test database connection
        session_manager.db.init_db()
        db_status = "healthy"
    except Exception:
        db_status = "unhealthy"

    return {
        "status": "healthy",
        "service": "ClariDoc FastAPI Backend",
        "database": db_status,
        "timestamp": datetime.now()
    }

if __name__ == "__main__":
    # BUGFIX: the app object lives in this root-level main.py; the previous
    # target "app.main:app" pointed at app/main.py, which is empty, so the
    # server could never start.
    uvicorn.run(
        "main:app",
        host="0.0.0.0",
        port=int(os.getenv("PORT", 8000)),
        reload=False,  # Disable reload in production
        log_level="info"
    )
pyproject.toml ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "rag-app"
3
+ version = "0.1.0"
4
+ description = "Add your description here"
5
+ readme = "README.md"
6
+ requires-python = "==3.12.4"
7
+ dependencies = [
8
+ "acres==0.5.0",
9
+ "aiohappyeyeballs==2.6.1",
10
+ "aiohttp==3.12.15",
11
+ "aiohttp-retry==2.9.1",
12
+ "aiosignal==1.4.0",
13
+ "annotated-types==0.7.0",
14
+ "anyio==4.10.0",
15
+ "asttokens==3.0.0",
16
+ "attrs==25.3.0",
17
+ "beautifulsoup4==4.13.4",
18
+ "cachetools==5.5.2",
19
+ "certifi==2025.8.3",
20
+ "cffi==1.17.1",
21
+ "charset-normalizer==3.4.3",
22
+ "ci-info==0.3.0",
23
+ "click==8.2.1",
24
+ "colorama==0.4.6",
25
+ "colorclass==2.2.2",
26
+ "comm==0.2.3",
27
+ "compressed-rtf==1.0.7",
28
+ "configobj==5.0.9",
29
+ "configparser==7.2.0",
30
+ "cryptography==45.0.6",
31
+ "dataclasses-json==0.6.7",
32
+ "debugpy==1.8.16",
33
+ "decorator==5.2.1",
34
+ "distro==1.9.0",
35
+ "docx==0.2.4",
36
+ "docx2txt>=0.9",
37
+ "easygui==0.98.3",
38
+ "ebcdic==1.1.1",
39
+ "etelemetry==0.3.1",
40
+ "executing==2.2.0",
41
+ "extract-msg==0.55.0",
42
+ "fastapi==0.116.1",
43
+ "filelock==3.18.0",
44
+ "filetype==1.2.0",
45
+ "frozenlist==1.7.0",
46
+ "google-ai-generativelanguage==0.6.18",
47
+ "google-api-core==2.25.1",
48
+ "google-auth==2.40.3",
49
+ "googleapis-common-protos==1.70.0",
50
+ "greenlet==3.2.4",
51
+ "groq==0.31.0",
52
+ "grpcio==1.74.0",
53
+ "grpcio-status==1.74.0",
54
+ "h11==0.16.0",
55
+ "hf-xet>=1.1.8",
56
+ "httpcore==1.0.9",
57
+ "httplib2==0.22.0",
58
+ "httpx==0.28.1",
59
+ "httpx-sse==0.4.1",
60
+ "huggingface-hub[cli]>=0.34.4",
61
+ "idna==3.10",
62
+ "iniconfig==2.1.0",
63
+ "ipykernel==6.30.1",
64
+ "ipython==9.4.0",
65
+ "ipython-pygments-lexers==1.1.1",
66
+ "jedi==0.19.2",
67
+ "jiter==0.10.0",
68
+ "joblib==1.5.1",
69
+ "jsonpatch==1.33",
70
+ "jsonpointer==3.0.0",
71
+ "jupyter-client==8.6.3",
72
+ "jupyter-core==5.8.1",
73
+ "langchain==0.3.27",
74
+ "langchain-community==0.3.27",
75
+ "langchain-core==0.3.74",
76
+ "langchain-google-genai==2.1.9",
77
+ "langchain-groq==0.3.7",
78
+ "langchain-huggingface>=0.3.1",
79
+ "langchain-openai==0.3.29",
80
+ "langchain-pinecone==0.2.11",
81
+ "langchain-tests==0.3.20",
82
+ "langchain-text-splitters==0.3.9",
83
+ "langextract>=1.0.8",
84
+ "langsmith==0.4.13",
85
+ "lark==1.1.9",
86
+ "looseversion==1.3.0",
87
+ "lxml==6.0.0",
88
+ "markdown-it-py==4.0.0",
89
+ "marshmallow==3.26.1",
90
+ "matplotlib-inline==0.1.7",
91
+ "mdurl==0.1.2",
92
+ "msoffcrypto-tool==5.4.2",
93
+ "multidict==6.6.4",
94
+ "mypy-extensions==1.1.0",
95
+ "nest-asyncio==1.6.0",
96
+ "networkx==3.5",
97
+ "nibabel==5.3.2",
98
+ "nipype==1.10.0",
99
+ "numpy==2.3.2",
100
+ "olefile==0.47",
101
+ "oletools==0.60.2",
102
+ "openai==1.99.7",
103
+ "orjson==3.11.1",
104
+ "packaging==24.2",
105
+ "pandas==2.3.1",
106
+ "parso==0.8.4",
107
+ "pathlib==1.0.1",
108
+ "pcodedmp==1.2.6",
109
+ "pillow==11.3.0",
110
+ "pinecone==7.3.0",
111
+ "pinecone-plugin-assistant==1.7.0",
112
+ "pinecone-plugin-interface==0.0.7",
113
+ "platformdirs==4.3.8",
114
+ "pluggy==1.6.0",
115
+ "prompt-toolkit==3.0.51",
116
+ "propcache==0.3.2",
117
+ "proto-plus==1.26.1",
118
+ "protobuf==6.31.1",
119
+ "prov==2.1.1",
120
+ "psutil==7.0.0",
121
+ "pure-eval==0.2.3",
122
+ "puremagic==1.30",
123
+ "py-cpuinfo==9.0.0",
124
+ "pyasn1==0.6.1",
125
+ "pyasn1-modules==0.4.2",
126
+ "pycparser==2.22",
127
+ "pydantic==2.11.7",
128
+ "pydantic-core==2.33.2",
129
+ "pydantic-settings==2.10.1",
130
+ "pydot==4.0.1",
131
+ "pygments==2.19.2",
132
+ "pymupdf==1.26.3",
133
+ "pyparsing==3.2.3",
134
+ "pypdf>=6.0.0",
135
+ "pytest==8.4.1",
136
+ "pytest-asyncio==0.26.0",
137
+ "pytest-benchmark==5.1.0",
138
+ "pytest-codspeed==4.0.0",
139
+ "pytest-recording==0.13.4",
140
+ "pytest-socket==0.7.0",
141
+ "python-dateutil==2.9.0.post0",
142
+ "python-docx>=1.2.0",
143
+ "python-dotenv==1.1.1",
144
+ "python-magic>=0.4.27",
145
+ "python-magic-bin>=0.4.14",
146
+ "python-multipart>=0.0.20",
147
+ "pytz==2025.2",
148
+ "pyxnat==1.6.3",
149
+ "pyyaml==6.0.2",
150
+ "pyzmq==27.0.1",
151
+ "rdflib==7.1.4",
152
+ "red-black-tree-mod==1.22",
153
+ "regex==2025.7.34",
154
+ "requests==2.32.4",
155
+ "requests-toolbelt==1.0.0",
156
+ "rich==14.1.0",
157
+ "rsa==4.9.1",
158
+ "rtfde==0.1.2.1",
159
+ "scikit-learn==1.7.1",
160
+ "scipy==1.16.1",
161
+ "sentence-transformers>=5.1.0",
162
+ "simplejson==3.20.1",
163
+ "six==1.17.0",
164
+ "sniffio==1.3.1",
165
+ "soupsieve==2.7",
166
+ "sqlalchemy==2.0.42",
167
+ "stack-data==0.6.3",
168
+ "starlette==0.47.2",
169
+ "streamlit>=1.49.1",
170
+ "syrupy==4.9.1",
171
+ "tenacity==9.1.2",
172
+ "threadpoolctl==3.6.0",
173
+ "tiktoken==0.11.0",
174
+ "tornado==6.5.2",
175
+ "tqdm==4.67.1",
176
+ "traitlets==5.14.3",
177
+ "traits==7.0.2",
178
+ "typing-extensions==4.14.1",
179
+ "typing-inspect==0.9.0",
180
+ "typing-inspection==0.4.1",
181
+ "tzdata==2025.2",
182
+ "tzlocal==5.3.1",
183
+ "urllib3<2",
184
+ "uvicorn>=0.35.0",
185
+ "vcrpy==7.0.0",
186
+ "wcwidth==0.2.13",
187
+ "win-unicode-console==0.5",
188
+ "wrapt==1.17.2",
189
+ "yarl==1.20.1",
190
+ "zstandard==0.23.0",
191
+ ]