Spaces:

Clocksp
/

Insurance-AI

Running

App Files Files Community

Clocksp committed on Jan 17

Commit

97052b8

verified ·

1 Parent(s): 565d0c9

Upload 3 files

Browse files

Files changed (3) hide show

src/utils/pdf_processor.py +202 -0
src/utils/rag_chain.py +305 -0
src/utils/vector_store.py +305 -0

src/utils/pdf_processor.py ADDED Viewed

	@@ -0,0 +1,202 @@

+import os
+from typing import List, Dict
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_classic.text_splitter import RecursiveCharacterTextSplitter
+from langchain_classic.schema import Document
+from config import Config
+import re
+class PDFProcessor:
+    """Handles PDF loading, parsing, and chunking for insurance documents"""
+    def __init__(self):
+        self.chunking_config = Config.get_chunking_config()
+        self.text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=self.chunking_config["chunk_size"],
+            chunk_overlap=self.chunking_config["chunk_overlap"],
+            separators=self.chunking_config["separators"],
+            length_function=len,
+        )
+    def load_pdf(self, file_path: str) -> List[Document]:
+        """
+        Load PDF file and extract text
+        Args:
+            file_path: Path to the PDF file
+        Returns:
+            List of Document objects with page content and metadata
+        """
+        try:
+            loader = PyPDFLoader(file_path)
+            documents = loader.load()
+            # Add source filename to metadata
+            filename = os.path.basename(file_path)
+            for doc in documents:
+                doc.metadata["source_file"] = filename
+                doc.metadata["total_pages"] = len(documents)
+            print(f"Loaded {len(documents)} pages from {filename}")
+            return documents
+        except Exception as e:
+            print(f"Error loading PDF {file_path}: {str(e)}")
+            raise
+    def extract_metadata(self, documents: List[Document]) -> Dict:
+        """
+        Extract useful metadata from insurance documents
+        Args:
+            documents: List of Document objects
+        Returns:
+            Dictionary containing extracted metadata
+        """
+        metadata = {
+            "total_pages": len(documents),
+            "source_file": documents[0].metadata.get("source_file", "unknown"),
+            "document_type": self._identify_document_type(documents),
+        }
+        return metadata
+    def identify_document_type(self, documents: List[Document]) -> str:
+        """
+        Attempt to identify the type of insurance document
+        Args:
+            documents: List of Document objects
+        Returns:
+            String indicating document type
+        """
+        # Combine first few pages to identify document type
+        sample_text = " ".join([doc.page_content for doc in documents[:3]]).lower()
+        # Common insurance document keywords
+        if "policy schedule" in sample_text or "policy document" in sample_text:
+            return "policy_document"
+        elif "proposal form" in sample_text:
+            return "proposal_form"
+        elif "claim" in sample_text:
+            return "claim_form"
+        elif "endorsement" in sample_text:
+            return "endorsement"
+        elif "add-on" in sample_text or "rider" in sample_text:
+            return "addon_coverage"
+        else:
+            return "general_insurance"
+    def clean_text(self, text: str) -> str:
+        """
+        Clean and normalize text from PDF
+        Args:
+            text: Raw text from PDF
+        Returns:
+            Cleaned text
+        """
+        # Remove excessive whitespace
+        text = " ".join(text.split())
+        text = re.sub(r'\bPage\s+\d+\s+of\s+\d+\b', '', text, flags=re.IGNORECASE)
+        text = re.sub(r'\bPage\s+\d+/\d+\b', '', text, flags=re.IGNORECASE)
+        text = re.sub(r'^\d+$', '', text, flags=re.MULTILINE)
+        return text.strip()
+    def chunk_documents(self, documents: List[Document]) -> List[Document]:
+        """
+        Split documents into chunks optimized for RAG retrieval
+        Args:
+            documents: List of Document objects
+        Returns:
+            List of chunked Document objects with enhanced metadata
+        """
+        # Clean text in all documents
+        for doc in documents:
+            doc.page_content = self.clean_text(doc.page_content)
+        # Split documents into chunks
+        chunks = self.text_splitter.split_documents(documents)
+        # Enhance metadata for each chunk
+        for i, chunk in enumerate(chunks):
+            chunk.metadata["chunk_id"] = i
+            chunk.metadata["chunk_size"] = len(chunk.page_content)
+            # Add context hints based on content
+            content_lower = chunk.page_content.lower()
+            # Identify important sections
+            if any(keyword in content_lower for keyword in ["exclusion", "not covered", "does not cover"]):
+                chunk.metadata["section_type"] = "exclusions"
+            elif any(keyword in content_lower for keyword in ["coverage", "covered", "insured"]):
+                chunk.metadata["section_type"] = "coverage"
+            elif any(keyword in content_lower for keyword in ["premium", "cost", "price"]):
+                chunk.metadata["section_type"] = "pricing"
+            elif any(keyword in content_lower for keyword in ["add-on", "rider", "optional"]):
+                chunk.metadata["section_type"] = "addons"
+            elif any(keyword in content_lower for keyword in ["claim", "settlement"]):
+                chunk.metadata["section_type"] = "claims"
+            else:
+                chunk.metadata["section_type"] = "general"
+        print(f"Created {len(chunks)} chunks from {len(documents)} pages")
+        return chunks
+    def process_pdf(self, file_path: str) -> tuple[List[Document], Dict]:
+        """
+        Complete pipeline: Load, extract metadata, and chunk a PDF
+        Args:
+            file_path: Path to the PDF file
+        Returns:
+            Tuple of (chunks, metadata)
+        """
+        # Load PDF
+        documents = self.load_pdf(file_path)
+        # Extract metadata
+        metadata = self.extract_metadata(documents)
+        # Chunk documents
+        chunks = self.chunk_documents(documents)
+        return chunks, metadata
+    def process_multiple_pdfs(self, file_paths: List[str]) -> tuple[List[Document], List[Dict]]:
+        """
+        Process multiple PDF files
+        Args:
+            file_paths: List of paths to PDF files
+        Returns:
+            Tuple of (all_chunks, all_metadata)
+        """
+        all_chunks = []
+        all_metadata = []
+        for file_path in file_paths:
+            try:
+                chunks, metadata = self.process_pdf(file_path)
+                all_chunks.extend(chunks)
+                all_metadata.append(metadata)
+            except Exception as e:
+                print(f"✗ Failed to process {file_path}: {str(e)}")
+                continue
+        print(f"\n Processed {len(file_paths)} PDFs")
+        print(f"Total chunks created: {len(all_chunks)}")
+        return all_chunks, all_metadata

src/utils/rag_chain.py ADDED Viewed

	@@ -0,0 +1,305 @@

+from typing import List, Dict, Any, Optional
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain_groq import ChatGroq
+from langchain_classic.chains import RetrievalQA
+from langchain_classic.prompts import PromptTemplate
+from langchain_classic.schema import Document
+from langchain_classic.callbacks.base import BaseCallbackHandler
+from utils.vector_store import VectorStoreManager
+from config import Config
+class StreamHandler(BaseCallbackHandler):
+    """Callback handler for streaming responses"""
+    def __init__(self):
+        self.text = ""
+    def on_llm_new_token(self, token: str, **kwargs) -> None:
+        """Handle new token from LLM"""
+        self.text += token
+        print(token, end="", flush=True)
+class InsuranceRAGChain:
+    """RAG chain for insurance document Q&A"""
+    def __init__(self, vector_store_manager: Optional[VectorStoreManager] = None):
+        """
+        Initialize RAG chain
+        Args:
+            vector_store_manager: Optional VectorStoreManager instance
+        """
+        # Initialize vector store manager
+        self.vs_manager = vector_store_manager or VectorStoreManager()
+        # Initialize Gemini model
+        self.llm = ChatGoogleGenerativeAI(
+            model=Config.GEMINI_MODEL,
+            google_api_key=Config.GEMINI_API_KEY,
+            temperature=Config.GEMINI_TEMPERATURE,
+            max_output_tokens=Config.GEMINI_MAX_OUTPUT_TOKENS,
+        )
+        # Create prompt template
+        self.prompt_template = PromptTemplate(
+            template=Config.RAG_PROMPT_TEMPLATE,
+            input_variables=["context", "question"]
+        )
+        print("RAG chain initialized")
+    def create_qa_chain(self, chain_type: str = "stuff") -> RetrievalQA:
+        """
+        Create a RetrievalQA chain
+        Args:
+            chain_type: Type of chain ("stuff", "map_reduce", "refine")
+                       "stuff" - puts all docs in context (best for most cases)
+        Returns:
+            RetrievalQA chain
+        """
+        retriever = self.vs_manager.get_retriever()
+        qa_chain = RetrievalQA.from_chain_type(
+            llm=self.llm,
+            chain_type=chain_type,
+            retriever=retriever,
+            return_source_documents=True,
+            chain_type_kwargs={"prompt": self.prompt_template}
+        )
+        return qa_chain
+    def query(self, question: str, return_sources: bool = True) -> Dict[str, Any]:
+        """
+        Query the RAG system
+        Args:
+            question: User's question
+            return_sources: Whether to return source documents
+        Returns:
+            Dictionary with answer and optional source documents
+        """
+        try:
+            # Create QA chain
+            qa_chain = self.create_qa_chain()
+            # Run query
+            result = qa_chain.invoke({"query": question})
+            response = {
+                "answer": result["result"],
+                "question": question
+            }
+            if return_sources and "source_documents" in result:
+                response["sources"] = self._format_sources(result["source_documents"])
+                response["source_documents"] = result["source_documents"]
+            return response
+        except Exception as e:
+            print(f" Error during query: {str(e)}")
+            raise
+    def query_with_context(
+        self,
+        question: str,
+        conversation_history: Optional[List[Dict[str, str]]] = None
+    ) -> Dict[str, Any]:
+        """
+        Query with conversation context
+        Args:
+            question: User's question
+            conversation_history: List of previous Q&A pairs
+        Returns:
+            Dictionary with answer and sources
+        """
+        # Build contextualized question if history exists
+        if conversation_history and len(conversation_history) > 0:
+            context = "\n".join([
+                f"Previous Q: {item['question']}\nPrevious A: {item['answer']}"
+                for item in conversation_history[-3:]  # Last 3 turns
+            ])
+            contextualized_question = f"Conversation context:\n{context}\n\nCurrent question: {question}"
+        else:
+            contextualized_question = question
+        return self.query(contextualized_question, return_sources=True)
+    def query_specific_section(
+        self,
+        question: str,
+        section_type: str
+    ) -> Dict[str, Any]:
+        """
+        Query a specific section type (exclusions, addons, coverage, etc.)
+        Args:
+            question: User's question
+            section_type: Section to search in
+        Returns:
+            Dictionary with answer and sources
+        """
+        try:
+            # Get relevant documents from specific section
+            docs = self.vs_manager.search_by_section_type(
+                query=question,
+                section_type=section_type,
+                k=5
+            )
+            if not docs:
+                return {
+                    "answer": f"No relevant information found in {section_type} section.",
+                    "question": question,
+                    "sources": []
+                }
+            # Build context from retrieved documents
+            context = "\n\n".join([doc.page_content for doc in docs])
+            # Format prompt
+            prompt = self.prompt_template.format(
+                context=context,
+                question=question
+            )
+            # Get response from LLM
+            response = self.llm.invoke(prompt)
+            return {
+                "answer": response.content,
+                "question": question,
+                "sources": self._format_sources(docs),
+                "source_documents": docs
+            }
+        except Exception as e:
+            print(f"Error querying specific section: {str(e)}")
+            raise
+    def compare_addons(self, addon_names: List[str]) -> Dict[str, Any]:
+        """
+        Compare multiple add-ons
+        Args:
+            addon_names: List of add-on names to compare
+        Returns:
+            Dictionary with comparison and sources
+        """
+        question = f"Compare the following add-ons and explain their key differences, coverage, and benefits: {', '.join(addon_names)}"
+        return self.query_specific_section(question, section_type="addons")
+    def find_coverage_gaps(self, current_coverage_description: str) -> Dict[str, Any]:
+        """
+        Identify potential coverage gaps
+        Args:
+            current_coverage_description: Description of current coverage
+        Returns:
+            Dictionary with gap analysis and recommendations
+        """
+        question = f"""Based on this current coverage: {current_coverage_description}
+        Please identify:
+        1. What scenarios or risks are NOT covered
+        2. What add-ons or riders could fill these gaps
+        3. Which gaps are most important to address"""
+        return self.query(question, return_sources=True)
+    def explain_terms(self, terms: List[str]) -> Dict[str, Any]:
+        """
+        Explain insurance terms in plain language
+        Args:
+            terms: List of insurance terms to explain
+        Returns:
+            Dictionary with explanations
+        """
+        question = f"Explain these insurance terms in simple language: {', '.join(terms)}"
+        return self.query(question, return_sources=True)
+    def format_sources(self, documents: List[Document]) -> List[Dict[str, Any]]:
+        """
+        Format source documents for display
+        Args:
+            documents: List of source documents
+        Returns:
+            List of formatted source information
+        """
+        sources = []
+        for i, doc in enumerate(documents, 1):
+            source_info = {
+                "index": i,
+                "source_file": doc.metadata.get("source_file", "Unknown"),
+                "page": doc.metadata.get("page", "Unknown"),
+                "section_type": doc.metadata.get("section_type", "general"),
+                "content_preview": doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content
+            }
+            sources.append(source_info)
+        return sources
+    def stream_query(self, question: str) -> tuple[str, List[Dict[str, Any]]]:
+        """
+        Query with streaming response
+        Args:
+            question: User's question
+        Returns:
+            Tuple of (answer, sources)
+        """
+        try:
+            # Get relevant documents using invoke method
+            retriever = self.vs_manager.get_retriever()
+            docs = retriever.invoke(question)
+            if not docs:
+                return "No relevant information found in the documents.", []
+            # Build context
+            context = "\n\n".join([doc.page_content for doc in docs])
+            # Format prompt
+            prompt = self.prompt_template.format(
+                context=context,
+                question=question
+            )
+            # Stream response
+            print("\n Assistant: ", end="")
+            stream_handler = StreamHandler()
+            streaming_llm = ChatGoogleGenerativeAI(
+                model=Config.GEMINI_MODEL,
+                google_api_key=Config.GEMINI_API_KEY,
+                temperature=Config.GEMINI_TEMPERATURE,
+                streaming=True,
+                callbacks=[stream_handler]
+            )
+            streaming_llm.invoke(prompt)
+            print("\n")
+            return stream_handler.text, self._format_sources(docs)
+        except Exception as e:
+            print(f" Error during streaming query: {str(e)}")
+            raise

src/utils/vector_store.py ADDED Viewed

	@@ -0,0 +1,305 @@

+from typing import List, Optional, Dict, Any
+from langchain_classic.schema import Document
+from langchain_google_genai import GoogleGenerativeAIEmbeddings
+from langchain_qdrant import QdrantVectorStore
+from qdrant_client import QdrantClient
+from qdrant_client.models import Distance, VectorParams, PointStruct
+from config import Config
+import uuid
+class VectorStoreManager:
+    """Manages Qdrant vector store operations for insurance documents"""
+    def __init__(self):
+        """Initialize Qdrant client and embeddings"""
+        # Validate configuration
+        Config.validate_config()
+        # Get configuration
+        self.qdrant_config = Config.get_qdrant_config()
+        self.retrieval_config = Config.get_retrieval_config()
+        # Initialize Qdrant client
+        self.client = QdrantClient(
+            url=self.qdrant_config["url"],
+            api_key=self.qdrant_config["api_key"],
+        )
+        # Initialize embeddings
+        self.embeddings = GoogleGenerativeAIEmbeddings(
+            model=Config.EMBEDDING_MODEL,
+            google_api_key=Config.GEMINI_API_KEY
+        )
+        self.collection_name = self.qdrant_config["collection_name"]
+        print("Vector store manager initialized")
+    def create_collection(self, recreate: bool = False) -> bool:
+        """
+        Create a new collection in Qdrant
+        Args:
+            recreate: If True, delete existing collection and create new one
+        Returns:
+            Boolean indicating success
+        """
+        try:
+            # Check if collection exists
+            collections = self.client.get_collections().collections
+            collection_exists = any(c.name == self.collection_name for c in collections)
+            if collection_exists:
+                if recreate:
+                    print(f"⚠ Deleting existing collection: {self.collection_name}")
+                    self.client.delete_collection(self.collection_name)
+                else:
+                    print(f" Collection '{self.collection_name}' already exists")
+                    return True
+            # Create new collection
+            self.client.create_collection(
+                collection_name=self.collection_name,
+                vectors_config=VectorParams(
+                    size=self.qdrant_config["vector_size"],
+                    distance=Distance.COSINE
+                )
+            )
+            print(f" Created collection: {self.collection_name}")
+            return True
+        except Exception as e:
+            print(f" Error creating collection: {str(e)}")
+            raise
+    def add_documents(self, documents: List[Document], batch_size: int = 100) -> List[str]:
+        """
+        Add documents to Qdrant vector store
+        Args:
+            documents: List of Document objects to add
+            batch_size: Number of documents to process in each batch
+        Returns:
+            List of document IDs
+        """
+        try:
+            print(f"Adding {len(documents)} documents to vector store...")
+            # Ensure collection exists
+            self.create_collection(recreate=False)
+            # Initialize vector store
+            vector_store = QdrantVectorStore(
+                client=self.client,
+                collection_name=self.collection_name,
+                embedding=self.embeddings
+            )
+            # Add documents in batches
+            all_ids = []
+            for i in range(0, len(documents), batch_size):
+                batch = documents[i:i + batch_size]
+                # Generate unique IDs for this batch
+                batch_ids = [str(uuid.uuid4()) for _ in batch]
+                # Add to vector store
+                vector_store.add_documents(documents=batch, ids=batch_ids)
+                all_ids.extend(batch_ids)
+                print(f"   Processed batch {i//batch_size + 1}/{(len(documents)-1)//batch_size + 1}")
+            print(f" Successfully added {len(documents)} documents")
+            return all_ids
+        except Exception as e:
+            print(f" Error adding documents: {str(e)}")
+            raise
+    def similarity_search(
+        self,
+        query: str,
+        k: Optional[int] = None,
+        filter_dict: Optional[Dict[str, Any]] = None
+    ) -> List[Document]:
+        """
+        Search for similar documents using semantic similarity
+        Args:
+            query: Search query string
+            k: Number of results to return (default from config)
+            filter_dict: Optional metadata filters (e.g., {"section_type": "exclusions"})
+        Returns:
+            List of most similar Documents
+        """
+        try:
+            if k is None:
+                k = self.retrieval_config["top_k"]
+            # Initialize vector store for querying
+            vector_store = QdrantVectorStore(
+                client=self.client,
+                collection_name=self.collection_name,
+                embedding=self.embeddings
+            )
+            if filter_dict:
+                # Get more results than needed
+                results = vector_store.similarity_search(query=query, k=k*3)
+                # Filter by metadata
+                filtered_results = []
+                for doc in results:
+                    match = True
+                    for key, value in filter_dict.items():
+                        if doc.metadata.get(key) != value:
+                            match = False
+                            break
+                    if match:
+                        filtered_results.append(doc)
+                    # Stop when we have enough results
+                    if len(filtered_results) >= k:
+                        break
+                return filtered_results[:k]
+            else:
+                results = vector_store.similarity_search(query=query, k=k)
+                return results
+        except Exception as e:
+            print(f" Error during similarity search: {str(e)}")
+            raise
+    def similarity_search_with_score(
+        self,
+        query: str,
+        k: Optional[int] = None,
+        score_threshold: Optional[float] = None
+    ) -> List[tuple[Document, float]]:
+        """
+        Search with similarity scores
+        Args:
+            query: Search query string
+            k: Number of results to return
+            score_threshold: Minimum similarity score (default from config)
+        Returns:
+            List of (Document, score) tuples
+        """
+        try:
+            if k is None:
+                k = self.retrieval_config["top_k"]
+            if score_threshold is None:
+                score_threshold = self.retrieval_config["similarity_threshold"]
+            # Initialize vector store
+            vector_store = QdrantVectorStore(
+                client=self.client,
+                collection_name=self.collection_name,
+                embedding=self.embeddings
+            )
+            # Search with scores
+            results = vector_store.similarity_search_with_score(query=query, k=k)
+            # Filter by score threshold
+            filtered_results = [
+                (doc, score) for doc, score in results
+                if score >= score_threshold
+            ]
+            print(f" Found {len(filtered_results)} results above threshold {score_threshold}")
+            return filtered_results
+        except Exception as e:
+            print(f" Error during similarity search with score: {str(e)}")
+            raise
+    def search_by_section_type(
+        self,
+        query: str,
+        section_type: str,
+        k: Optional[int] = None
+    ) -> List[Document]:
+        """
+        Search within a specific section type (e.g., 'exclusions', 'addons')
+        Args:
+            query: Search query string
+            section_type: Type of section to search in
+            k: Number of results to return
+        Returns:
+            List of Documents from specified section type
+        """
+        filter_dict = {"section_type": section_type}
+        return self.similarity_search(query=query, k=k, filter_dict=filter_dict)
+    def get_collection_info(self) -> Dict:
+        """
+        Get information about the current collection
+        Returns:
+            Dictionary with collection statistics
+        """
+        try:
+            collection_info = self.client.get_collection(self.collection_name)
+            return {
+                "name": self.collection_name,
+                "vectors_count": collection_info.vectors_count,
+                "points_count": collection_info.points_count,
+                "status": collection_info.status,
+            }
+        except Exception as e:
+            print(f" Error getting collection info: {str(e)}")
+            return {}
+    def delete_collection(self) -> bool:
+        """
+        Delete the current collection
+        Returns:
+            Boolean indicating success
+        """
+        try:
+            self.client.delete_collection(self.collection_name)
+            print(f" Deleted collection: {self.collection_name}")
+            return True
+        except Exception as e:
+            print(f" Error deleting collection: {str(e)}")
+            return False
+    def get_retriever(self, **kwargs):
+        """
+        Get a LangChain retriever object for use in chains
+        Args:
+            **kwargs: Additional arguments for retriever configuration
+        Returns:
+            VectorStoreRetriever object
+        """
+        vector_store = QdrantVectorStore(
+            client=self.client,
+            collection_name=self.collection_name,
+            embedding=self.embeddings
+        )
+        # Set default search kwargs
+        search_kwargs = {
+            "k": self.retrieval_config["top_k"]
+        }
+        search_kwargs.update(kwargs)
+        return vector_store.as_retriever(search_kwargs=search_kwargs)