MaheshLEO4 commited on
Commit
b7e0e53
·
0 Parent(s):

Initial commit for DocChat

Browse files
.env ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # LLM Configuration
2
+ LLM_PROVIDER=google # google or openai
3
+
4
+ # API Keys
5
+ GOOGLE_API_KEY="REDACTED_LEAKED_KEY_ROTATE_IMMEDIATELY"  # SECURITY: a real API key was committed here — revoke/rotate it and supply it via environment secrets; never commit .env files
6
+ OPENAI_API_KEY="your_openai_api_key_here"
7
+
8
+ # Database Settings
9
+ CHROMA_DB_PATH=./chroma_db
10
+
11
+ # Retrieval Settings
12
+ VECTOR_SEARCH_K=10
13
+
14
+ # Cache Settings
15
+ CACHE_EXPIRE_DAYS=7
agents/__init.py__ ADDED (NOTE: misnamed — Python packages require "__init__.py"; as committed, "from agents import ..." will fail; the same typo recurs in config/ and document_processor/)
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from .research_agent import ResearchAgent
2
+ from .verification_agent import VerificationAgent
3
+ from .workflow import AgentWorkflow
4
+
5
+ __all__ = ["ResearchAgent", "VerificationAgent", "AgentWorkflow"]
agents/relevance_checker.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List
2
+ from langchain.schema import BaseRetriever
3
+ from langchain.prompts import ChatPromptTemplate
4
+ from langchain_core.output_parsers import StrOutputParser
5
+ from config.llm_config import llm_config
6
+ import logging
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
class RelevanceChecker:
    """Classifies how well retrieved document chunks cover a user question.

    Produces one of three labels — CAN_ANSWER, PARTIAL, NO_MATCH — which the
    workflow uses as a gate before running the research agent.
    """

    def __init__(self):
        """Initialize the relevance checker with configurable LLM."""
        logger.info("Initializing RelevanceChecker...")

        # Get LLM from configuration (low-temperature "relevance" profile)
        self.llm = llm_config.create_llm("relevance")

        # Create prompt template
        self.prompt_template = ChatPromptTemplate.from_messages([
            ("system", """You are an AI relevance checker between a user's question and provided document content.

Instructions:
- Classify how well the document content addresses the user's question.
- Respond with ONLY ONE of the following labels: CAN_ANSWER, PARTIAL, NO_MATCH.
- Do not include any additional text or explanation.

Label Definitions:
1) "CAN_ANSWER": The passages contain enough explicit information to fully answer the question.
2) "PARTIAL": The passages mention or discuss the question's topic but do not provide all the details needed for a complete answer.
3) "NO_MATCH": The passages do not discuss or mention the question's topic at all.

Important: If the passages mention or reference the topic or timeframe of the question in any way, even if incomplete, respond with "PARTIAL" instead of "NO_MATCH"."""),
            ("human", """Question: {question}

Passages:
{passages}

Respond ONLY with one of the following labels: CAN_ANSWER, PARTIAL, NO_MATCH""")
        ])

        # Create chain: prompt -> LLM -> plain-string output
        self.chain = self.prompt_template | self.llm | StrOutputParser()

        logger.info("RelevanceChecker initialized successfully.")

    def check(self, question: str, retriever: BaseRetriever, k: int = 3) -> str:
        """
        Check relevance between question and retrieved documents.

        Args:
            question: The user's question.
            retriever: Retriever queried for candidate chunks.
            k: Maximum number of retrieved chunks to include in the prompt.

        Returns:
            "CAN_ANSWER", "PARTIAL", or "NO_MATCH". Any retrieval or LLM
            failure degrades to "NO_MATCH" rather than raising.
        """
        logger.debug(f"RelevanceChecker.check called with question='{question}' and k={k}")

        # Retrieve document chunks
        try:
            top_docs = retriever.invoke(question)
        except Exception as e:
            logger.error(f"Error retrieving documents: {e}")
            return "NO_MATCH"

        if not top_docs:
            logger.debug("No documents returned from retriever.")
            return "NO_MATCH"

        # Combine the top k chunk texts
        document_content = "\n\n".join(doc.page_content for doc in top_docs[:k])
        logger.debug(f"Combined document content length: {len(document_content)} characters")

        try:
            # Get classification from LLM
            response = self.chain.invoke({
                "question": question,
                "passages": document_content
            })

            # Normalize, then validate. FIX: models sometimes wrap the label
            # ("Label: CAN_ANSWER.") despite the instructions, so exact
            # equality alone misclassified valid answers as NO_MATCH —
            # fall back to searching the response for a known label.
            raw = response.strip().upper()
            valid_labels = ("CAN_ANSWER", "PARTIAL", "NO_MATCH")

            if raw in valid_labels:
                classification = raw
            else:
                classification = next((label for label in valid_labels if label in raw), None)
                if classification is None:
                    logger.warning(f"Invalid classification received: '{raw}'. Defaulting to NO_MATCH.")
                    classification = "NO_MATCH"

            logger.debug(f"Classification: {classification}")
            return classification

        except Exception as e:
            logger.error(f"Error during relevance classification: {e}")
            return "NO_MATCH"
agents/research_agent.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List
2
+ from langchain.schema import Document
3
+ from langchain.prompts import ChatPromptTemplate
4
+ from langchain_core.output_parsers import StrOutputParser
5
+ from config.llm_config import llm_config
6
+ import logging
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
class ResearchAgent:
    """Drafts an answer to a question using only retrieved document context."""

    def __init__(self):
        """
        Initialize the research agent with configurable LLM.
        """
        logger.info("Initializing ResearchAgent...")

        # Get LLM from configuration ("research" profile)
        self.llm = llm_config.create_llm("research")
        # NOTE(review): this direct client is never used inside this class —
        # confirm no external caller relies on `agent.client` before removing.
        self.client = llm_config.create_direct_client()

        # Create prompt template: the system message constrains the model to
        # the supplied context only (no outside knowledge).
        self.prompt_template = ChatPromptTemplate.from_messages([
            ("system", """You are an AI assistant designed to provide precise and factual answers based on the given context.

Instructions:
- Answer the following question using only the provided context.
- Be clear, concise, and factual.
- Return as much information as you can get from the context.
- If the context doesn't contain enough information, say so explicitly.
- Do not add any information not present in the context.
- Format your answer in a clear, readable manner."""),
            ("human", """Question: {question}

Context:
{context}

Provide your answer below:""")
        ])

        # Create chain: prompt -> LLM -> plain-string output
        self.chain = self.prompt_template | self.llm | StrOutputParser()

        logger.info("ResearchAgent initialized successfully.")

    def generate(self, question: str, documents: List[Document]) -> Dict:
        """
        Generate an initial answer using the provided documents.

        Args:
            question: The user's question.
            documents: Retrieved chunks; their page_content is concatenated
                into the prompt context.

        Returns:
            Dict with "draft_answer" (stripped LLM output, or an apology
            string embedding the error on failure — this method never raises)
            and "context_used" (the concatenated context).
        """
        logger.info(f"ResearchAgent.generate called with question='{question}' and {len(documents)} documents.")

        # Combine the document contents
        context = "\n\n".join([doc.page_content for doc in documents])
        logger.debug(f"Combined context length: {len(context)} characters.")

        try:
            # Generate answer using LangChain chain
            draft_answer = self.chain.invoke({
                "question": question,
                "context": context
            })

            logger.info(f"Generated answer successfully. Length: {len(draft_answer)} characters.")

            return {
                "draft_answer": draft_answer.strip(),
                "context_used": context
            }

        except Exception as e:
            logger.error(f"Error during answer generation: {e}")
            return {
                "draft_answer": f"I cannot answer this question based on the provided documents. Error: {str(e)}",
                "context_used": context
            }
agents/verification_agent.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List
2
+ from langchain.schema import Document
3
+ from langchain.prompts import ChatPromptTemplate
4
+ from langchain_core.output_parsers import StrOutputParser
5
+ from config.llm_config import llm_config
6
+ import logging
7
+ import json
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
class VerificationAgent:
    """Checks a drafted answer against the source context and emits a
    markdown verification report (parsed from the LLM's JSON verdict)."""

    def __init__(self):
        """
        Initialize the verification agent with configurable LLM.
        """
        logger.info("Initializing VerificationAgent...")

        # Get LLM from configuration (deterministic "verification" profile)
        self.llm = llm_config.create_llm("verification")

        # Create prompt template for verification.
        # FIX: the literal JSON example must use doubled braces ({{ }}) —
        # ChatPromptTemplate treats single braces as template variables, so the
        # original single-brace JSON block broke prompt formatting at runtime.
        self.prompt_template = ChatPromptTemplate.from_messages([
            ("system", """You are an AI assistant designed to verify the accuracy and relevance of answers based on provided context.

You MUST respond in the exact JSON format specified below.

Instructions:
- Verify the answer against the provided context.
- Check for:
  1. Direct/indirect factual support (YES/NO)
  2. Unsupported claims (list any if present)
  3. Contradictions (list any if present)
  4. Relevance to the question (YES/NO)
- Provide additional details or explanations where relevant.
- If there are no unsupported claims or contradictions, use empty lists.
- If there are no additional details, use an empty string.

JSON Response Format:
{{
    "supported": "YES" or "NO",
    "unsupported_claims": ["claim1", "claim2", ...],
    "contradictions": ["contradiction1", "contradiction2", ...],
    "relevant": "YES" or "NO",
    "additional_details": "string"
}}"""),
            ("human", """Answer to verify: {answer}

Context:
{context}

Provide your verification in the specified JSON format:""")
        ])

        # Create chain: prompt -> LLM -> plain-string output
        self.chain = self.prompt_template | self.llm | StrOutputParser()

        logger.info("VerificationAgent initialized successfully.")

    @staticmethod
    def _extract_json(response: str) -> str:
        """Return the JSON object embedded in an LLM response.

        Models frequently wrap JSON in markdown code fences or surround it
        with prose; take the span from the first '{' to the last '}'. Falls
        back to the raw response when no brace pair is found (json.loads then
        fails and the caller's error path applies).
        """
        start = response.find("{")
        end = response.rfind("}")
        if start != -1 and end > start:
            return response[start:end + 1]
        return response

    def format_verification_report(self, verification: Dict) -> str:
        """
        Format the verification report dictionary into a readable paragraph.

        Missing keys default to the pessimistic values ("NO", empty lists),
        so a partially-formed verdict still renders.
        """
        supported = verification.get("supported", "NO")
        unsupported_claims = verification.get("unsupported_claims", [])
        contradictions = verification.get("contradictions", [])
        relevant = verification.get("relevant", "NO")
        additional_details = verification.get("additional_details", "")

        report = f"**Supported:** {supported}\n"

        if unsupported_claims:
            report += f"**Unsupported Claims:** {', '.join(unsupported_claims)}\n"
        else:
            report += f"**Unsupported Claims:** None\n"

        if contradictions:
            report += f"**Contradictions:** {', '.join(contradictions)}\n"
        else:
            report += f"**Contradictions:** None\n"

        report += f"**Relevant:** {relevant}\n"

        if additional_details:
            report += f"**Additional Details:** {additional_details}\n"
        else:
            report += f"**Additional Details:** None\n"

        return report

    def check(self, answer: str, documents: List[Document]) -> Dict:
        """
        Verify the answer against the provided documents.

        Args:
            answer: The draft answer to verify.
            documents: Source chunks forming the verification context.

        Returns:
            Dict with "verification_report" (markdown) and "context_used".
            Never raises: JSON-parse failures yield a pessimistic verdict and
            LLM failures yield an error-message report.
        """
        logger.info(f"VerificationAgent.check called with answer length={len(answer)} and {len(documents)} documents.")

        # Combine all document contents
        context = "\n\n".join([doc.page_content for doc in documents])
        logger.debug(f"Combined context length: {len(context)} characters.")

        try:
            # Get verification from LLM
            response = self.chain.invoke({
                "answer": answer,
                "context": context
            })

            # Parse JSON response. FIX: strip code fences / surrounding prose
            # first — raw json.loads rejected fenced responses.
            try:
                verification = json.loads(self._extract_json(response))
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse JSON response: {e}")
                verification = {
                    "supported": "NO",
                    "unsupported_claims": [],
                    "contradictions": [],
                    "relevant": "NO",
                    "additional_details": "Failed to parse verification response."
                }

            # Format report
            verification_report = self.format_verification_report(verification)
            logger.info("Verification completed successfully.")

            return {
                "verification_report": verification_report,
                "context_used": context
            }

        except Exception as e:
            logger.error(f"Error during verification: {e}")
            return {
                "verification_report": f"**Error during verification:** {str(e)}",
                "context_used": context
            }
agents/workflow.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langgraph.graph import StateGraph, END
2
+ from typing import TypedDict, List, Dict
3
+ from .research_agent import ResearchAgent
4
+ from .verification_agent import VerificationAgent
5
+ from .relevance_checker import RelevanceChecker
6
+ from langchain.schema import Document
7
+ from langchain.retrievers import EnsembleRetriever
8
+ import logging
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
class AgentState(TypedDict):
    # Shared state flowing between LangGraph nodes.
    question: str                  # user's query
    documents: List[Document]      # chunks retrieved up-front in full_pipeline
    draft_answer: str              # written by the research node (or the NO_MATCH short-circuit)
    verification_report: str       # written by the verify node
    is_relevant: bool              # set by the relevance-check node
    retriever: EnsembleRetriever   # hybrid retriever; relevance check re-queries it
19
+
20
class AgentWorkflow:
    """Orchestrates relevance-check -> research -> verification as a compiled
    LangGraph state machine."""

    def __init__(self):
        self.researcher = ResearchAgent()
        self.verifier = VerificationAgent()
        self.relevance_checker = RelevanceChecker()
        # Compile once during initialization; each question only pays for .invoke().
        self.compiled_workflow = self.build_workflow()

    def build_workflow(self):
        """Create and compile the multi-agent workflow graph."""
        workflow = StateGraph(AgentState)

        # Nodes
        workflow.add_node("check_relevance", self._check_relevance_step)
        workflow.add_node("research", self._research_step)
        workflow.add_node("verify", self._verification_step)

        # Edges: relevance gate first, then research -> verify, with a
        # verification-driven retry loop back into research.
        workflow.set_entry_point("check_relevance")
        workflow.add_conditional_edges(
            "check_relevance",
            self._decide_after_relevance_check,
            {
                "relevant": "research",
                "irrelevant": END
            }
        )
        workflow.add_edge("research", "verify")
        # NOTE(review): "re_research" re-runs research with the SAME documents,
        # so a persistently failing verification cycles until LangGraph's
        # recursion limit aborts the run — consider a retry counter in state.
        workflow.add_conditional_edges(
            "verify",
            self._decide_next_step,
            {
                "re_research": "research",
                "end": END
            }
        )
        return workflow.compile()

    def _check_relevance_step(self, state: AgentState) -> Dict:
        """Classify whether the indexed corpus can address the question."""
        classification = self.relevance_checker.check(
            question=state["question"],
            retriever=state["retriever"],
            k=20
        )

        # Full or partial coverage — either way we proceed to research.
        if classification in ("CAN_ANSWER", "PARTIAL"):
            return {"is_relevant": True}

        # NO_MATCH: short-circuit the workflow with a canned answer.
        return {
            "is_relevant": False,
            "draft_answer": "This question isn't related (or there's no data) for your query. Please ask another question relevant to the uploaded document(s)."
        }

    def _decide_after_relevance_check(self, state: AgentState) -> str:
        """Route to research when relevant, otherwise end the workflow."""
        decision = "relevant" if state["is_relevant"] else "irrelevant"
        # Consistency fix: use the module logger instead of bare print().
        logger.debug(f"_decide_after_relevance_check -> {decision}")
        return decision

    def full_pipeline(self, question: str, retriever: EnsembleRetriever):
        """Run the compiled workflow for one question.

        Args:
            question: The user's question.
            retriever: Hybrid retriever over the uploaded documents.

        Returns:
            Dict with "draft_answer" and "verification_report".

        Raises:
            Re-raises any failure from retrieval or graph execution after logging.
        """
        try:
            logger.debug(f"Starting full_pipeline with question='{question}'")
            documents = retriever.invoke(question)
            logger.info(f"Retrieved {len(documents)} relevant documents (from .invoke)")

            initial_state = AgentState(
                question=question,
                documents=documents,
                draft_answer="",
                verification_report="",
                is_relevant=False,
                retriever=retriever
            )

            final_state = self.compiled_workflow.invoke(initial_state)

            return {
                "draft_answer": final_state["draft_answer"],
                "verification_report": final_state["verification_report"]
            }
        except Exception as e:
            logger.error(f"Workflow execution failed: {e}")
            raise

    def _research_step(self, state: AgentState) -> Dict:
        """Generate a draft answer from the retrieved documents."""
        logger.debug(f"Entered _research_step with question='{state['question']}'")
        result = self.researcher.generate(state["question"], state["documents"])
        logger.debug("Researcher returned draft answer.")
        return {"draft_answer": result["draft_answer"]}

    def _verification_step(self, state: AgentState) -> Dict:
        """Verify the draft answer against the retrieved documents."""
        logger.debug("Entered _verification_step. Verifying the draft answer...")
        result = self.verifier.check(state["draft_answer"], state["documents"])
        logger.debug("VerificationAgent returned a verification report.")
        return {"verification_report": result["verification_report"]}

    def _decide_next_step(self, state: AgentState) -> str:
        """Loop back to research when verification rejects the draft.

        BUG FIX: the report is markdown produced by
        VerificationAgent.format_verification_report (e.g. "**Supported:** NO"),
        so the previous bare "Supported: NO" substring could never match and
        the re-research branch was unreachable.
        """
        verification_report = state["verification_report"]
        logger.debug(f"_decide_next_step with verification_report='{verification_report}'")
        if "**Supported:** NO" in verification_report or "**Relevant:** NO" in verification_report:
            logger.info("Verification indicates re-research needed.")
            return "re_research"
        logger.info("Verification successful, ending workflow.")
        return "end"
app.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import hashlib
3
+ from typing import List, Dict
4
+ import os
5
+
6
+ from document_processor.file_handler import DocumentProcessor
7
+ from retriever.builder import RetrieverBuilder
8
+ from agents.workflow import AgentWorkflow
9
+ from config import constants, settings
10
+ from utils.logging import logger
11
+
12
+ # -------------------------
13
+ # Example Data
14
+ # -------------------------
15
+ EXAMPLES = {
16
+ "Google 2024 Environmental Report": {
17
+ "question": "Retrieve the data center PUE efficiency values in Singapore 2nd facility in 2019 and 2022. Also retrieve regional average CFE in Asia pacific in 2023",
18
+ "file_paths": ["examples/google-2024-environmental-report.pdf"]
19
+ },
20
+ "DeepSeek-R1 Technical Report": {
21
+ "question": "Summarize DeepSeek-R1 model's performance evaluation on all coding tasks against OpenAI o1-mini model",
22
+ "file_paths": ["examples/DeepSeek Technical Report.pdf"]
23
+ }
24
+ }
25
+
26
+ # -------------------------
27
+ # Utils
28
+ # -------------------------
29
def _get_file_hashes(uploaded_files: List) -> frozenset:
    """Generate SHA-256 hashes for uploaded files.

    Args:
        uploaded_files: Gradio file objects; each exposes its temp path as `.name`.

    Returns:
        frozenset of hex digests — order-independent and hashable, so it can be
        compared against the cached set to detect document changes.
    """
    hashes = set()
    for file in uploaded_files:
        digest = hashlib.sha256()
        # Hash in chunks so large uploads never need to fit in memory at once
        # (the original read each whole file into RAM).
        with open(file.name, "rb") as f:
            while chunk := f.read(1 << 20):
                digest.update(chunk)
        hashes.add(digest.hexdigest())
    return frozenset(hashes)
36
+
37
+ # -------------------------
38
+ # Main App
39
+ # -------------------------
40
def main():
    """Build and launch the DocChat Gradio app.

    Wires together document processing, hybrid retrieval, and the multi-agent
    workflow behind a two-column UI (inputs left, answer/verification right).
    """
    # Heavy objects are created once per process and shared by all requests.
    processor = DocumentProcessor()
    retriever_builder = RetrieverBuilder()
    workflow = AgentWorkflow()

    # -------------------------
    # Custom CSS
    # -------------------------
    css = """
    .title {
        font-size: 1.5em !important;
        text-align: center !important;
        color: #FFD700;
    }
    .subtitle {
        font-size: 1em !important;
        text-align: center !important;
        color: #FFD700;
    }
    .text {
        text-align: center;
    }
    """

    # -------------------------
    # Gradio UI
    # -------------------------
    with gr.Blocks(theme=gr.themes.Citrus(), title="DocChat 🐥", css=css) as demo:
        gr.Markdown("## DocChat: powered by Docling 🐥 and LangGraph", elem_classes="subtitle")
        gr.Markdown("# How it works ✨:", elem_classes="title")
        gr.Markdown("📤 Upload your document(s), enter your query then hit Submit 📝", elem_classes="text")
        gr.Markdown("Or you can select one of the examples from the drop-down menu, select Load Example then hit Submit 📝", elem_classes="text")
        gr.Markdown("⚠️ **Note:** DocChat only accepts documents in these formats: '.pdf', '.docx', '.txt', '.md'", elem_classes="text")

        # Per-session state: file hashes identify the current document set so
        # the retriever is rebuilt only when the uploads actually change.
        session_state = gr.State({
            "file_hashes": frozenset(),
            "retriever": None
        })

        # -------------------------
        # Layout
        # -------------------------
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Example 📂")
                example_dropdown = gr.Dropdown(
                    label="Select an Example 🐥",
                    choices=list(EXAMPLES.keys()),
                    value=None
                )
                load_example_btn = gr.Button("Load Example 🛠️")
                files = gr.Files(label="📄 Upload Documents", file_types=constants.ALLOWED_TYPES)
                question = gr.Textbox(label="❓ Question", lines=3)
                submit_btn = gr.Button("Submit 🚀")

            with gr.Column():
                answer_output = gr.Textbox(label="🐥 Answer", interactive=False)
                verification_output = gr.Textbox(label="✅ Verification Report")

        # -------------------------
        # Load Example Function
        # -------------------------
        def load_example(example_key: str):
            """Populate the file and question widgets from a bundled example.

            Returns ([], "") for an unknown key; silently drops (and logs)
            example files missing from disk.
            """
            if not example_key or example_key not in EXAMPLES:
                return [], ""
            ex_data = EXAMPLES[example_key]
            file_paths = ex_data["file_paths"]
            question_text = ex_data["question"]

            loaded_files = []
            for path in file_paths:
                if os.path.exists(path):
                    loaded_files.append(path)
                else:
                    logger.warning(f"File not found: {path}")

            return loaded_files, question_text

        load_example_btn.click(
            fn=load_example,
            inputs=[example_dropdown],
            outputs=[files, question]
        )

        # -------------------------
        # Process Question
        # -------------------------
        def process_question(question_text: str, uploaded_files: List, state: Dict):
            """Validate inputs, (re)build the retriever if the documents
            changed, run the agent workflow, and return
            (answer, verification_report, updated_state).

            Errors are caught and surfaced in the answer box rather than raised.
            """
            try:
                if not question_text.strip():
                    raise ValueError("❌ Question cannot be empty")
                if not uploaded_files:
                    raise ValueError("❌ No documents uploaded")

                current_hashes = _get_file_hashes(uploaded_files)

                # Rebuild the retriever only when the uploaded set changed.
                if state["retriever"] is None or current_hashes != state["file_hashes"]:
                    logger.info("Processing new/changed documents...")
                    chunks = processor.process(uploaded_files)
                    retriever = retriever_builder.build_hybrid_retriever(chunks)
                    state.update({
                        "file_hashes": current_hashes,
                        "retriever": retriever
                    })

                result = workflow.full_pipeline(
                    question=question_text,
                    retriever=state["retriever"]
                )
                return result["draft_answer"], result["verification_report"], state

            except Exception as e:
                logger.error(f"Processing error: {str(e)}")
                return f"❌ Error: {str(e)}", "", state

        submit_btn.click(
            fn=process_question,
            inputs=[question, files, session_state],
            outputs=[answer_output, verification_output, session_state]
        )

    # -------------------------
    # Hugging Face launch (no local args)
    # -------------------------
    demo.launch()
166
+
167
+ # -------------------------
168
+ # Run App
169
+ # -------------------------
170
+ if __name__ == "__main__":
171
+ main()
config/__init.py__ ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .settings import settings
2
+ from .constants import MAX_FILE_SIZE, MAX_TOTAL_SIZE, ALLOWED_TYPES
3
+
4
+ __all__ = ["settings", "MAX_FILE_SIZE", "MAX_TOTAL_SIZE", "ALLOWED_TYPES"]
config/__pycache__/llm_config.cpython-313.pyc ADDED
Binary file (6.21 kB). View file
 
config/constants.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
# Maximum allowed size for a single file (50 MB)
MAX_FILE_SIZE: int = 50 * 1024 * 1024

# Maximum allowed total size for all uploaded files (200 MB)
MAX_TOTAL_SIZE: int = 200 * 1024 * 1024

# Allowed file types for upload (lowercase extensions, including the dot)
ALLOWED_TYPES: list[str] = [".txt", ".pdf", ".docx", ".md"]
config/llm_config.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LLM Configuration Manager
3
+ Centralizes all LLM model configurations for easy switching
4
+ """
5
+ from typing import Dict, Any, Optional
6
+ from enum import Enum
7
+ import os
8
+ from google import genai
9
+ from google.genai import types
10
+ from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
11
+ import logging
12
+ from dotenv import load_dotenv
13
+ load_dotenv()
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
class ModelProvider(Enum):
    """Supported LLM providers."""
    GOOGLE = "google"
    OPENAI = "openai"
    ANTHROPIC = "anthropic"

class LLMConfig:
    """Configuration manager for LLM models.

    Maps logical tasks ("research", "verification", "relevance", "embedding")
    to provider-specific model names and default sampling parameters.
    """

    # Model configurations for different tasks, per provider.
    # NOTE(review): ANTHROPIC is declared in ModelProvider but has no entry
    # here, so selecting it fails _validate_config — confirm whether that
    # provider is planned or should be dropped.
    MODELS = {
        ModelProvider.GOOGLE: {
            "research": "gemini-1.5-pro",
            "verification": "gemini-1.5-flash",
            "relevance": "gemini-1.5-flash",
            "embedding": "text-embedding-004",
        },
        ModelProvider.OPENAI: {
            "research": "gpt-4-turbo",
            "verification": "gpt-4-turbo",
            "relevance": "gpt-4-turbo",
            "embedding": "text-embedding-3-large",
        }
    }

    # Default sampling parameters for each task.
    DEFAULT_PARAMS = {
        "research": {
            "temperature": 0.3,
            "max_tokens": 300,
            "top_p": 0.95,
        },
        "verification": {
            "temperature": 0.0,
            "max_tokens": 200,
            "top_p": 0.9,
        },
        "relevance": {
            "temperature": 0.0,
            "max_tokens": 10,
            "top_p": 0.9,
        }
    }

    def __init__(self, provider: Optional[ModelProvider] = None):
        """
        Initialize LLM configuration.

        Args:
            provider: Model provider to use. When omitted, the provider is
                resolved from the LLM_PROVIDER environment variable (as set in
                .env), falling back to Google — previously the env setting was
                silently ignored and Google was always used.

        Raises:
            ValueError: if the provider's API key env var is missing or the
                provider has no model table.
        """
        if provider is None:
            provider = self._provider_from_env()
        self.provider = provider
        self.api_key = self._get_api_key()
        self._validate_config()

    @staticmethod
    def _provider_from_env() -> ModelProvider:
        """Resolve the provider from the LLM_PROVIDER env var (default: google)."""
        # Tolerate inline comments such as "google # google or openai" in .env.
        raw = os.getenv("LLM_PROVIDER", "google").split("#")[0].strip().lower()
        try:
            return ModelProvider(raw)
        except ValueError:
            logger.warning(f"Unknown LLM_PROVIDER '{raw}'; defaulting to google.")
            return ModelProvider.GOOGLE

    def _get_api_key(self) -> str:
        """Get API key for the configured provider from the environment."""
        if self.provider == ModelProvider.GOOGLE:
            key = os.getenv("GOOGLE_API_KEY")
            if not key:
                raise ValueError("GOOGLE_API_KEY environment variable is required")
            return key
        elif self.provider == ModelProvider.OPENAI:
            key = os.getenv("OPENAI_API_KEY")
            if not key:
                raise ValueError("OPENAI_API_KEY environment variable is required")
            return key
        else:
            raise ValueError(f"Unsupported provider: {self.provider}")

    def _validate_config(self):
        """Validate that the provider has a model table configured."""
        if self.provider not in self.MODELS:
            raise ValueError(f"Provider {self.provider} not configured")

    def get_model_name(self, task: str) -> str:
        """Get the model name for a specific task.

        Raises:
            ValueError: if the task has no model configured for this provider.
        """
        if task not in self.MODELS[self.provider]:
            raise ValueError(f"Task {task} not configured for provider {self.provider}")
        return self.MODELS[self.provider][task]

    def get_model_params(self, task: str) -> Dict[str, Any]:
        """Get a copy of the default parameters for a task ({} if unknown)."""
        # .copy() so callers can't mutate the shared class-level defaults.
        return self.DEFAULT_PARAMS.get(task, {}).copy()

    def create_llm(self, task: str):
        """Create a chat-LLM instance for a specific task.

        Raises:
            ValueError: for providers without an implementation (OpenAI is
                declared but not yet wired up).
        """
        model_name = self.get_model_name(task)
        params = self.get_model_params(task)

        if self.provider == ModelProvider.GOOGLE:
            return ChatGoogleGenerativeAI(
                model=model_name,
                google_api_key=self.api_key,
                temperature=params.get("temperature", 0.3),
                max_tokens=params.get("max_tokens", None),
                top_p=params.get("top_p", 0.95),
            )
        elif self.provider == ModelProvider.OPENAI:
            # Would use ChatOpenAI here
            pass

        raise ValueError(f"Provider {self.provider} not implemented")

    def create_embedding(self):
        """Create an embedding-model instance.

        Raises:
            ValueError: for providers without an implementation.
        """
        if self.provider == ModelProvider.GOOGLE:
            return GoogleGenerativeAIEmbeddings(
                model="models/text-embedding-004",
                google_api_key=self.api_key
            )
        elif self.provider == ModelProvider.OPENAI:
            # Would use OpenAIEmbeddings here
            pass

        raise ValueError(f"Provider {self.provider} not implemented")

    def create_direct_client(self):
        """Create a raw provider SDK client, or None where not applicable."""
        if self.provider == ModelProvider.GOOGLE:
            client = genai.Client(api_key=self.api_key)
            return client
        return None
140
+
141
+ # Global configuration instance
142
+ llm_config = LLMConfig()
config/settings.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from pydantic_settings import BaseSettings
3
+ from .constants import MAX_FILE_SIZE, MAX_TOTAL_SIZE, ALLOWED_TYPES
4
+ import os
5
+
6
class Settings(BaseSettings):
    """Application settings loaded from the environment / .env file.

    Fields without defaults (GOOGLE_API_KEY) are required: constructing
    Settings() raises a validation error if they are absent.
    """

    # LLM Provider settings
    LLM_PROVIDER: str = "google"  # "google" or "openai"

    # API Keys
    GOOGLE_API_KEY: str  # required — no default
    OPENAI_API_KEY: str = ""

    # Optional settings with defaults (mirrored from config.constants so they
    # can still be overridden via the environment)
    MAX_FILE_SIZE: int = MAX_FILE_SIZE
    MAX_TOTAL_SIZE: int = MAX_TOTAL_SIZE
    ALLOWED_TYPES: list = ALLOWED_TYPES

    # Database settings
    CHROMA_DB_PATH: str = "./chroma_db"
    CHROMA_COLLECTION_NAME: str = "documents"

    # Retrieval settings
    VECTOR_SEARCH_K: int = 10
    # Weights for the hybrid retriever — presumably [keyword, vector];
    # TODO confirm ordering against RetrieverBuilder.build_hybrid_retriever.
    HYBRID_RETRIEVER_WEIGHTS: list = [0.4, 0.6]

    # Logging settings
    LOG_LEVEL: str = "INFO"

    # Cache settings
    CACHE_DIR: str = "document_cache"
    CACHE_EXPIRE_DAYS: int = 7

    class Config:
        # pydantic-settings (v1-style inner Config): source values from .env
        env_file = ".env"
        env_file_encoding = "utf-8"
37
+
38
+ settings = Settings()
config/test.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Test file for LLMConfig
3
+ Run: python test_llm_config.py
4
+ """
5
+
6
+ from llm_config import LLMConfig, ModelProvider
7
+
8
+ def test_basic_config():
9
+ print("🔹 Testing basic configuration...")
10
+ config = LLMConfig(provider=ModelProvider.GOOGLE)
11
+ print("Provider:", config.provider.value)
12
+ print("API Key loaded: ✅")
13
+
14
+ def test_model_names():
15
+ print("\n🔹 Testing model name resolution...")
16
+ config = LLMConfig()
17
+ print("Research model:", config.get_model_name("research"))
18
+ print("Verification model:", config.get_model_name("verification"))
19
+ print("Relevance model:", config.get_model_name("relevance"))
20
+
21
+ def test_llm_creation():
22
+ print("\n🔹 Testing LLM creation...")
23
+ config = LLMConfig()
24
+ llm = config.create_llm("research")
25
+ print("LLM instance created:", type(llm))
26
+
27
+ def test_embedding_creation():
28
+ print("\n🔹 Testing embedding creation...")
29
+ config = LLMConfig()
30
+ embedding = config.create_embedding()
31
+ print("Embedding instance created:", type(embedding))
32
+
33
+ def test_direct_client():
34
+ print("\n🔹 Testing direct Gemini client...")
35
+ config = LLMConfig()
36
+ client = config.create_direct_client()
37
+ print("Direct client created:", type(client))
38
+
39
+ if __name__ == "__main__":
40
+ try:
41
+ test_basic_config()
42
+ test_model_names()
43
+ test_llm_creation()
44
+ test_embedding_creation()
45
+ test_direct_client()
46
+ print("\n✅ ALL TESTS PASSED")
47
+ except Exception as e:
48
+ print("\n❌ TEST FAILED")
49
+ print("Error:", e)
document_processor/__init.py__ ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .file_handler import DocumentProcessor
2
+
3
+ __all__ = ["DocumentProcessor"]
document_processor/file_handler.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import hashlib
3
+ import pickle
4
+ from datetime import datetime, timedelta
5
+ from pathlib import Path
6
+ from typing import List
7
+ from docling.document_converter import DocumentConverter
8
+ from langchain_text_splitters import MarkdownHeaderTextSplitter
9
+ from config import constants
10
+ from config.settings import settings
11
+ from utils.logging import logger
12
+
13
+ class DocumentProcessor:
14
+ def __init__(self):
15
+ self.headers = [("#", "Header 1"), ("##", "Header 2")]
16
+ self.cache_dir = Path(settings.CACHE_DIR)
17
+ self.cache_dir.mkdir(parents=True, exist_ok=True)
18
+
19
+ def validate_files(self, files: List) -> None:
20
+ """Validate the total size of the uploaded files."""
21
+ total_size = sum(os.path.getsize(f.name) for f in files)
22
+ if total_size > constants.MAX_TOTAL_SIZE:
23
+ raise ValueError(f"Total size exceeds {constants.MAX_TOTAL_SIZE//1024//1024}MB limit")
24
+
25
+ def process(self, files: List) -> List:
26
+ """Process files with caching for subsequent queries"""
27
+ self.validate_files(files)
28
+ all_chunks = []
29
+ seen_hashes = set()
30
+
31
+ for file in files:
32
+ try:
33
+ # Generate content-based hash for caching
34
+ with open(file.name, "rb") as f:
35
+ file_hash = self._generate_hash(f.read())
36
+
37
+ cache_path = self.cache_dir / f"{file_hash}.pkl"
38
+
39
+ if self._is_cache_valid(cache_path):
40
+ logger.info(f"Loading from cache: {file.name}")
41
+ chunks = self._load_from_cache(cache_path)
42
+ else:
43
+ logger.info(f"Processing and caching: {file.name}")
44
+ chunks = self._process_file(file)
45
+ self._save_to_cache(chunks, cache_path)
46
+
47
+ # Deduplicate chunks across files
48
+ for chunk in chunks:
49
+ chunk_hash = self._generate_hash(chunk.page_content.encode())
50
+ if chunk_hash not in seen_hashes:
51
+ all_chunks.append(chunk)
52
+ seen_hashes.add(chunk_hash)
53
+
54
+ except Exception as e:
55
+ logger.error(f"Failed to process {file.name}: {str(e)}")
56
+ continue
57
+
58
+ logger.info(f"Total unique chunks: {len(all_chunks)}")
59
+ return all_chunks
60
+
61
+ def _process_file(self, file) -> List:
62
+ """Original processing logic with Docling"""
63
+ if not file.name.endswith(('.pdf', '.docx', '.txt', '.md')):
64
+ logger.warning(f"Skipping unsupported file type: {file.name}")
65
+ return []
66
+
67
+ converter = DocumentConverter()
68
+ markdown = converter.convert(file.name).document.export_to_markdown()
69
+ splitter = MarkdownHeaderTextSplitter(self.headers)
70
+ return splitter.split_text(markdown)
71
+
72
+ def _generate_hash(self, content: bytes) -> str:
73
+ return hashlib.sha256(content).hexdigest()
74
+
75
+ def _save_to_cache(self, chunks: List, cache_path: Path):
76
+ with open(cache_path, "wb") as f:
77
+ pickle.dump({
78
+ "timestamp": datetime.now().timestamp(),
79
+ "chunks": chunks
80
+ }, f)
81
+
82
+ def _load_from_cache(self, cache_path: Path) -> List:
83
+ with open(cache_path, "rb") as f:
84
+ data = pickle.load(f)
85
+ return data["chunks"]
86
+
87
+ def _is_cache_valid(self, cache_path: Path) -> bool:
88
+ if not cache_path.exists():
89
+ return False
90
+
91
+ cache_age = datetime.now() - datetime.fromtimestamp(cache_path.stat().st_mtime)
92
+ return cache_age < timedelta(days=settings.CACHE_EXPIRE_DAYS)
requirements.txt ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core Python
2
+ python-dotenv==1.0.1
3
+ pydantic==2.10.6
4
+ pydantic-settings==2.7.1
5
+ typing-extensions==4.12.2
6
+
7
+ # Web Framework
8
+ fastapi==0.115.7
9
+ uvicorn[standard]==0.34.0
10
+ gradio==5.13.2
11
+
12
+ # LangChain Core
13
+ langchain==0.3.16
14
+ langchain-core==0.3.32
15
+ langchain-community==0.3.16
16
+ langchain-text-splitters==0.3.5
17
+ langgraph==0.2.68
18
+
19
+ # LLM Providers
20
+ langchain-google-genai==2.1.2
21
+ google-generativeai==0.8.4
22
+ langchain-openai==0.3.2
23
+ openai==1.60.2
24
+
25
+ # Embeddings & Vector Stores
26
+ chromadb==0.6.3
27
+ langchain-chroma==0.2.4
28
+ sentence-transformers==3.0.1
29
+
30
+ # Document Processing
31
+ docling==2.15.0
32
+ pypdf==5.2.0
33
+ python-docx==1.1.2
34
+ markdown==3.6
35
+ beautifulsoup4==4.12.3
36
+ lxml==5.3.0
37
+
38
+ # Text Processing & Retrieval
39
+ rank-bm25==0.2.2
40
+ nltk==3.9.1
41
+ scikit-learn==1.6.0
42
+ numpy==1.26.4
43
+
44
+ # Caching & Hashing
45
+ cachetools==5.5.1
46
+
47
+ # Logging
48
+ loguru==0.7.3
49
+
50
+ # Utilities
51
+ python-multipart==0.0.20
52
+ aiofiles==23.2.1
53
+ pillow==10.4.0
54
+ tqdm==4.67.1
55
+ tenacity==9.0.0
56
+ backoff==2.2.1
57
+ httpx==0.28.1
58
+ requests==2.32.3
59
+ orjson==3.10.15
60
+
61
+ # Development & Testing
62
+ pytest==8.3.4
63
+ pytest-asyncio==0.23.7
64
+ black==24.10.0
65
+ isort==5.13.2
66
+ mypy==1.13.0
67
+ ruff==0.9.3
retriever/__init.py__ ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .builder import RetrieverBuilder
2
+
3
+ __all__ = ["RetrieverBuilder"]
retriever/builder.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.vectorstores import Chroma
2
+ from langchain_community.retrievers import BM25Retriever
3
+ from langchain.retrievers import EnsembleRetriever
4
+ from config.settings import settings
5
+ from config.llm_config import llm_config
6
+ import logging
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
class RetrieverBuilder:
    """Builds an ensemble retriever mixing keyword (BM25) and vector search."""

    def __init__(self):
        """Load the embedding model used for the dense side of the hybrid."""
        logger.info("Initializing RetrieverBuilder...")
        self.embeddings = llm_config.create_embedding()
        logger.info("RetrieverBuilder initialized successfully.")

    def build_hybrid_retriever(self, docs):
        """Return an EnsembleRetriever over *docs*, blending BM25 and Chroma results."""
        try:
            logger.info(f"Building hybrid retriever with {len(docs)} documents")

            # Dense side: embed the documents into a persistent Chroma collection.
            store = Chroma.from_documents(
                documents=docs,
                embedding=self.embeddings,
                persist_directory=settings.CHROMA_DB_PATH,
                collection_name=settings.CHROMA_COLLECTION_NAME,
            )
            logger.info("Vector store created successfully.")

            # Sparse side: classic keyword ranking over the same documents.
            keyword_side = BM25Retriever.from_documents(docs)
            logger.info("BM25 retriever created successfully.")

            dense_side = store.as_retriever(
                search_kwargs={"k": settings.VECTOR_SEARCH_K}
            )
            logger.info("Vector retriever created successfully.")

            # Blend both rankings using the configured weights.
            ensemble = EnsembleRetriever(
                retrievers=[keyword_side, dense_side],
                weights=settings.HYBRID_RETRIEVER_WEIGHTS,
            )
            logger.info("Hybrid retriever created successfully.")

            return ensemble

        except Exception as e:
            logger.error(f"Failed to build hybrid retriever: {e}")
            raise
utils/__init.py__ ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .logging import logger
2
+
3
+ __all__ = ["logger"]
utils/logging.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
# Central application logger (loguru). Other modules import `logger` from
# here instead of configuring the stdlib logging module themselves.
from loguru import logger

# Mirror all log records to a rotating file in the working directory.
logger.add(
    "app.log",
    rotation="10 MB",      # start a new file once the current one reaches 10 MB
    retention="30 days",   # delete rotated files older than 30 days
    format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}"
)