Hasnan Ramadhan committed on
Commit
5e3f3a0
·
1 Parent(s): 9258d67

refactor to hybrid retrieval

Browse files
Files changed (2) hide show
  1. app.py +324 -398
  2. hybrid_retriever.py +139 -0
app.py CHANGED
@@ -1,458 +1,393 @@
1
  import gradio as gr
2
- from langgraph.graph import StateGraph
3
- from typing import TypedDict
4
  from langchain_community.document_loaders import PyMuPDFLoader
5
- import requests
 
 
 
 
6
  from groq import Groq
7
  import os
8
  from dotenv import load_dotenv
9
  import tempfile
10
- from googlesearch import search
11
- from bs4 import BeautifulSoup
12
- from urllib.parse import urljoin, urlparse
13
- import re
14
-
15
-
16
 
17
  load_dotenv()
 
18
  # Check if GROQ_API_KEY is available
19
  if not os.getenv("GROQ_API_KEY"):
20
  print("Warning: GROQ_API_KEY not found in environment variables")
21
- class DocumentState(TypedDict):
22
- documents: list[dict]
23
- summaries: list[str]
24
- search_results: list[dict]
25
- search_query: str
 
 
 
 
 
26
  needs_search: bool
 
 
27
 
28
- def get_llm_response(prompt):
29
- url = "http://192.168.181.215:8081/llms"
30
- headers = {"Content-Type": "application/json"}
31
- payload = {
32
- "messages": [{"role": "user", "content": prompt}],
33
- "max_new_tokens": 2000,
34
- "do_sample": True,
35
- "temperature": 0.2,
36
- "top_k": 10,
37
- "top_p": 0.90
38
- }
39
- try:
40
- response = requests.post(url, json=payload, headers=headers)
41
- response.raise_for_status()
42
- data = response.json()
43
- return {
44
- "response": data['choices'][0]['content'],
45
- "usage": data.get('usage', {}),
46
- "generation_time": data.get('generation_time', None)
47
- }
48
- except requests.exceptions.RequestException as e:
49
- return {
50
- "response": f"Error occurred: {str(e)}",
51
- "usage": {},
52
- "generation_time": None
53
  }
 
 
 
 
 
 
54
 
55
- def get_groq_response(prompt):
56
- client = Groq(api_key=os.getenv("GROQ_API_KEY"))
57
- completion = client.chat.completions.create(
58
- model="llama-3.1-8b-instant",
59
- messages=[
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  {
61
- "role": "user",
62
- "content": prompt
 
63
  }
64
- ]
65
- )
66
- return completion.choices[0].message.content
67
-
68
- def google_search_agent(state: DocumentState) -> DocumentState:
69
- """Performs Google search and extracts content from results."""
70
- search_query = state.get('search_query')
71
- if not search_query or not isinstance(search_query, str):
72
- return state
73
-
74
- try:
75
- search_results = []
76
- # Get top 3 search results
77
- for url in search(state['search_query'], num_results=3):
78
- try:
79
- response = requests.get(url, timeout=10)
80
- response.raise_for_status()
81
-
82
- soup = BeautifulSoup(response.content, 'html.parser')
83
-
84
- # Remove script and style elements
85
- for script in soup(["script", "style"]):
86
- script.decompose()
87
-
88
- # Get text content
89
- text = soup.get_text()
90
-
91
- # Clean up text
92
- lines = (line.strip() for line in text.splitlines())
93
- chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
94
- text = ' '.join(chunk for chunk in chunks if chunk)
95
-
96
- # Limit text length
97
- if len(text) > 1000:
98
- text = text[:1000] + "..."
99
-
100
- search_results.append({
101
- 'url': url,
102
- 'content': text,
103
- 'title': soup.title.string if soup.title else "No title"
104
- })
105
- except Exception as e:
106
- print(f"Error scraping {url}: {e}")
107
- continue
108
 
109
- state['search_results'] = search_results
110
- except Exception as e:
111
- print(f"Error during search: {e}")
112
- state['search_results'] = []
113
-
114
- return state
115
-
116
- def search_analyzer_agent(state: DocumentState) -> DocumentState:
117
- """Analyzes user query to determine if web search is needed."""
118
- search_query = state.get('search_query')
119
- if not search_query or not isinstance(search_query, str):
120
- return state
121
 
122
- # Keywords that typically indicate need for current information
123
- search_indicators = [
124
- 'latest', 'recent', 'current', 'news', 'update', 'today', 'now',
125
- 'what is', 'who is', 'when did', 'where is', 'how to', 'definition',
126
- 'explain', 'information about', 'tell me about', 'research'
127
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
- query_lower = search_query.lower()
130
- state['needs_search'] = any(indicator in query_lower for indicator in search_indicators)
 
 
 
 
 
 
 
 
 
 
 
131
 
132
- return state
133
-
134
- def search_response_agent(state: DocumentState) -> DocumentState:
135
- """Generates response based on search results."""
136
- search_results = state.get('search_results')
137
- search_query = state.get('search_query')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
- if not search_results or not isinstance(search_results, list):
140
- # Fallback to regular LLM response
141
- if search_query and isinstance(search_query, str):
142
- response = get_groq_response(search_query)
143
- state['summaries'] = [response]
144
- return state
 
 
145
 
146
- # Prepare search results for LLM
147
- search_context = "\n\n".join([
148
- f"Source: {result['title']} ({result['url']})\nContent: {result['content']}"
149
- for result in search_results
150
- ])
151
 
152
- prompt = f"""Based on the following search results, provide a comprehensive and accurate answer to the user's question: "{search_query}"
153
-
154
- Search Results:
155
- {search_context}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
- Please provide a well-structured response that:
158
- 1. Answers the user's question directly
159
- 2. Cites the sources when relevant
160
- 3. Is accurate and informative
161
- 4. Is concise but comprehensive
162
 
163
- Response:"""
 
 
164
 
165
- response = get_groq_response(prompt)
166
- state['summaries'] = [response]
167
- return state
168
-
169
- def document_extractor_agent(state: DocumentState, pdf_path: str) -> DocumentState:
170
- """Extracts documents from a PDF file."""
171
  try:
 
172
  loader = PyMuPDFLoader(pdf_path)
173
  documents = loader.load()
174
- state['documents'] = [
175
- {
176
- 'content': doc.page_content,
177
- 'page': doc.metadata.get('page', 0) + 1,
178
- 'source': doc.metadata.get('source', 'Unknown')
179
- } for doc in documents
180
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  except Exception as e:
182
- print(f"Error loading PDF: {e}")
183
- state['documents'] = []
184
- return state
185
 
186
- def document_summarizer_agent(state: DocumentState) -> DocumentState:
187
- """Retrieves summaries of the documents."""
188
- truncated_docs = []
189
- for doc in state['documents']:
190
- content = doc['content'][:500]
191
- truncated_docs.append(f"Page {doc['page']}: {content}")
192
-
193
- prompt = f"""Summarize these documents in exactly 3 sentences. Include page citations (p. X).
 
 
 
 
 
194
 
195
- Documents:
196
- {chr(10).join(truncated_docs)}
 
 
 
 
 
 
 
 
197
 
198
- Write 3 sentences with page citations with only refer from the document don't add up and jump to the conclusion."""
199
-
200
- summary = get_groq_response(prompt)
201
- state['summaries'] = [summary]
202
- return state
203
 
204
- def create_document_graph():
205
- talking_documents = StateGraph(DocumentState)
206
- talking_documents.add_node('document_extractor', document_extractor_agent)
207
- talking_documents.add_node('document_summarizer', document_summarizer_agent)
208
- talking_documents.set_entry_point('document_extractor')
209
- talking_documents.add_edge('document_extractor', 'document_summarizer')
210
- return talking_documents.compile()
211
 
212
- def create_search_graph():
213
- search_workflow = StateGraph(DocumentState)
214
- search_workflow.add_node('search_analyzer', search_analyzer_agent)
215
- search_workflow.add_node('google_search', google_search_agent)
216
- search_workflow.add_node('search_response', search_response_agent)
217
- search_workflow.set_entry_point('search_analyzer')
218
-
219
- # Conditional edge based on search needs
220
- def should_search(state):
221
- return "search" if state.get('needs_search', False) else "response"
222
 
223
- search_workflow.add_conditional_edges(
224
- 'search_analyzer',
225
- should_search,
226
- {
227
- "search": "google_search",
228
- "response": "search_response"
229
- }
230
- )
231
- search_workflow.add_edge('google_search', 'search_response')
232
- return search_workflow.compile()
233
-
234
- def process_pdf_and_chat(pdf_file, message, history, system_message, max_tokens, temperature, top_p, enable_search=False):
235
  if pdf_file is None:
236
- return history + [(message, "Please upload a PDF file first.")]
237
 
238
  try:
239
- # Handle file path - in newer Gradio versions, pdf_file is already a path
240
  if isinstance(pdf_file, str):
241
- tmp_pdf_path = pdf_file
242
- cleanup_needed = False
243
  else:
244
  # For older versions where pdf_file is a file object
245
  with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
246
  tmp_file.write(pdf_file.read())
247
- tmp_pdf_path = tmp_file.name
248
- cleanup_needed = True
249
 
250
- # Check if user wants to search for additional information
251
- search_keywords = ['search', 'find more', 'additional info', 'more information', 'research']
252
- if enable_search and any(keyword in message.lower() for keyword in search_keywords):
253
- # Use search workflow for additional information
254
- search_graph = create_search_graph()
255
- search_state = {
256
- 'documents': [],
257
- 'summaries': [],
258
- 'search_results': [],
259
- 'search_query': message,
260
- 'needs_search': True
261
- }
262
-
263
- search_result = search_graph.invoke(search_state)
264
-
265
- # Also process the PDF
266
- def document_extractor_with_path(state: DocumentState) -> DocumentState:
267
- return document_extractor_agent(state, tmp_pdf_path)
268
-
269
- talking_documents = StateGraph(DocumentState)
270
- talking_documents.add_node('document_extractor', document_extractor_with_path)
271
- talking_documents.add_node('document_summarizer', document_summarizer_agent)
272
- talking_documents.set_entry_point('document_extractor')
273
- talking_documents.add_edge('document_extractor', 'document_summarizer')
274
- pdf_graph = talking_documents.compile()
275
-
276
- pdf_state = {'documents': [], 'summaries': []}
277
- pdf_result = pdf_graph.invoke(pdf_state)
278
-
279
- # Combine PDF and search results
280
- combined_response = f"**PDF Summary:**\n{pdf_result['summaries'][0] if pdf_result['summaries'] else 'No summary available'}\n\n**Additional Information from Web:**\n{search_result['summaries'][0] if search_result['summaries'] else 'No additional information found'}"
281
-
282
- response = combined_response
283
- else:
284
- # Regular PDF processing
285
- def document_extractor_with_path(state: DocumentState) -> DocumentState:
286
- return document_extractor_agent(state, tmp_pdf_path)
287
-
288
- talking_documents = StateGraph(DocumentState)
289
- talking_documents.add_node('document_extractor', document_extractor_with_path)
290
- talking_documents.add_node('document_summarizer', document_summarizer_agent)
291
- talking_documents.set_entry_point('document_extractor')
292
- talking_documents.add_edge('document_extractor', 'document_summarizer')
293
- graph = talking_documents.compile()
294
-
295
- state = {'documents': [], 'summaries': []}
296
- final_state = graph.invoke(state)
297
-
298
- if final_state['summaries']:
299
- response = final_state['summaries'][0]
300
- else:
301
- response = "Unable to process the PDF. Please check the file format."
302
 
303
- # Clean up temporary file only if we created it
304
- if cleanup_needed:
305
- os.unlink(tmp_pdf_path)
 
 
306
 
307
- return history + [(message, response)]
308
 
309
  except Exception as e:
310
- return history + [(message, f"Error processing PDF: {str(e)}")]
311
 
312
- def respond_messages(message, history, system_message, max_tokens, temperature, top_p, enable_search=False):
313
- """Enhanced chat function with optional Google search - returns just the response text"""
314
- if enable_search:
315
- # Use search workflow
316
- search_graph = create_search_graph()
317
- state = {
318
- 'documents': [],
319
- 'summaries': [],
320
- 'search_results': [],
321
- 'search_query': message,
322
- 'needs_search': False
323
- }
324
-
325
- final_state = search_graph.invoke(state)
326
-
327
- if final_state['summaries']:
328
- response = final_state['summaries'][0]
329
- else:
330
- # Fallback to regular LLM response
331
- prompt = f"{system_message}\n\nUser: {message}"
332
- response = get_groq_response(prompt)
333
- else:
334
- # Regular chat without search
335
- prompt = f"{system_message}\n\nUser: {message}"
336
- response = get_groq_response(prompt)
337
-
338
- return response
339
 
340
- def process_pdf_and_chat_messages(pdf_file, message, history, system_message, max_tokens, temperature, top_p, enable_search=False):
341
- """Enhanced PDF processing function - returns just the response text"""
 
 
342
  if pdf_file is None:
343
- return "Please upload a PDF file first."
344
 
345
  try:
346
- # Handle file path - in newer Gradio versions, pdf_file is already a path
347
  if isinstance(pdf_file, str):
348
- tmp_pdf_path = pdf_file
349
- cleanup_needed = False
350
  else:
351
- # For older versions where pdf_file is a file object
352
  with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
353
  tmp_file.write(pdf_file.read())
354
- tmp_pdf_path = tmp_file.name
355
- cleanup_needed = True
356
 
357
- # Check if user wants to search for additional information
358
- search_keywords = ['search', 'find more', 'additional info', 'more information', 'research']
359
- if enable_search and any(keyword in message.lower() for keyword in search_keywords):
360
- # Use search workflow for additional information
361
- search_graph = create_search_graph()
362
- search_state = {
363
- 'documents': [],
364
- 'summaries': [],
365
- 'search_results': [],
366
- 'search_query': message,
367
- 'needs_search': True
368
- }
369
-
370
- search_result = search_graph.invoke(search_state)
371
-
372
- # Also process the PDF
373
- def document_extractor_with_path(state: DocumentState) -> DocumentState:
374
- return document_extractor_agent(state, tmp_pdf_path)
375
-
376
- talking_documents = StateGraph(DocumentState)
377
- talking_documents.add_node('document_extractor', document_extractor_with_path)
378
- talking_documents.add_node('document_summarizer', document_summarizer_agent)
379
- talking_documents.set_entry_point('document_extractor')
380
- talking_documents.add_edge('document_extractor', 'document_summarizer')
381
- pdf_graph = talking_documents.compile()
382
-
383
- pdf_state = {'documents': [], 'summaries': []}
384
- pdf_result = pdf_graph.invoke(pdf_state)
385
-
386
- # Combine PDF and search results
387
- combined_response = f"**PDF Summary:**\n{pdf_result['summaries'][0] if pdf_result['summaries'] else 'No summary available'}\n\n**Additional Information from Web:**\n{search_result['summaries'][0] if search_result['summaries'] else 'No additional information found'}"
388
-
389
- response = combined_response
390
- else:
391
- # Regular PDF processing
392
- def document_extractor_with_path(state: DocumentState) -> DocumentState:
393
- return document_extractor_agent(state, tmp_pdf_path)
394
-
395
- talking_documents = StateGraph(DocumentState)
396
- talking_documents.add_node('document_extractor', document_extractor_with_path)
397
- talking_documents.add_node('document_summarizer', document_summarizer_agent)
398
- talking_documents.set_entry_point('document_extractor')
399
- talking_documents.add_edge('document_extractor', 'document_summarizer')
400
- graph = talking_documents.compile()
401
-
402
- state = {'documents': [], 'summaries': []}
403
- final_state = graph.invoke(state)
404
-
405
- if final_state['summaries']:
406
- response = final_state['summaries'][0]
407
- else:
408
- response = "Unable to process the PDF. Please check the file format."
409
 
410
- # Clean up temporary file only if we created it
411
- if cleanup_needed:
412
- os.unlink(tmp_pdf_path)
413
 
414
- return response
415
 
416
  except Exception as e:
417
- return f"Error processing PDF: {str(e)}"
418
-
419
- def respond(message, history, system_message, max_tokens, temperature, top_p, enable_search=False):
420
- """Enhanced chat function with optional Google search"""
421
- if enable_search:
422
- # Use search workflow
423
- search_graph = create_search_graph()
424
- state = {
425
- 'documents': [],
426
- 'summaries': [],
427
- 'search_results': [],
428
- 'search_query': message,
429
- 'needs_search': False
430
- }
431
-
432
- final_state = search_graph.invoke(state)
433
-
434
- if final_state['summaries']:
435
- response = final_state['summaries'][0]
436
- else:
437
- # Fallback to regular LLM response
438
- prompt = f"{system_message}\n\nUser: {message}"
439
- response = get_groq_response(prompt)
440
- else:
441
- # Regular chat without search
442
- prompt = f"{system_message}\n\nUser: {message}"
443
- response = get_groq_response(prompt)
444
-
445
- return history + [(message, response)]
446
 
447
  # Create the Gradio interface
448
  with gr.Blocks() as demo:
449
- gr.Markdown("# Document Summarizer with Web Search")
450
- gr.Markdown("Upload a PDF document and ask questions about it, or chat normally. Enable search for additional web information.")
451
 
452
  with gr.Row():
453
  with gr.Column(scale=1):
454
  pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
455
- enable_search = gr.Checkbox(label="Enable Google Search", value=False)
456
  system_message = gr.Textbox(
457
  value="You are a helpful assistant for summarizing and finding related information needed.",
458
  label="System message"
@@ -469,29 +404,20 @@ with gr.Blocks() as demo:
469
  def user_input(message, history):
470
  return "", history + [{"role": "user", "content": message}]
471
 
472
- def bot_response(history, pdf_file, enable_search, system_message, max_tokens, temperature, top_p):
473
  message = history[-1]["content"]
474
  if pdf_file is not None:
475
- response = process_pdf_and_chat_messages(pdf_file, message, history[:-1], system_message, max_tokens, temperature, top_p, enable_search)
476
  else:
477
- response = respond_messages(message, history[:-1], system_message, max_tokens, temperature, top_p, enable_search)
478
  return history[:-1] + [{"role": "user", "content": message}, {"role": "assistant", "content": response}]
479
 
480
- def auto_summarize_pdf(pdf_file):
481
- """Automatically summarize PDF when uploaded"""
482
- if pdf_file is None:
483
- return []
484
-
485
- # Trigger automatic summarization
486
- response = process_pdf_and_chat_messages(pdf_file, "Please provide a summary of this document", [], "You are a helpful assistant for summarizing documents.", 512, 0.7, 0.95, False)
487
- return [{"role": "assistant", "content": response}]
488
-
489
  msg.submit(user_input, [msg, chatbot], [msg, chatbot], queue=False).then(
490
- bot_response, [chatbot, pdf_upload, enable_search, system_message, max_tokens, temperature, top_p], chatbot
491
  )
492
  clear.click(lambda: None, None, chatbot, queue=False)
493
 
494
- # Auto-summarize when PDF is uploaded
495
  pdf_upload.upload(auto_summarize_pdf, [pdf_upload], [chatbot])
496
 
497
  if __name__ == "__main__":
 
1
  import gradio as gr
2
+ from langgraph.graph import StateGraph, START, END
3
+ from typing import TypedDict, List, Union, Dict, Any, Annotated
4
  from langchain_community.document_loaders import PyMuPDFLoader
5
+ from langchain_community.embeddings import HuggingFaceEmbeddings
6
+ from hybrid_retriever import build_hybrid_retriever
7
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
8
+ from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
9
+ from langchain_core.documents import Document
10
  from groq import Groq
11
  import os
12
  from dotenv import load_dotenv
13
  import tempfile
14
+ import time
15
+ import logging
16
+ from operator import add
 
 
 
17
 
18
  load_dotenv()
19
+
20
  # Check if GROQ_API_KEY is available
21
  if not os.getenv("GROQ_API_KEY"):
22
  print("Warning: GROQ_API_KEY not found in environment variables")
23
+
24
def add_messages(left, right):
    """Reducer for AgentState.messages: append *right* onto *left*.

    Used as the Annotated merge function so LangGraph concatenates message
    updates instead of replacing the list.
    """
    merged = left + right
    return merged
27
+
28
class AgentState(TypedDict):
    """Shared state passed between the LangGraph workflow nodes."""

    # Conversation transcript; add_messages merges updates by concatenation.
    messages: Annotated[List[Union[HumanMessage, AIMessage, ToolMessage]], add_messages]
    # The user's question being answered.
    query: str
    # Retrieved passages formatted as "[Doc N] ..." strings.
    documents: List[str]
    # Answer text produced by the generate (or decide) node.
    final_answer: str
    # Set by the decide node to route back into the search node.
    needs_search: bool
    # Number of retrieval attempts performed so far.
    search_count: int
    # Reserved for per-query metrics (populated elsewhere).
    metrics: Dict[str, Any]
36
 
37
class ResponseTimeTracker:
    """Accumulates timing metrics (in seconds) for the most recent query."""

    def __init__(self):
        # All timers start at zero; workflow nodes overwrite them as they run.
        self.metrics = {
            "retrieval_time": 0,
            "llm_processing_time": 0,
            "total_time": 0,
        }

    def update_retrieval_metrics(self, retrieval_metrics):
        """Merge retriever-reported metrics into the tracked dictionary."""
        self.metrics.update(retrieval_metrics)

    def get_metrics_dict(self):
        """Return the underlying metrics dictionary (not a copy)."""
        return self.metrics
50
 
51
class CustomAgentExecutor:
    """RAG agent: retrieve documents with the hybrid retriever, then answer
    with Groq, orchestrated as a small LangGraph state machine.

    Workflow: START -> search -> decide -> (search | generate | END),
    and generate -> END. The decide node re-triggers search (up to
    `max_searches` times) when retrieval came back empty.
    """

    def __init__(self, retriever):
        self.retriever = retriever
        self.groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))
        self.response_tracker = ResponseTimeTracker()
        # Cap on retrieval attempts per query; bounds the search<->decide loop
        # when nothing relevant is indexed.
        self.max_searches = 3

        # Compile the LangGraph workflow once per executor instance.
        self.workflow = self._create_workflow()

    def _create_workflow(self):
        """Build and compile the search -> decide -> generate graph."""
        workflow = StateGraph(AgentState)

        workflow.add_node("search", self._search_node)
        workflow.add_node("generate", self._generate_node)
        workflow.add_node("decide", self._decide_node)

        workflow.add_edge(START, "search")
        workflow.add_edge("search", "decide")
        workflow.add_conditional_edges(
            "decide",
            self._should_continue,
            {
                "search": "search",
                "generate": "generate",
                "end": END,
            },
        )
        workflow.add_edge("generate", END)

        return workflow.compile()

    def _search_node(self, state: AgentState) -> AgentState:
        """Retrieve documents for the current query and format them as
        "[Doc N] <content>" strings for the generation prompt.

        Records retrieval_time on the tracker even when retrieval fails.
        """
        query = state.get("query", "")
        search_count = state.get("search_count", 0)

        retrieval_start = time.time()
        try:
            # NOTE(review): get_relevant_documents() is deprecated in newer
            # LangChain in favor of retriever.invoke(); kept for
            # compatibility with the pinned dependency versions — confirm.
            docs = self.retriever.get_relevant_documents(query)
        except Exception as e:
            logging.error(f"Retrieval error: {e}")
            docs = []
        finally:
            # Single timing write covers both success and failure paths.
            self.response_tracker.metrics["retrieval_time"] = time.time() - retrieval_start

        if docs:
            formatted_docs = [
                f"[Doc {i}] {doc.page_content.strip()}"
                for i, doc in enumerate(docs, 1)
            ]
        else:
            # Sentinel recognized by _decide_node as an empty result.
            formatted_docs = ["No relevant information found in the knowledge base."]

        return {
            **state,
            "documents": formatted_docs,
            "search_count": search_count + 1,
            "needs_search": False,
        }

    def _decide_node(self, state: AgentState) -> AgentState:
        """Decide whether to search again, give up, or generate an answer.

        Note: a repeated search reuses the identical query, so with a
        deterministic retriever retries cannot surface new documents;
        max_searches mainly bounds the loop.
        """
        documents = state.get("documents", [])
        search_count = state.get("search_count", 0)

        no_results = (
            not documents
            or documents == ["No relevant information found in the knowledge base."]
        )
        if no_results:
            if search_count < self.max_searches:
                return {**state, "needs_search": True}
            # Out of attempts: short-circuit with the refusal answer.
            return {**state, "needs_search": False, "final_answer": "I don't have the knowledge."}
        return {**state, "needs_search": False}

    def _generate_node(self, state: AgentState) -> AgentState:
        """Answer the query with Groq, grounded strictly in the retrieved
        documents; records llm_processing_time on the tracker.

        On LLM failure the error text becomes the final answer (never raises).
        """
        query = state.get("query", "")
        documents = state.get("documents", [])

        doc_context = "\n\n".join(documents)
        system_prompt = (
            "You are a helpful assistant that answers questions based only on the provided documents. "
            "Each passage is tagged with a source like [Doc 1], [Doc 2], etc. "
            "When answering, cite the relevant document(s) using these tags. "
            "You are prohibited from using your past knowledge. "
            "When the answer is not directly explained in the document(s), you MUST answer with 'I don't have the knowledge'."
        )

        user_prompt = f"Context:\n{doc_context}\n\nQuestion: {query}\n\nAnswer:"

        llm_start = time.time()
        try:
            response = self.groq_client.chat.completions.create(
                model="llama-3.1-8b-instant",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
            )
            self.response_tracker.metrics["llm_processing_time"] = time.time() - llm_start

            response_content = response.choices[0].message.content

            return {
                **state,
                "final_answer": response_content,
                "messages": state.get("messages", []) + [
                    HumanMessage(content=query),
                    AIMessage(content=response_content),
                ],
            }
        except Exception as e:
            self.response_tracker.metrics["llm_processing_time"] = time.time() - llm_start
            error_msg = f"LLM generation error: {str(e)}"
            logging.error(f"LLM error: {e}", exc_info=True)
            return {
                **state,
                "final_answer": error_msg,
                "messages": state.get("messages", []) + [
                    HumanMessage(content=query),
                    AIMessage(content=error_msg),
                ],
            }

    def _should_continue(self, state: AgentState) -> str:
        """Route out of the decide node: re-search, finish, or generate."""
        if state.get("needs_search", False):
            return "search"
        if state.get("final_answer"):
            # decide already produced a terminal answer (refusal path).
            return "end"
        return "generate"

    def get_last_response_metrics(self) -> Dict[str, Any]:
        """Return the timing metrics recorded for the last query()."""
        return self.response_tracker.get_metrics_dict()

    def query(self, question: str) -> str:
        """Run the full workflow for *question* and return the answer text.

        Returns an error string (never raises) if the workflow fails.
        """
        initial_state = {
            "messages": [],
            "query": question,
            "documents": [],
            "final_answer": "",
            "needs_search": False,
            "search_count": 0,
            "metrics": {},
        }

        # Fix: reset per-stage timings so a failed or short-circuited run
        # does not report stale values left over from the previous query.
        self.response_tracker.metrics["retrieval_time"] = 0
        self.response_tracker.metrics["llm_processing_time"] = 0

        total_start = time.time()
        try:
            final_state = self.workflow.invoke(initial_state)
            return final_state.get("final_answer", "No answer generated")
        except Exception as e:
            logging.error(f"Query processing error: {e}")
            return f"Error processing query: {str(e)}"
        finally:
            # Covers both the success and error paths.
            self.response_tracker.metrics["total_time"] = time.time() - total_start
225
 
226
# Module-level singletons for the RAG system, initialized lazily once a PDF
# is uploaded (see create_vector_store). Despite its name, `vector_store`
# holds the hybrid retriever; `agent_executor` wraps it in the LangGraph
# workflow used to answer questions.
vector_store = None
agent_executor = None
 
 
229
 
230
def create_vector_store(pdf_path: str):
    """Build the hybrid (dense + BM25) retriever from a PDF and initialize
    the global agent executor.

    Side effects: rebinds the module globals `vector_store` (the hybrid
    retriever) and `agent_executor`.

    Returns True on success, False on any failure (logged, never raises).
    """
    global vector_store, agent_executor

    try:
        # Load the PDF and split it into overlapping chunks.
        loader = PyMuPDFLoader(pdf_path)
        documents = loader.load()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
        )
        chunks = text_splitter.split_documents(documents)

        # Dense embedding model for the vector side of the retriever.
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )

        # Plain texts for the sparse (BM25) side.
        texts = [doc.page_content for doc in chunks]

        # Elasticsearch connection settings are overridable via environment
        # variables; the defaults preserve the previous hard-coded values.
        hybrid_retriever = build_hybrid_retriever(
            texts=texts,
            index_name="document_index",
            embedding=embeddings,
            es_url=os.getenv("ELASTIC_URL", "http://localhost:9200"),
            es_username=os.getenv("ELASTIC_USERNAME", "elastic"),
            es_password=os.getenv("ELASTIC_PASSWORD", ""),
            top_k_dense=5,
            top_k_sparse=5,
        )

        # Index the chunks into both the dense and sparse stores.
        hybrid_retriever.add_documents(chunks)

        # NOTE: despite the name, `vector_store` holds the hybrid retriever.
        vector_store = hybrid_retriever

        agent_executor = CustomAgentExecutor(hybrid_retriever)

        return True
    except Exception as e:
        logging.error(f"Error creating vector store: {e}")
        return False
 
280
 
281
def get_groq_response(prompt):
    """Send *prompt* as a single user message to Groq and return the reply text."""
    client = Groq(api_key=os.getenv("GROQ_API_KEY"))
    chat_messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=chat_messages,
    )
    return completion.choices[0].message.content
294
 
295
def summarize_document(pdf_path: str, max_pages: int = 5, chars_per_page: int = 1000) -> str:
    """Produce a 3-sentence LLM summary of a PDF.

    Only the first *max_pages* pages, each truncated to *chars_per_page*
    characters, are sent to the model to bound prompt size. The defaults
    match the previous hard-coded behavior (5 pages x 1000 chars).

    Returns the summary text, or an error string on failure (never raises).
    """
    try:
        loader = PyMuPDFLoader(pdf_path)
        documents = loader.load()

        # Bounded excerpt of the document used as the prompt context.
        full_text = "\n\n".join(
            doc.page_content[:chars_per_page] for doc in documents[:max_pages]
        )

        prompt = f"""Summarize the following document in exactly 3 sentences. Include page references where relevant.

Document content:
{full_text}

Write 3 sentences that capture the main points of the document."""

        return get_groq_response(prompt)
    except Exception as e:
        return f"Error summarizing document: {str(e)}"
 
 
314
 
315
def process_pdf_and_chat_messages(pdf_file, message, history, system_message, max_tokens, temperature, top_p):
    """Answer *message* about the uploaded PDF via the RAG agent.

    `history`, `system_message`, `max_tokens`, `temperature` and `top_p` are
    accepted for interface compatibility with the Gradio wiring but are not
    used by the RAG path.

    Returns the response text, or an error string (never raises).
    """
    global agent_executor

    if pdf_file is None:
        return "Please upload a PDF file first."

    tmp_created = False
    pdf_path = None
    try:
        # Newer Gradio passes a filesystem path; older versions a file object.
        if isinstance(pdf_file, str):
            pdf_path = pdf_file
        else:
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                tmp_file.write(pdf_file.read())
                pdf_path = tmp_file.name
            tmp_created = True

        # Lazily build the RAG index the first time a question is asked.
        # NOTE(review): a *new* PDF uploaded after the first one is not
        # re-indexed here; auto_summarize_pdf re-indexes on every upload.
        if agent_executor is None:
            success = create_vector_store(pdf_path)
            if not success:
                return "Error processing PDF for RAG system."

        if agent_executor:
            return agent_executor.query(message)
        return "RAG system not initialized. Please try uploading the PDF again."

    except Exception as e:
        return f"Error processing PDF: {str(e)}"
    finally:
        # Fix: delete the temp copy we created (previously leaked because
        # NamedTemporaryFile(delete=False) was never unlinked).
        if tmp_created and pdf_path:
            try:
                os.unlink(pdf_path)
            except OSError:
                pass
348
 
349
def respond_messages(message, history, system_message, max_tokens, temperature, top_p):
    """Plain (non-RAG) chat path: prepend the system message and ask Groq.

    `history`, `max_tokens`, `temperature` and `top_p` are accepted for
    interface compatibility but are not used here.
    """
    composed_prompt = f"{system_message}\n\nUser: {message}"
    return get_groq_response(composed_prompt)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
353
 
354
def auto_summarize_pdf(pdf_file):
    """Gradio upload handler: index the PDF for RAG and return an initial
    assistant message containing a 3-sentence summary.

    Returns a chat history in Gradio "messages" format; an empty list when
    no file was supplied. Never raises.
    """
    global agent_executor

    if pdf_file is None:
        return []

    tmp_created = False
    pdf_path = None
    try:
        # Newer Gradio passes a filesystem path; older versions a file object.
        if isinstance(pdf_file, str):
            pdf_path = pdf_file
        else:
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                tmp_file.write(pdf_file.read())
                pdf_path = tmp_file.name
            tmp_created = True

        # (Re)build the retrieval index for the freshly uploaded document.
        success = create_vector_store(pdf_path)
        if not success:
            return [{"role": "assistant", "content": "Error processing PDF for RAG system."}]

        summary = summarize_document(pdf_path)

        return [{"role": "assistant", "content": f"**Document Summary:**\n{summary}\n\n*The document has been processed and is ready for questions using RAG system.*"}]

    except Exception as e:
        return [{"role": "assistant", "content": f"Error processing PDF: {str(e)}"}]
    finally:
        # Fix: delete the temp copy we created (previously leaked because
        # NamedTemporaryFile(delete=False) was never unlinked).
        if tmp_created and pdf_path:
            try:
                os.unlink(pdf_path)
            except OSError:
                pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
382
 
383
  # Create the Gradio interface
384
  with gr.Blocks() as demo:
385
+ gr.Markdown("# Document Summarizer with RAG")
386
+ gr.Markdown("Upload a PDF document to get an automatic summary and ask questions using Retrieval-Augmented Generation (RAG).")
387
 
388
  with gr.Row():
389
  with gr.Column(scale=1):
390
  pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
 
391
  system_message = gr.Textbox(
392
  value="You are a helpful assistant for summarizing and finding related information needed.",
393
  label="System message"
 
404
def user_input(message, history):
    """Append the user's turn to the chat history and clear the textbox.

    Returns ("", updated_history) in Gradio "messages" format. Defined
    inside the gr.Blocks context in the original file.
    """
    updated = history + [{"role": "user", "content": message}]
    return "", updated
406
 
407
def bot_response(history, pdf_file, system_message, max_tokens, temperature, top_p):
    """Generate the assistant reply for the newest user turn.

    Routes to the RAG pipeline when a PDF is loaded, plain Groq chat
    otherwise, and returns the history with the new user/assistant pair
    appended. Defined inside the gr.Blocks context in the original file.
    """
    message = history[-1]["content"]
    prior = history[:-1]
    if pdf_file is None:
        response = respond_messages(message, prior, system_message, max_tokens, temperature, top_p)
    else:
        response = process_pdf_and_chat_messages(pdf_file, message, prior, system_message, max_tokens, temperature, top_p)
    return prior + [
        {"role": "user", "content": message},
        {"role": "assistant", "content": response},
    ]
414
 
 
 
 
 
 
 
 
 
 
415
  msg.submit(user_input, [msg, chatbot], [msg, chatbot], queue=False).then(
416
+ bot_response, [chatbot, pdf_upload, system_message, max_tokens, temperature, top_p], chatbot
417
  )
418
  clear.click(lambda: None, None, chatbot, queue=False)
419
 
420
+ # Auto-summarize and create vector store when PDF is uploaded
421
  pdf_upload.upload(auto_summarize_pdf, [pdf_upload], [chatbot])
422
 
423
  if __name__ == "__main__":
hybrid_retriever.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from elasticsearch import Elasticsearch
2
+ from langchain_core.documents import Document
3
+ from langchain_core.retrievers import BaseRetriever
4
+ from langchain_elasticsearch import ElasticsearchStore, BM25Strategy
5
+ from langchain_core.vectorstores import VectorStoreRetriever
6
+ from pydantic import Field
7
+ from typing import List
8
+ import logging
9
+
10
class HybridRetriever(BaseRetriever):
    """Hybrid retriever combining dense (vector) and sparse (BM25) Elasticsearch search.

    Query results from both retrievers are concatenated (dense hits first)
    and de-duplicated by exact ``page_content`` match.
    """

    # Pydantic model fields (BaseRetriever is a pydantic model).
    dense_db: ElasticsearchStore
    dense_retriever: VectorStoreRetriever
    sparse_db: ElasticsearchStore
    sparse_retriever: VectorStoreRetriever
    index_dense: str
    index_sparse: str

    top_k_dense: int = 5
    top_k_sparse: int = 5
    is_training: bool = False

    @classmethod
    def create(
        cls,
        dense_db,
        dense_retriever,
        sparse_db,
        sparse_retriever,
        index_dense,
        index_sparse,
        top_k_dense=5,
        top_k_sparse=5,
        is_training=False,
    ):
        """Alternate constructor mirroring the declared field list."""
        return cls(
            dense_db=dense_db,
            dense_retriever=dense_retriever,
            sparse_db=sparse_db,
            sparse_retriever=sparse_retriever,
            index_dense=index_dense,
            index_sparse=index_sparse,
            top_k_dense=top_k_dense,
            top_k_sparse=top_k_sparse,
            is_training=is_training,
        )

    def reset_indices(self):
        """Delete both backing Elasticsearch indices; missing indices are ignored."""
        result = self.dense_db.client.indices.delete(
            index=self.index_dense,
            ignore_unavailable=True,
            allow_no_indices=True,
        )
        logging.info("dense_db delete: %s", result.get("acknowledged"))

        result = self.sparse_db.client.indices.delete(
            index=self.index_sparse,
            ignore_unavailable=True,
            allow_no_indices=True,
        )
        logging.info("sparse_db delete: %s", result.get("acknowledged"))

    def add_documents(self, documents, batch_size=25):
        """Index documents into both stores in batches of ``batch_size``.

        Documents that are not ``Document`` instances or whose
        ``page_content`` is empty/whitespace are skipped with a warning.

        Raises:
            ValueError: if no valid documents remain after filtering.
        """
        valid_docs = []
        for doc in documents:
            # Use logging instead of bare print for per-document diagnostics.
            logging.debug("[DOC] %r", getattr(doc, "page_content", doc))

            if isinstance(doc, Document) and isinstance(doc.page_content, str) and doc.page_content.strip():
                valid_docs.append(doc)
            else:
                logging.warning("Skipped invalid or empty doc: %s", doc)

        if not valid_docs:
            raise ValueError("No valid documents to add.")

        for i in range(0, len(valid_docs), batch_size):
            logging.info("Processing batch %d", i)
            dense_batch = valid_docs[i : i + batch_size]
            # NOTE(review): the sparse side receives bare texts, so document
            # metadata is not stored in the BM25 index — confirm this is intended.
            sparse_batch = [doc.page_content for doc in dense_batch]

            self.dense_db.add_documents(dense_batch)
            self.sparse_db.add_texts(sparse_batch)

    def _get_relevant_documents(self, query: str, *, run_manager=None) -> List[Document]:
        """Hook invoked by ``BaseRetriever.invoke()``; merges dense and sparse hits.

        Previously only the deprecated public ``get_relevant_documents`` was
        overridden, so ``retriever.invoke(query)`` bypassed the hybrid merge.
        """
        dense_docs = self.dense_retriever.invoke(query)
        sparse_docs = self.sparse_retriever.invoke(query)

        logging.debug("dense hits: %d, sparse hits: %d", len(dense_docs), len(sparse_docs))

        # De-duplicate by exact page_content, preserving order (dense first).
        seen = set()
        unique_docs = []
        for doc in dense_docs + sparse_docs:
            if doc.page_content not in seen:
                seen.add(doc.page_content)
                unique_docs.append(doc)
        return unique_docs

    def get_relevant_documents(self, query: str) -> List[Document]:
        """Backward-compatible public entry point; delegates to the merge hook."""
        return self._get_relevant_documents(query)
98
+
99
def get_elasticsearch_client(url, username=None, password=None):
    """Create an Elasticsearch client.

    Basic authentication is enabled only when both ``username`` and
    ``password`` are provided; otherwise an anonymous client is returned.
    """
    if not (username and password):
        return Elasticsearch(url)
    return Elasticsearch(url, basic_auth=(username, password))
103
+
104
def build_hybrid_retriever(texts, index_name, embedding, es_url, es_username, es_password,
                           top_k_dense=5, top_k_sparse=5):
    """Wire up a HybridRetriever backed by one dense and one sparse ES index.

    NOTE(review): the ``texts`` parameter is accepted but never used here —
    documents appear to be ingested later via ``HybridRetriever.add_documents``;
    confirm with callers before removing or wiring it in.
    """
    es_client = get_elasticsearch_client(es_url, es_username, es_password)

    # Dense (vector) side.
    idx_dense = f"{index_name}_dense"
    store_dense = ElasticsearchStore(
        index_name=idx_dense,
        embedding=embedding,
        es_connection=es_client,
    )

    # Sparse (BM25) side. from_texts([]) is used only to select the BM25
    # strategy for the index — presumably it creates an empty index; verify.
    idx_sparse = f"{index_name}_sparse"
    store_sparse = ElasticsearchStore.from_texts(
        texts=[],
        embedding=embedding,
        index_name=idx_sparse,
        es_connection=es_client,
        strategy=BM25Strategy()
    )

    return HybridRetriever.create(
        dense_db=store_dense,
        dense_retriever=store_dense.as_retriever(search_kwargs={"k": top_k_dense}),
        sparse_db=store_sparse,
        sparse_retriever=store_sparse.as_retriever(search_kwargs={"k": top_k_sparse}),
        index_dense=idx_dense,
        index_sparse=idx_sparse,
        top_k_dense=top_k_dense,
        top_k_sparse=top_k_sparse
    )