Shreyas094 commited on
Commit
0b862cc
·
verified ·
1 Parent(s): 5f7b9cb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +291 -76
app.py CHANGED
@@ -8,34 +8,145 @@ import os
8
  import logging
9
  import traceback
10
  from datetime import datetime
 
 
11
 
12
  # Configure logging
13
  logging.basicConfig(
14
  level=logging.DEBUG,
15
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
16
  handlers=[
17
- logging.FileHandler(f'rag_app_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
18
  logging.StreamHandler()
19
  ]
20
  )
21
  logger = logging.getLogger(__name__)
22
 
23
- class RAGApplication:
24
- def __init__(self, hf_api_key):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  try:
26
  self.hf_api_key = hf_api_key
27
- self.vector_store = None
 
 
 
28
  logger.info("Initializing HuggingFace embeddings...")
29
  self.embeddings = HuggingFaceInferenceAPIEmbeddings(
30
  api_key=hf_api_key,
31
  model_name="sentence-transformers/all-MiniLM-L6-v2"
32
  )
 
33
  logger.info("Initializing HuggingFace client...")
34
  self.client = InferenceClient(api_key=hf_api_key)
35
  self.conversation_history = []
36
- logger.info("RAGApplication initialized successfully")
 
 
 
 
 
37
  except Exception as e:
38
- logger.error(f"Error initializing RAGApplication: {str(e)}")
39
  logger.error(f"Traceback: {traceback.format_exc()}")
40
  raise
41
 
@@ -46,11 +157,10 @@ class RAGApplication:
46
  4. Use concise language and avoid unnecessary elaboration
47
  5. Maintain continuity with previous conversation when relevant
48
 
49
- Remember:
50
- - Keep responses to three sentences maximum
51
- - Focus only on information present in the context
52
- - If unsure, explicitly state that the information is not in the context
53
- - Ensure responses are clear and directly address the question
54
 
55
  Context: {context}
56
 
@@ -60,91 +170,197 @@ Previous conversation:
60
  Question: {question}
61
 
62
  Answer:"""
63
-
64
- def process_pdf(self, file_path):
65
  try:
66
- logger.info(f"Starting PDF processing for file: {file_path}")
67
-
68
- if file_path is None:
69
- logger.warning("No file provided")
70
- return "Please upload a PDF file."
71
 
72
- if not os.path.exists(file_path):
73
- logger.error(f"File not found at path: {file_path}")
74
- return f"File not found: {file_path}"
75
 
76
- # Reset conversation history when new PDF is loaded
77
  self.conversation_history = []
78
- logger.info("Conversation history reset")
79
-
80
- # Read PDF directly from the file path
81
- logger.info("Reading PDF file...")
82
  pdf_reader = PdfReader(file_path)
83
- text = ""
84
- for i, page in enumerate(pdf_reader.pages):
85
- try:
86
- text += page.extract_text()
87
- logger.debug(f"Extracted text from page {i+1}")
88
- except Exception as e:
89
- logger.error(f"Error extracting text from page {i+1}: {str(e)}")
90
-
91
- if not text.strip():
92
- logger.warning("No text extracted from PDF")
93
- return "No text could be extracted from the PDF. Please make sure it's not empty or scanned."
94
-
95
- # Split text into chunks
96
- logger.info("Splitting text into chunks...")
97
- text_splitter = RecursiveCharacterTextSplitter(
98
- chunk_size=10000,
99
- chunk_overlap=2000,
100
- length_function=len
101
  )
102
- chunks = text_splitter.split_text(text)
103
- logger.info(f"Created {len(chunks)} chunks")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- if not chunks:
106
- logger.warning("No chunks created from text")
107
- return "No chunks were created. The PDF might be empty."
 
108
 
109
- # Create vector store
110
- logger.info("Creating vector store...")
111
- self.vector_store = FAISS.from_texts(chunks, self.embeddings)
112
- logger.info("Vector store created successfully")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
- return "PDF processed successfully! You can now ask questions about it."
115
- except Exception as e:
116
- error_msg = f"Error processing PDF: {str(e)}"
117
- logger.error(error_msg)
118
- logger.error(f"Traceback: {traceback.format_exc()}")
119
- return error_msg
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
- def generate_response(self, message, history):
122
  try:
123
  logger.info(f"Generating response for message: {message}")
124
 
125
- if self.vector_store is None:
126
- logger.warning("No vector store available - PDF not processed")
127
  return "Please upload and process a PDF first."
128
 
129
  query = message.strip()
130
  if not query:
131
- logger.warning("Empty query received")
132
  return "Please enter a question."
133
 
134
- # Search for relevant chunks
135
- logger.info("Searching for relevant chunks...")
136
- relevant_chunks = self.vector_store.similarity_search(query, k=3)
137
- context = "\n\n".join([doc.page_content for doc in relevant_chunks])
138
- logger.debug(f"Found {len(relevant_chunks)} relevant chunks")
139
-
140
  # Format conversation history
141
- logger.debug(f"Processing conversation history (length: {len(history)})")
142
  conversation_history = "\n".join([
143
  f"Q: {q}\nA: {a}" for q, a in history[-3:] if q and a
144
  ])
145
 
146
- # Create prompt with system prompt, context, and conversation history
147
- logger.debug("Creating prompt...")
148
  prompt = self.system_prompt.format(
149
  context=context,
150
  conversation_history=conversation_history,
@@ -177,15 +393,14 @@ Answer:"""
177
  logger.error(f"Traceback: {traceback.format_exc()}")
178
  return error_msg
179
 
180
- # Create Gradio interface
181
  def create_gradio_interface():
182
  try:
183
  logger.info("Creating Gradio interface...")
184
  api_key = os.getenv("HF_API_KEY")
185
- rag = RAGApplication(hf_api_key=api_key)
186
 
187
  with gr.Blocks() as demo:
188
- gr.Markdown("# PDF Question Answering System")
189
 
190
  with gr.Row():
191
  pdf_input = gr.File(
@@ -209,7 +424,7 @@ def create_gradio_interface():
209
  theme="soft",
210
  examples=[
211
  "What is the main topic of this document?",
212
- "Can you summarize the key points?",
213
  "What are the main conclusions?",
214
  ],
215
  )
 
8
  import logging
9
  import traceback
10
  from datetime import datetime
11
+ from typing import List, Dict, Tuple, Any
12
+ import re
13
 
14
  # Configure logging
15
  logging.basicConfig(
16
  level=logging.DEBUG,
17
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
18
  handlers=[
19
+ logging.FileHandler(f'enhanced_rag_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'),
20
  logging.StreamHandler()
21
  ]
22
  )
23
  logger = logging.getLogger(__name__)
24
 
25
+ class TextPreprocessor:
26
+ @staticmethod
27
+ def clean_text(text: str) -> str:
28
+ """Clean and normalize text content."""
29
+ # Remove multiple spaces
30
+ text = re.sub(r'\s+', ' ', text)
31
+ # Remove multiple newlines
32
+ text = re.sub(r'\n\s*\n', '\n\n', text)
33
+ # Normalize quotes
34
+ text = re.sub(r'[\u201c\u201d]', '"', text)
35
+ # Remove header/footer artifacts
36
+ text = re.sub(r'^.*Page \d+.*$', '', text, flags=re.MULTILINE)
37
+ return text.strip()
38
+
39
+ @staticmethod
40
+ def extract_section_headers(text: str) -> List[str]:
41
+ """Extract potential section headers from text."""
42
+ # Simple header detection (can be enhanced based on document structure)
43
+ header_pattern = r'^(?:[A-Z][A-Za-z\s]{2,50}:?|(?:\d+\.){1,3}\s+[A-Z][A-Za-z\s]{2,50})$'
44
+ headers = re.findall(header_pattern, text, re.MULTILINE)
45
+ return headers
46
+
47
+ def create_page_chunks(pdf_reader: PdfReader) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
48
+ """
49
+ Creates both page-level and semantic chunks from PDF content.
50
+ """
51
+ page_chunks = []
52
+ semantic_chunks = []
53
+ preprocessor = TextPreprocessor()
54
+
55
+ # Configure text splitters
56
+ semantic_splitter = RecursiveCharacterTextSplitter(
57
+ chunk_size=1000,
58
+ chunk_overlap=200,
59
+ separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
60
+ length_function=len
61
+ )
62
+
63
+ # Sliding window parameters
64
+ window_size = 2000
65
+ window_overlap = 500
66
+
67
+ for page_num, page in enumerate(pdf_reader.pages, 1):
68
+ try:
69
+ page_text = page.extract_text()
70
+ if not page_text.strip():
71
+ continue
72
+
73
+ # Clean and preprocess text
74
+ cleaned_text = preprocessor.clean_text(page_text)
75
+ headers = preprocessor.extract_section_headers(cleaned_text)
76
+
77
+ # Store full page as a chunk
78
+ page_chunks.append({
79
+ "content": cleaned_text,
80
+ "metadata": {
81
+ "page_num": page_num,
82
+ "chunk_type": "full_page",
83
+ "section_headers": headers
84
+ }
85
+ })
86
+
87
+ # Create semantic chunks
88
+ semantic_page_chunks = semantic_splitter.split_text(cleaned_text)
89
+
90
+ # Create sliding windows for long content
91
+ if len(cleaned_text) > window_size:
92
+ start = 0
93
+ while start < len(cleaned_text):
94
+ window_text = cleaned_text[start:start + window_size]
95
+ semantic_chunks.append({
96
+ "content": window_text,
97
+ "metadata": {
98
+ "page_num": page_num,
99
+ "chunk_type": "sliding_window",
100
+ "window_start": start,
101
+ "section_headers": headers
102
+ }
103
+ })
104
+ start += (window_size - window_overlap)
105
+
106
+ # Add regular semantic chunks
107
+ for chunk_num, chunk in enumerate(semantic_page_chunks):
108
+ semantic_chunks.append({
109
+ "content": chunk,
110
+ "metadata": {
111
+ "page_num": page_num,
112
+ "chunk_num": chunk_num,
113
+ "chunk_type": "semantic",
114
+ "total_chunks": len(semantic_page_chunks),
115
+ "section_headers": headers
116
+ }
117
+ })
118
+
119
+ except Exception as e:
120
+ logger.error(f"Error processing page {page_num}: {str(e)}")
121
+ continue
122
+
123
+ return page_chunks, semantic_chunks
124
+
125
+ class EnhancedRAGApplication:
126
+ def __init__(self, hf_api_key: str):
127
  try:
128
  self.hf_api_key = hf_api_key
129
+ self.page_store = None
130
+ self.semantic_store = None
131
+ self.sliding_store = None
132
+
133
  logger.info("Initializing HuggingFace embeddings...")
134
  self.embeddings = HuggingFaceInferenceAPIEmbeddings(
135
  api_key=hf_api_key,
136
  model_name="sentence-transformers/all-MiniLM-L6-v2"
137
  )
138
+
139
  logger.info("Initializing HuggingFace client...")
140
  self.client = InferenceClient(api_key=hf_api_key)
141
  self.conversation_history = []
142
+
143
+ # Initialize cache
144
+ self.chunk_cache = {}
145
+ self.query_cache = {}
146
+
147
+ logger.info("EnhancedRAGApplication initialized successfully")
148
  except Exception as e:
149
+ logger.error(f"Error initializing EnhancedRAGApplication: {str(e)}")
150
  logger.error(f"Traceback: {traceback.format_exc()}")
151
  raise
152
 
 
157
  4. Use concise language and avoid unnecessary elaboration
158
  5. Maintain continuity with previous conversation when relevant
159
 
160
+ Context structure:
161
+ - Full page chunks provide complete context
162
+ - Semantic chunks provide focused information
163
+ - Sliding windows maintain context across chunk boundaries
 
164
 
165
  Context: {context}
166
 
 
170
  Question: {question}
171
 
172
  Answer:"""
173
+
174
+ def process_pdf(self, file_path: str) -> str:
175
  try:
176
+ logger.info(f"Starting enhanced PDF processing for file: {file_path}")
 
 
 
 
177
 
178
+ if file_path is None or not os.path.exists(file_path):
179
+ return "Please upload a valid PDF file."
 
180
 
181
+ # Reset conversation history and caches
182
  self.conversation_history = []
183
+ self.chunk_cache = {}
184
+ self.query_cache = {}
185
+
 
186
  pdf_reader = PdfReader(file_path)
187
+
188
+ # Create chunks
189
+ page_chunks, semantic_chunks = create_page_chunks(pdf_reader)
190
+
191
+ # Create vector stores
192
+ logger.info("Creating vector stores...")
193
+ self.page_store = FAISS.from_texts(
194
+ [chunk["content"] for chunk in page_chunks],
195
+ self.embeddings,
196
+ metadatas=[chunk["metadata"] for chunk in page_chunks]
197
+ )
198
+
199
+ self.semantic_store = FAISS.from_texts(
200
+ [chunk["content"] for chunk in semantic_chunks if chunk["metadata"]["chunk_type"] == "semantic"],
201
+ self.embeddings,
202
+ metadatas=[chunk["metadata"] for chunk in semantic_chunks if chunk["metadata"]["chunk_type"] == "semantic"]
 
 
203
  )
204
+
205
+ self.sliding_store = FAISS.from_texts(
206
+ [chunk["content"] for chunk in semantic_chunks if chunk["metadata"]["chunk_type"] == "sliding_window"],
207
+ self.embeddings,
208
+ metadatas=[chunk["metadata"] for chunk in semantic_chunks if chunk["metadata"]["chunk_type"] == "sliding_window"]
209
+ )
210
+
211
+ logger.info("Vector stores created successfully")
212
+ return "PDF processed successfully with enhanced chunking!"
213
+
214
+ except Exception as e:
215
+ logger.error(f"Error in enhanced PDF processing: {str(e)}")
216
+ return f"Error processing PDF: {str(e)}"
217
+
218
+ def mmr_reranking(self, results: List[Dict], lambda_param: float = 0.5, num_results: int = 3) -> List[Dict]:
219
+ """
220
+ Rerank results using Maximum Marginal Relevance to ensure diversity.
221
+ """
222
+ if len(results) <= num_results:
223
+ return results
224
+
225
+ selected = [results[0]] # Start with highest scored result
226
+ remaining = results[1:]
227
+
228
+ while len(selected) < num_results and remaining:
229
+ max_mmr_score = -1
230
+ max_mmr_idx = -1
231
+
232
+ for i, result in enumerate(remaining):
233
+ # Calculate similarity term
234
+ similarity_score = result["score"]
235
+
236
+ # Calculate diversity term
237
+ diversity_scores = [1 - self._calculate_similarity(result["content"], s["content"])
238
+ for s in selected]
239
+ diversity_score = min(diversity_scores)
240
+
241
+ # Calculate MMR score
242
+ mmr_score = lambda_param * similarity_score + (1 - lambda_param) * diversity_score
243
+
244
+ if mmr_score > max_mmr_score:
245
+ max_mmr_score = mmr_score
246
+ max_mmr_idx = i
247
 
248
+ if max_mmr_idx != -1:
249
+ selected.append(remaining.pop(max_mmr_idx))
250
+ else:
251
+ break
252
 
253
+ return selected
254
+
255
+ def _calculate_similarity(self, text1: str, text2: str) -> float:
256
+ """
257
+ Calculate similarity between two texts using embeddings.
258
+ """
259
+ try:
260
+ emb1 = self.embeddings.embed_query(text1)
261
+ emb2 = self.embeddings.embed_query(text2)
262
+ return sum(a * b for a, b in zip(emb1, emb2))
263
+ except:
264
+ return 0
265
+
266
+ def hybrid_retrieval(self, query: str, k_semantic: int = 3, k_pages: int = 1) -> str:
267
+ """
268
+ Performs hybrid retrieval using semantic, page-level, and sliding window chunks.
269
+ """
270
+ # Check query cache
271
+ cache_key = f"{query}_{k_semantic}_{k_pages}"
272
+ if cache_key in self.query_cache:
273
+ return self.query_cache[cache_key]
274
+
275
+ results = []
276
+
277
+ # Get relevant semantic chunks
278
+ semantic_results = self.semantic_store.similarity_search_with_score(
279
+ query, k=k_semantic
280
+ )
281
+
282
+ # Get relevant full pages
283
+ page_results = self.page_store.similarity_search_with_score(
284
+ query, k=k_pages
285
+ )
286
+
287
+ # Get relevant sliding windows
288
+ sliding_results = self.sliding_store.similarity_search_with_score(
289
+ query, k=k_semantic
290
+ )
291
+
292
+ # Combine all results
293
+ all_results = []
294
+
295
+ for doc, score in semantic_results:
296
+ all_results.append({
297
+ "content": doc.page_content,
298
+ "metadata": doc.metadata,
299
+ "score": score,
300
+ "type": "semantic"
301
+ })
302
 
303
+ for doc, score in page_results:
304
+ all_results.append({
305
+ "content": doc.page_content,
306
+ "metadata": doc.metadata,
307
+ "score": score,
308
+ "type": "page"
309
+ })
310
+
311
+ for doc, score in sliding_results:
312
+ all_results.append({
313
+ "content": doc.page_content,
314
+ "metadata": doc.metadata,
315
+ "score": score,
316
+ "type": "sliding_window"
317
+ })
318
+
319
+ # Apply MMR reranking
320
+ reranked_results = self.mmr_reranking(all_results)
321
+
322
+ # Combine context while preserving document structure
323
+ context = []
324
+ for result in reranked_results:
325
+ context_str = f"[Page {result['metadata']['page_num']}"
326
+
327
+ if result['type'] == "semantic":
328
+ context_str += f", Chunk {result['metadata']['chunk_num']}"
329
+ elif result['type'] == "sliding_window":
330
+ context_str += f", Window {result['metadata']['window_start']}"
331
+
332
+ if result['metadata'].get('section_headers'):
333
+ context_str += f", Section: {result['metadata']['section_headers'][0]}"
334
+
335
+ context_str += f"]: {result['content']}"
336
+ context.append(context_str)
337
+
338
+ final_context = "\n\n".join(context)
339
+
340
+ # Cache the result
341
+ self.query_cache[cache_key] = final_context
342
+ return final_context
343
 
344
+ def generate_response(self, message: str, history: List[Tuple[str, str]]) -> str:
345
  try:
346
  logger.info(f"Generating response for message: {message}")
347
 
348
+ if not any([self.page_store, self.semantic_store, self.sliding_store]):
 
349
  return "Please upload and process a PDF first."
350
 
351
  query = message.strip()
352
  if not query:
 
353
  return "Please enter a question."
354
 
355
+ # Get relevant context using hybrid retrieval
356
+ context = self.hybrid_retrieval(query)
357
+
 
 
 
358
  # Format conversation history
 
359
  conversation_history = "\n".join([
360
  f"Q: {q}\nA: {a}" for q, a in history[-3:] if q and a
361
  ])
362
 
363
+ # Create prompt
 
364
  prompt = self.system_prompt.format(
365
  context=context,
366
  conversation_history=conversation_history,
 
393
  logger.error(f"Traceback: {traceback.format_exc()}")
394
  return error_msg
395
 
 
396
  def create_gradio_interface():
397
  try:
398
  logger.info("Creating Gradio interface...")
399
  api_key = os.getenv("HF_API_KEY")
400
+ rag = EnhancedRAGApplication(hf_api_key=api_key)
401
 
402
  with gr.Blocks() as demo:
403
+ gr.Markdown("# Enhanced PDF Question Answering System")
404
 
405
  with gr.Row():
406
  pdf_input = gr.File(
 
424
  theme="soft",
425
  examples=[
426
  "What is the main topic of this document?",
427
+ "Can you summarize the key points?",
428
  "What are the main conclusions?",
429
  ],
430
  )