Update app.py
app.py CHANGED
@@ -361,33 +361,38 @@ class SmartDocumentRAG:
 
         return "Error: Could not decode file"
 
-    def enhanced_chunk_text(self, text: str) -> ...
-        """..."""
+    def enhanced_chunk_text(self, text: str, max_chunk_size: int = 300, overlap: int = 50) -> list[str]:
+        """
+        Splits text into smaller overlapping chunks for better semantic search.
+
+        Args:
+            text (str): The full text to chunk.
+            max_chunk_size (int): Maximum tokens/words per chunk.
+            overlap (int): Number of words overlapping between consecutive chunks.
+
+        Returns:
+            list[str]: List of text chunks.
+        """
+        import re
+
+        # Clean and normalize whitespace
+        text = re.sub(r'\s+', ' ', text).strip()
 
+        words = text.split()
         chunks = []
-        ...
-            chunk_sentences = sentences[i:i + chunk_size]
-            if chunk_sentences:
-                chunk_text = '. '.join(chunk_sentences) + '.'
-                chunks.append({
-                    'text': chunk_text,
-                    'sentence_indices': list(range(i, min(i + chunk_size, len(sentences)))),
-                    'doc_type': self.document_type
-                })
-
+        start = 0
+        text_len = len(words)
+
+        while start < text_len:
+            end = min(start + max_chunk_size, text_len)
+            chunk_words = words[start:end]
+            chunk = ' '.join(chunk_words)
+            chunks.append(chunk)
+            # Move start forward by chunk size minus overlap to create overlap
+            start += max_chunk_size - overlap
+
         return chunks
+
 
     def process_documents(self, files) -> str:
         """Enhanced document processing"""
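Side note on the new chunker: with the defaults (max_chunk_size=300, overlap=50) the stride is 250 words, so consecutive chunks share 50 words. A minimal standalone sketch of that arithmetic (the loop is copied out of the method so it runs without the class; the 700-word input is invented for illustration):

words = [f"w{i}" for i in range(700)]   # hypothetical 700-word document
chunks, start = [], 0
max_chunk_size, overlap = 300, 50
while start < len(words):
    chunks.append(' '.join(words[start:start + max_chunk_size]))
    start += max_chunk_size - overlap   # stride of 250 words

print(len(chunks))                                        # 3
print(chunks[0].split()[-50:] == chunks[1].split()[:50])  # True: 50-word overlap

One caveat: overlap must stay smaller than max_chunk_size, otherwise the stride is zero or negative and the loop never advances.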
@@ -451,7 +456,7 @@ class SmartDocumentRAG:
             return f"❌ Error processing documents: {str(e)}"
 
     def find_relevant_content(self, query: str, k: int = 3) -> str:
-        """Improved content retrieval"""
+        """Improved content retrieval with stricter relevance filter"""
         if not self.is_indexed:
             return ""
 
@@ -464,17 +469,19 @@ class SmartDocumentRAG:
 
             relevant_chunks = []
             for i, idx in enumerate(indices[0]):
-                if idx < len(self.documents):
+                score = scores[0][i]
+                if idx < len(self.documents) and score > 0.4:  # ✅ stricter similarity filter
                     relevant_chunks.append(self.documents[idx])
 
             return ' '.join(relevant_chunks)
-
+
         except Exception as e:
             print(f"Error in content retrieval: {e}")
             return ""
+
 
     def answer_question(self, query: str) -> str:
-        """Enhanced question answering with better model usage"""
+        """Enhanced question answering with better model usage and hallucination reduction."""
         if not query.strip():
             return "❌ Please ask a question!"
 
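One caveat on the new score > 0.4 cutoff: it assumes the index returns similarity scores where higher is better, for example a FAISS inner-product index over L2-normalized embeddings, which makes the score a cosine similarity in [-1, 1]. With an L2-distance index, lower is better and the comparison would have to be inverted. A sketch of the setup under which the threshold makes sense (FAISS and the 384-dim embedding size are assumptions; the Space's actual index construction is not shown in this hunk):

import numpy as np
import faiss  # assumed dependency

dim = 384  # e.g. the output size of all-MiniLM-L6-v2 (assumed)
embeddings = np.random.rand(10, dim).astype('float32')
faiss.normalize_L2(embeddings)     # unit vectors: inner product == cosine
index = faiss.IndexFlatIP(dim)
index.add(embeddings)

query = np.random.rand(1, dim).astype('float32')
faiss.normalize_L2(query)
scores, indices = index.search(query, 3)  # scores[0][i] in [-1, 1]; > 0.4 filters weak matches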
@@ -484,48 +491,61 @@ class SmartDocumentRAG:
         try:
             query_lower = query.lower()
 
-            # Handle summary requests
+            # Handle summary requests explicitly
             if any(word in query_lower for word in ['summary', 'summarize', 'about', 'overview']):
                 return f"📋 **Document Summary:**\n\n{self.document_summary}"
 
-            # ...
+            # Retrieve relevant content chunks via semantic search
             context = self.find_relevant_content(query, k=3)
 
             if not context:
                 return "🔍 No relevant information found. Try rephrasing your question."
 
-            # ...
+            # If no QA pipeline, fall back to direct extraction
             if self.qa_pipeline is None:
                 return self.extract_direct_answer(query, context)
 
             try:
-                if self.model_type ...
-                    # Use Q&A pipeline
+                if self.model_type in ["distilbert-qa", "fallback"]:
+                    # Use extractive Q&A pipeline
                     result = self.qa_pipeline(question=query, context=context)
-                    answer = result['answer']
-                    confidence = result['score']
+                    answer = result.get('answer', '').strip()
+                    confidence = result.get('score', 0)
 
-                    if confidence > 0.1:
+                    if confidence > 0.1 and answer:
                         return f"**Answer:** {answer}\n\n**Context:** {context[:200]}..."
                     else:
                         return self.extract_direct_answer(query, context)
-
+
                 elif self.model_type == "flan-t5":
-                    # Use ...
-                    prompt = ...
-                    ...
+                    # Use generative model with improved prompt to reduce hallucination
+                    prompt = (
+                        f"Answer concisely and strictly based on the following context.\n\n"
+                        f"Context:\n{context}\n\n"
+                        f"Question:\n{query}\n\n"
+                        f"If the answer is not contained in the context, reply with 'Not found in document.'\n"
+                        f"Answer:"
+                    )
+                    result = self.qa_pipeline(prompt, max_length=256, num_return_sequences=1)
+                    generated_text = result[0].get('generated_text', '')
+                    answer = generated_text.replace(prompt, '').strip()
+
+                    if answer.lower() in ["not found in document.", "no answer", "unknown", ""]:
+                        return "🔍 Sorry, the answer was not found in the documents."
+                    else:
+                        return f"**Answer:** {answer}"
+
                 else:
+                    # Default fallback extraction
                     return self.extract_direct_answer(query, context)
-
+
             except Exception as e:
                 print(f"Model inference error: {e}")
                 return self.extract_direct_answer(query, context)
 
         except Exception as e:
             return f"❌ Error processing question: {str(e)}"
+
 
     def extract_direct_answer(self, query: str, context: str) -> str:
         """Direct answer extraction as fallback"""
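The flan-t5 branch presumably runs a text2text-generation pipeline. Worth noting: for seq2seq models, 'generated_text' contains only the completion, not the prompt, so the replace(prompt, '') call above is a harmless no-op rather than a required strip. A sketch of the assumed wiring (the checkpoint name is an assumption):

from transformers import pipeline

qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")  # assumed checkpoint
context = "Paris is the capital of France."
query = "What is the capital of France?"
prompt = (
    f"Answer concisely and strictly based on the following context.\n\n"
    f"Context:\n{context}\n\n"
    f"Question:\n{query}\n\n"
    f"If the answer is not contained in the context, reply with 'Not found in document.'\n"
    f"Answer:"
)
result = qa_pipeline(prompt, max_length=256, num_return_sequences=1)
print(result[0]["generated_text"])  # e.g. "Paris"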
@@ -570,6 +590,33 @@ class SmartDocumentRAG:
 
         return "I found relevant content but couldn't extract a specific answer."
 
+    def clean_text(self, text: str) -> str:
+        """
+        Clean and normalize raw text by:
+        - Removing excessive whitespace
+        - Fixing merged words (camel case separation)
+        - Removing unwanted characters (optional)
+        - Lowercasing or preserving case (optional)
+        """
+        import re
+
+        # Replace multiple whitespace/newlines/tabs with single space
+        text = re.sub(r'\s+', ' ', text).strip()
+
+        # Fix merged words like 'wordAnotherWord' -> 'word Another Word'
+        text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
+
+        # Optional: remove special characters except basic punctuation
+        # text = re.sub(r'[^a-zA-Z0-9,.!?;:\'\"()\-\s]', '', text)
+
+        return text
+
+
 # Initialize the system
 print("Initializing Enhanced Smart RAG System...")
 rag_system = SmartDocumentRAG()
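A quick check of what the new clean_text helper does to typical PDF-extraction artifacts (the input string is invented for illustration; the two substitutions mirror the method body):

import re

raw = "Machine   learningModels\n\tlearn   fromData."
text = re.sub(r'\s+', ' ', raw).strip()           # collapse whitespace runs
text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)  # split merged camel-case words
print(text)  # "Machine learning Models learn from Data."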