Spaces:

pradeepsengarr
/

Custom_Rag_Bot

Sleeping

App Files Community

pradeepsengarr committed on Jun 7, 2025

Commit

aee5caa

verified ·

1 Parent(s): 26dd37e

Update app.py

Browse files

Files changed (1) hide show

app.py +290 -217

app.py CHANGED Viewed

@@ -1,264 +1,337 @@
-import re
 import os
 import faiss
-import numpy as np
 import gradio as gr
-from typing import List
 from sentence_transformers import SentenceTransformer
 from transformers import pipeline
-from PyPDF2 import PdfReader
-import docx2txt
-# === Helper functions ===
-def clean_text(text: str) -> str:
-    """Clean and normalize text."""
-    text = re.sub(r'\s+', ' ', text)  # normalize whitespace
-    text = text.strip()
-    return text
-def chunk_text(text: str, max_chunk_size: int = 300, overlap: int = 50) -> List[str]:
-    """Split text into smaller overlapping chunks for better semantic search."""
-    sentences = re.split(r'(?<=[.?!])\s+', text)
-    chunks = []
-    chunk = ""
-    for sentence in sentences:
-        if len(chunk) + len(sentence) <= max_chunk_size:
-            chunk += sentence + " "
-        else:
-            chunks.append(chunk.strip())
-            chunk = sentence + " "
-    if chunk:
-        chunks.append(chunk.strip())
-    # Add overlapping between chunks to retain context
-    overlapped_chunks = []
-    for i in range(len(chunks)):
-        combined = chunks[i]
-        if i > 0:
-            combined = chunks[i-1][-overlap:] + " " + combined
-        overlapped_chunks.append(clean_text(combined))
-    return overlapped_chunks
-def extract_text_from_pdf(file_path: str) -> str:
-    """Extract text from PDF file."""
-    text = ""
-    try:
-        reader = PdfReader(file_path)
-        for page in reader.pages:
-            text += page.extract_text() + " "
-    except Exception as e:
-        print(f"Error reading PDF {file_path}: {e}")
-    return clean_text(text)
-def extract_text_from_docx(file_path: str) -> str:
-    """Extract text from DOCX file."""
-    try:
-        text = docx2txt.process(file_path)
-        return clean_text(text)
-    except Exception as e:
-        print(f"Error reading DOCX {file_path}: {e}")
-        return ""
-def extract_text_from_txt(file_path: str) -> str:
-    """Extract text from TXT file."""
-    try:
-        with open(file_path, 'r', encoding='utf-8') as f:
-            text = f.read()
-        return clean_text(text)
-    except Exception as e:
-        print(f"Error reading TXT {file_path}: {e}")
-        return ""
-# === Main RAG System ===
 class SmartDocumentRAG:
-    def __init__(self):
-        # Model & embedding initialization
-        self.embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
-        self.qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
         self.documents = []
-        self.chunks = []
         self.index = None
         self.is_indexed = False
-        self.document_summary = ""
-    def process_documents(self, uploaded_files) -> str:
-        """Load, extract, chunk, embed, and index documents."""
-        if not uploaded_files:
-            return "⚠️ No files uploaded."
-        self.documents.clear()
-        self.chunks.clear()
-        all_text = ""
-        # Extract text from each uploaded file
-        for file_obj in uploaded_files:
-            # Save file temporarily to disk to process
-            file_path = file_obj.name
-            ext = os.path.splitext(file_path)[1].lower()
-            text = ""
-            if ext == ".pdf":
-                text = extract_text_from_pdf(file_path)
-            elif ext == ".docx":
-                text = extract_text_from_docx(file_path)
-            elif ext == ".txt":
-                text = extract_text_from_txt(file_path)
             else:
-                continue  # skip unsupported
-            if text:
-                self.documents.append(text)
-                all_text += text + " "
-        if not all_text.strip():
-            return "⚠️ No extractable text found in uploaded files."
-        # Create chunks for semantic search
-        self.chunks = chunk_text(all_text)
-        # Create embeddings for chunks
-        embeddings = self.embedder.encode(self.chunks, convert_to_numpy=True)
-        embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)  # normalize
-        # Create FAISS index
-        dim = embeddings.shape[1]
-        self.index = faiss.IndexFlatIP(dim)
-        self.index.add(embeddings.astype('float32'))
-        self.is_indexed = True
-        # Create simple summary
-        self.document_summary = self.generate_summary(all_text)
-        return f"✅ Processed {len(self.documents)} document(s), {len(self.chunks)} chunks indexed."
-    def generate_summary(self, text: str) -> str:
-        """Generate a simple summary using top sentences."""
-        sentences = re.split(r'(?<=[.?!])\s+', text)
-        summary = ' '.join(sentences[:5])  # first 5 sentences as naive summary
-        return summary
     def find_relevant_content(self, query: str, top_k: int = 3) -> str:
-        """Perform semantic search to find relevant content chunks."""
-        if not self.is_indexed or not self.chunks:
             return ""
-        query_emb = self.embedder.encode([query], convert_to_numpy=True)
-        query_emb = query_emb / np.linalg.norm(query_emb, axis=1, keepdims=True)
-        scores, indices = self.index.search(query_emb.astype('float32'), min(top_k, len(self.chunks)))
-        relevant_chunks = []
-        for i, idx in enumerate(indices[0]):
-            if scores[0][i] > 0.1:
-                relevant_chunks.append(self.chunks[idx])
-        return " ".join(relevant_chunks)
     def extract_direct_answer(self, query: str, context: str) -> str:
-        """Simple regex-based fallback extraction."""
-        q = query.lower()
-        if any(word in q for word in ['name', 'who is', 'who']):
             names = re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', context)
             if names:
                 return f"**Name:** {names[0]}"
-        if any(word in q for word in ['experience', 'years']):
-            years = re.findall(r'(\d+)[\+\-\s]*(?:years?|yrs?)', context.lower())
-            if years:
-                return f"**Experience:** {years[0]} years"
-        if any(word in q for word in ['skill', 'technology', 'tech']):
-            skills = re.findall(r'\b(?:Python|Java|JavaScript|React|Node|SQL|AWS|Docker|Kubernetes|Git|HTML|CSS|Angular|Vue|Spring|Django|Flask|MongoDB|PostgreSQL)\b', context, re.I)
-            if skills:
-                unique_skills = sorted(set(skills), key=skills.index)
-                return f"**Skills:** {', '.join(unique_skills)}"
-        if any(word in q for word in ['education', 'degree', 'university']):
             edu = re.findall(r'(?:Bachelor|Master|PhD|B\.?S\.?|M\.?S\.?|B\.?A\.?|M\.?A\.?).*?(?:in|of)\s+([^.]+)', context, re.I)
             if edu:
                 return f"**Education:** {edu[0]}"
-        # Fallback: first sentence from context
-        sentences = [s.strip() for s in context.split('.') if s.strip()]
         if sentences:
             return f"**Answer:** {sentences[0]}"
-        return "I found relevant content but could not extract a specific answer."
-    def answer_question(self, query: str) -> str:
-        if not query.strip():
-            return "❓ Please ask a question."
-        if not self.is_indexed:
-            return "📁 Please upload and process documents first."
-        q_lower = query.lower()
-        if any(word in q_lower for word in ['summary', 'summarize', 'overview', 'about']):
-            return f"📄 **Document Summary:**\n\n{self.document_summary}"
-        context = self.find_relevant_content(query, top_k=3)
-        if not context:
-            return "🔍 No relevant information found. Try rephrasing your question."
-        try:
-            # Use model for QA
-            result = self.qa_pipeline(question=query, context=context)
-            answer = result.get('answer', '').strip()
-            score = result.get('score', 0)
-            # Confidence threshold to fallback to regex extraction
-            if score < 0.1 or not answer:
-                return self.extract_direct_answer(query, context)
-            return f"**Answer:** {answer}\n\n**Context:** {context[:200]}..."
-        except Exception as e:
-            print(f"QA model error: {e}")
-            return self.extract_direct_answer(query, context)
-# === Gradio UI ===
-def main():
-    rag = SmartDocumentRAG()
-    def process_files(files):
-        return rag.process_documents(files)
-    def ask_question(question):
-        return rag.answer_question(question)
-    def get_summary():
-        return rag.answer_question("summary")
     with gr.Blocks(title="🧠 Enhanced Document Q&A", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
         # 🧠 Enhanced Document Q&A System
-        **Optimized with Better Models & Semantic Search**
-        - Upload PDF, DOCX, TXT files
-        - Semantic search + QA pipeline
-        - Direct answer extraction fallback
         """)
         with gr.Tab("📤 Upload & Process"):
             with gr.Row():
                 with gr.Column():
-                    file_upload = gr.File(label="📁 Upload Documents", file_types=['.pdf','.docx','.txt'], file_count="multiple", height=150)
-                    process_btn = gr.Button("🔄 Process Documents", variant="primary", size="lg")
                 with gr.Column():
-                    process_status = gr.Textbox(label="📋 Processing Status", lines=10, interactive=False)
-            process_btn.click(fn=process_files, inputs=file_upload, outputs=process_status)
         with gr.Tab("❓ Q&A"):
             with gr.Row():
                 with gr.Column():
-                    question_input = gr.Textbox(label="🤔 Ask Your Question", lines=3,
-                        placeholder="Name? Experience? Skills? Education?")
                     with gr.Row():
                         ask_btn = gr.Button("🧠 Get Answer", variant="primary")
                         summary_btn = gr.Button("📊 Get Summary", variant="secondary")
                 with gr.Column():
                     answer_output = gr.Textbox(label="💡 Answer", lines=8, interactive=False)
-            ask_btn.click(fn=ask_question, inputs=question_input, outputs=answer_output)
-            summary_btn.click(fn=get_summary, inputs=None, outputs=answer_output)
-    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
 if __name__ == "__main__":
-    main()

 import os
+import re
 import faiss
+import docx
+import PyPDF2
 import gradio as gr
+import numpy as np
+from typing import List, Dict
 from sentence_transformers import SentenceTransformer
 from transformers import pipeline
 class SmartDocumentRAG:
+    def __init__(self, embedder_model='sentence-transformers/all-MiniLM-L6-v2', qa_model='distilbert-base-cased-distilled-squad'):
+        # Load sentence embedding model
+        self.embedder = SentenceTransformer(embedder_model)
+        # Load Q&A pipeline model
+        self.qa_pipeline = pipeline('question-answering', model=qa_model, tokenizer=qa_model)
+        # Document and index initialization
         self.documents = []
+        self.document_metadata = []
+        self.raw_text = ""
+        self.document_summary = ""
+        self.document_type = ""
         self.index = None
         self.is_indexed = False
+        self.model_type = "distilbert-qa"  # Can add flan-t5 or others as needed
+    ####################
+    # Text Extraction
+    ####################
+    def extract_text_from_file(self, file_path: str) -> str:
+        ext = os.path.splitext(file_path)[1].lower()
+        try:
+            if ext == '.pdf':
+                return self.extract_from_pdf(file_path)
+            elif ext == '.docx':
+                return self.extract_from_docx(file_path)
+            elif ext == '.txt':
+                return self.extract_from_txt(file_path)
             else:
+                return f"Unsupported file type: {ext}"
+        except Exception as e:
+            return f"Error reading file: {e}"
+    def extract_from_pdf(self, file_path: str) -> str:
+        text = ""
+        try:
+            with open(file_path, 'rb') as f:
+                reader = PyPDF2.PdfReader(f)
+                for page in reader.pages:
+                    txt = page.extract_text() or ""
+                    cleaned = self.clean_text(txt)
+                    text += cleaned + "\n"
+            return text.strip()
+        except Exception as e:
+            return f"Error reading PDF: {e}"
+    def extract_from_docx(self, file_path: str) -> str:
+        try:
+            doc = docx.Document(file_path)
+            paragraphs = [self.clean_text(p.text) for p in doc.paragraphs if p.text.strip()]
+            return "\n".join(paragraphs)
+        except Exception as e:
+            return f"Error reading DOCX: {e}"
+    def extract_from_txt(self, file_path: str) -> str:
+        encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
+        for enc in encodings:
+            try:
+                with open(file_path, 'r', encoding=enc) as f:
+                    return self.clean_text(f.read())
+            except UnicodeDecodeError:
+                continue
+            except Exception as e:
+                return f"Error reading TXT: {e}"
+        return "Could not decode TXT file."
+    def clean_text(self, text: str) -> str:
+        # Normalize whitespace, fix broken words, remove weird chars
+        text = re.sub(r'\s+', ' ', text)
+        text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)  # Fix camel case merges
+        text = text.strip()
+        return text
+    ####################
+    # Document Type Detection & Summary
+    ####################
+    def detect_document_type(self, text: str) -> str:
+        lower_text = text.lower()
+        if any(k in lower_text for k in ['abstract', 'study', 'research', 'methodology']):
+            return 'research'
+        elif any(k in lower_text for k in ['company', 'business', 'organization', 'financial']):
+            return 'business'
+        else:
+            return 'general'
+    def create_document_summary(self, text: str) -> str:
+        sentences = re.split(r'(?<=[.!?]) +', text)
+        sentences = [s.strip() for s in sentences if len(s.strip()) > 10]
+        if self.document_type == 'research':
+            return self.extract_research_summary(sentences)
+        elif self.document_type == 'business':
+            return self.extract_business_summary(sentences)
+        else:
+            return self.extract_general_summary(sentences)
+    def extract_research_summary(self, sentences: List[str]) -> str:
+        for s in sentences[:7]:
+            if any(w in s.lower() for w in ['abstract', 'study', 'research']):
+                return s[:300] + ('...' if len(s) > 300 else '')
+        return sentences[0][:300] if sentences else "Research document."
+    def extract_business_summary(self, sentences: List[str]) -> str:
+        for s in sentences[:5]:
+            if any(w in s.lower() for w in ['company', 'business', 'organization']):
+                return s[:300] + ('...' if len(s) > 300 else '')
+        return sentences[0][:300] if sentences else "Business document."
+    def extract_general_summary(self, sentences: List[str]) -> str:
+        return sentences[0][:300] + ('...' if len(sentences[0]) > 300 else '') if sentences else "General document."
+    ####################
+    # Chunking
+    ####################
+    def enhanced_chunk_text(self, text: str, chunk_size: int = 3, overlap: int = 1) -> List[Dict]:
+        if not text.strip():
+            return []
+        sentences = re.split(r'(?<=[.!?]) +', text)
+        sentences = [s.strip() for s in sentences if len(s.strip()) > 10]
+        chunks = []
+        for i in range(0, len(sentences), chunk_size - overlap):
+            chunk_sents = sentences[i:i + chunk_size]
+            if chunk_sents:
+                chunk_text = " ".join(chunk_sents)
+                chunks.append({
+                    "text": chunk_text,
+                    "sentence_indices": list(range(i, min(i + chunk_size, len(sentences)))),
+                    "doc_type": self.document_type
+                })
+        return chunks
+    ####################
+    # Processing uploaded files
+    ####################
+    def process_documents(self, files) -> str:
+        if not files:
+            return "❌ No files uploaded!"
+        try:
+            all_text = ""
+            processed_files = []
+            for file in files:
+                if file is None:
+                    continue
+                file_text = self.extract_text_from_file(file.name)
+                if not file_text.startswith("Error") and not file_text.startswith("Unsupported"):
+                    all_text += " " + file_text
+                    processed_files.append(os.path.basename(file.name))
+                else:
+                    return f"❌ {file_text}"
+            if not all_text.strip():
+                return "❌ No text extracted from files!"
+            self.raw_text = all_text.strip()
+            self.document_type = self.detect_document_type(self.raw_text)
+            self.document_summary = self.create_document_summary(self.raw_text)
+            chunks = self.enhanced_chunk_text(self.raw_text)
+            if not chunks:
+                return "❌ No valid chunks created!"
+            self.documents = [c["text"] for c in chunks]
+            self.document_metadata = chunks
+            embeddings = self.embedder.encode(self.documents, show_progress_bar=False, convert_to_numpy=True)
+            dimension = embeddings.shape[1]
+            self.index = faiss.IndexFlatIP(dimension)
+            faiss.normalize_L2(embeddings)
+            self.index.add(embeddings.astype('float32'))
+            self.is_indexed = True
+            return (f"✅ Processed {len(processed_files)} files: {', '.join(processed_files)}\n"
+                    f"📄 Document Type: {self.document_type.title()}\n"
+                    f"🔍 Created {len(self.documents)} chunks\n"
+                    f"📝 Summary: {self.document_summary}\n"
+                    f"🚀 Ready for Q&A!")
+        except Exception as e:
+            return f"❌ Error processing documents: {e}"
+    ####################
+    # Search & Answer
+    ####################
     def find_relevant_content(self, query: str, top_k: int = 3) -> str:
+        if not self.is_indexed:
             return ""
+        try:
+            query_embedding = self.embedder.encode([query], convert_to_numpy=True)
+            faiss.normalize_L2(query_embedding)
+            k = min(top_k, len(self.documents))
+            scores, indices = self.index.search(query_embedding.astype('float32'), k)
+            relevant_chunks = []
+            for score, idx in zip(scores[0], indices[0]):
+                if idx < len(self.documents) and score > 0.15:
+                    relevant_chunks.append(self.documents[idx])
+            return " ".join(relevant_chunks)
+        except Exception as e:
+            print(f"Search error: {e}")
+            return ""
+    def answer_question(self, query: str) -> str:
+        if not query.strip():
+            return "❓ Please ask a question!"
+        if not self.is_indexed:
+            return "📁 Please upload and process documents first!"
+        try:
+            lower_query = query.lower()
+            if any(k in lower_query for k in ['summary', 'summarize', 'about', 'overview']):
+                return f"📄 **Document Summary:**\n\n{self.document_summary}"
+            context = self.find_relevant_content(query, top_k=3)
+            if not context:
+                return "🔍 No relevant information found. Try rephrasing your question."
+            # Use Q&A pipeline
+            result = self.qa_pipeline(question=query, context=context)
+            answer = result.get('answer', '').strip()
+            score = result.get('score', 0.0)
+            if score < 0.15 or not answer:
+                # Fallback to direct extraction
+                return self.extract_direct_answer(query, context)
+            return f"**Answer:** {answer}\n\n**Context:** {context[:300]}..."
+        except Exception as e:
+            return f"❌ Error answering question: {e}"
     def extract_direct_answer(self, query: str, context: str) -> str:
+        lower_query = query.lower()
+        # Extract names (simple heuristic)
+        if any(k in lower_query for k in ['name', 'who is', 'who']):
             names = re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', context)
             if names:
                 return f"**Name:** {names[0]}"
+        # Extract experience years
+        if any(k in lower_query for k in ['experience', 'years']):
+            exp = re.findall(r'(\d+)[\+\-\s]*(?:years?|yrs?)', context.lower())
+            if exp:
+                return f"**Experience:** {exp[0]} years"
+        # Extract skills
+        if any(k in lower_query for k in ['skill', 'technology', 'tech']):
+            skills_regex = r'\b(Python|Java|JavaScript|React|Node|SQL|AWS|Docker|Kubernetes|Git|HTML|CSS|Angular|Vue|Spring|Django|Flask|MongoDB|PostgreSQL)\b'
+            skills_found = list(set(re.findall(skills_regex, context, re.I)))
+            if skills_found:
+                return f"**Skills mentioned:** {', '.join(skills_found)}"
+        # Extract education
+        if any(k in lower_query for k in ['education', 'degree', 'university']):
             edu = re.findall(r'(?:Bachelor|Master|PhD|B\.?S\.?|M\.?S\.?|B\.?A\.?|M\.?A\.?).*?(?:in|of)\s+([^.]+)', context, re.I)
             if edu:
                 return f"**Education:** {edu[0]}"
+        # Fallback: first sentence
+        sentences = re.split(r'(?<=[.!?]) +', context)
         if sentences:
             return f"**Answer:** {sentences[0]}"
+        return "I found relevant information but could not extract a precise answer."
# Gradio interface creation
def create_interface():
    """Build the two-tab Gradio app (upload/process and Q&A).

    A single ``SmartDocumentRAG`` instance is created here and its bound
    methods are wired directly to the buttons.

    Returns:
        The assembled ``gr.Blocks`` object; the caller is responsible for
        launching it.
    """
    rag = SmartDocumentRAG()

    def show_summary():
        # The summary button reuses the normal question path with a fixed query.
        return rag.answer_question("summary")

    with gr.Blocks(title="🧠 Enhanced Document Q&A", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🧠 Enhanced Document Q&A System
        **Optimized with Better Chunking, Summaries, and Reduced Hallucination**
        **Features:**
        - 🎯 DistilBERT Q&A pipeline for accurate answers
        - ⚡ SentenceTransformer embeddings + FAISS semantic search
        - 📊 Improved document summaries & chunking
        - 🔍 Direct answer fallback for facts extraction
        """)

        # --- Tab 1: upload files and build the index ---
        with gr.Tab("📤 Upload & Process"):
            with gr.Row():
                with gr.Column():
                    file_upload = gr.File(
                        label="📁 Upload Documents",
                        file_types=[".pdf", ".docx", ".txt"],
                        file_count="multiple",
                        interactive=True,
                    )
                    process_btn = gr.Button("🔄 Process Documents", variant="primary")
                with gr.Column():
                    process_status = gr.Textbox(
                        label="📋 Processing Status",
                        lines=8,
                        interactive=False,
                    )
            process_btn.click(
                fn=rag.process_documents,
                inputs=[file_upload],
                outputs=[process_status],
            )

        # --- Tab 2: ask questions against the indexed documents ---
        with gr.Tab("❓ Q&A"):
            with gr.Row():
                with gr.Column():
                    question_input = gr.Textbox(
                        label="🤔 Ask Your Question",
                        placeholder="Enter your question here...",
                        lines=3,
                    )
                    with gr.Row():
                        ask_btn = gr.Button("🧠 Get Answer", variant="primary")
                        summary_btn = gr.Button("📊 Get Summary", variant="secondary")
                with gr.Column():
                    answer_output = gr.Textbox(label="💡 Answer", lines=8, interactive=False)
            ask_btn.click(
                fn=rag.answer_question,
                inputs=[question_input],
                outputs=[answer_output],
            )
            summary_btn.click(fn=show_summary, inputs=[], outputs=[answer_output])

    return demo
 if __name__ == "__main__":
+    demo = create_interface()
+    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)