Update app.py

app.py CHANGED
@@ -1,265 +1,158 @@
 import os
 import re
 import faiss
-import docx
-import PyPDF2
 import gradio as gr
 import numpy as np
-from typing import List, Dict
+import pdfplumber
+import docx
+from typing import List, Optional
+
 from sentence_transformers import SentenceTransformer
 from transformers import pipeline
 
+# Utility: Clean text helper
+def clean_text(text: str) -> str:
+    text = re.sub(r'\s+', ' ', text)  # collapse whitespace
+    text = text.strip()
+    return text
+
+# Text chunking (smaller chunks for better semantic search)
+def chunk_text(text: str, chunk_size: int = 300, overlap: int = 50) -> List[str]:
+    words = text.split()
+    chunks = []
+    start = 0
+    while start < len(words):
+        end = min(start + chunk_size, len(words))
+        chunk = ' '.join(words[start:end])
+        chunks.append(clean_text(chunk))
+        start += chunk_size - overlap
+    return chunks
+
+# Document loader for txt, pdf, docx
+def load_document(file_path: str) -> str:
+    ext = os.path.splitext(file_path)[1].lower()
+    text = ""
+    if ext == ".txt":
+        with open(file_path, 'r', encoding='utf-8') as f:
+            text = f.read()
+    elif ext == ".pdf":
+        with pdfplumber.open(file_path) as pdf:
+            pages = [page.extract_text() for page in pdf.pages if page.extract_text()]
+            text = "\n".join(pages)
+    elif ext == ".docx":
+        doc = docx.Document(file_path)
+        paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
+        text = "\n".join(paragraphs)
+    else:
+        raise ValueError(f"Unsupported file type: {ext}")
+    return clean_text(text)
 
 class SmartDocumentRAG:
-    def __init__(self):
-        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
-
-        # Load Q&A pipeline model
-        qa_model = "distilbert-base-cased-distilled-squad"
-        self.qa_pipeline = pipeline('question-answering', model=qa_model, tokenizer=qa_model)
-
-        # Document and index initialization
+    def __init__(self):
+        print("Loading embedder and models...")
+        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')  # small, fast
         self.documents = []
-        self.raw_text = ""
-        self.document_summary = ""
-        self.document_type = ""
+        self.embeddings = None
         self.index = None
         self.is_indexed = False
-        self.model_type = "distilbert-qa"  # Can add flan-t5 or others as needed
 
-    ####################
-    # Text Extraction
-    ####################
-    def extract_text_from_file(self, file_path: str) -> str:
-        ext = os.path.splitext(file_path)[1].lower()
-        try:
-            if ext == '.pdf':
-                return self.extract_from_pdf(file_path)
-            elif ext == '.docx':
-                return self.extract_from_docx(file_path)
-            elif ext == '.txt':
-                return self.extract_from_txt(file_path)
-            else:
-                return f"Unsupported file type: {ext}"
-        except Exception as e:
-            return f"Error reading file: {e}"
-
-    def extract_from_pdf(self, file_path: str) -> str:
-        text = ""
-        try:
-            with open(file_path, 'rb') as f:
-                reader = PyPDF2.PdfReader(f)
-                for page in reader.pages:
-                    txt = page.extract_text() or ""
-                    cleaned = self.clean_text(txt)
-                    text += cleaned + "\n"
-            return text.strip()
-        except Exception as e:
-            return f"Error reading PDF: {e}"
-
-    def extract_from_docx(self, file_path: str) -> str:
-        try:
-            doc = docx.Document(file_path)
-            text = "\n".join(p.text for p in doc.paragraphs)
-            return self.clean_text(text)
-        except Exception as e:
-            return f"Error reading DOCX: {e}"
-
-    def extract_from_txt(self, file_path: str) -> str:
-        encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
-        for enc in encodings:
-            try:
-                with open(file_path, 'r', encoding=enc) as f:
-                    return self.clean_text(f.read())
-            except UnicodeDecodeError:
-                continue
-            except Exception as e:
-                return f"Error reading TXT: {e}"
-        return "Could not decode TXT file."
-
-    def clean_text(self, text: str) -> str:
-        # Normalize whitespace, fix broken words, remove weird chars
-        text = re.sub(r'\s+', ' ', text)
-        text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)  # Fix camel case merges
-        text = text.strip()
-        return text
-
-    ####################
-    # Document Type Detection & Summary
-    ####################
-    def detect_document_type(self, text: str) -> str:
-        lower_text = text.lower()
-        if any(k in lower_text for k in ['abstract', 'study', 'research', 'methodology']):
-            return 'research'
-        elif any(k in lower_text for k in ['company', 'business', 'organization', 'financial']):
-            return 'business'
+        # Load QA pipelines
+        self.model_type = "distilbert-qa"  # change to "flan-t5" for generative
+        if self.model_type == "distilbert-qa":
+            self.qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
+        elif self.model_type == "flan-t5":
+            self.qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")
         else:
-            return 'general'
+            self.qa_pipeline = None
 
-    def create_document_summary(self, text: str) -> str:
-        sentences = re.split(r'(?<=[.!?]) +', text)
-        sentences = [s.strip() for s in sentences if len(s.strip()) > 10]
-
-        if self.document_type == 'research':
-            return self.extract_research_summary(sentences)
-        elif self.document_type == 'business':
-            return self.extract_business_summary(sentences)
-        else:
-            return self.extract_general_summary(sentences)
+        self.document_summary = ""
 
-    def extract_research_summary(self, sentences: List[str]) -> str:
-        for s in sentences[:7]:
-            if any(w in s.lower() for w in ['abstract', 'study', 'research']):
-                return s[:300] + ('...' if len(s) > 300 else '')
-        return sentences[0][:300] if sentences else "Research document."
-
-    def extract_business_summary(self, sentences: List[str]) -> str:
-        for s in sentences[:5]:
-            if any(w in s.lower() for w in ['company', 'business', 'organization']):
-                return s[:300] + ('...' if len(s) > 300 else '')
-        return sentences[0][:300] if sentences else "Business document."
-
-    def extract_general_summary(self, sentences: List[str]) -> str:
-        return sentences[0][:300] + ('...' if len(sentences[0]) > 300 else '') if sentences else "General document."
-
-    ####################
-    # Chunking
-    ####################
-    def enhanced_chunk_text(self, text: str, chunk_size: int = 3, overlap: int = 1) -> List[Dict]:
-        if not text.strip():
-            return []
-
-        sentences = re.split(r'(?<=[.!?]) +', text)
-        sentences = [s.strip() for s in sentences if len(s.strip()) > 10]
-
-        chunks = []
-        for i in range(0, len(sentences), chunk_size - overlap):
-            chunk_sents = sentences[i:i + chunk_size]
-            if chunk_sents:
-                chunk_text = " ".join(chunk_sents)
-                chunks.append({
-                    "text": chunk_text,
-                    "sentence_indices": list(range(i, min(i + chunk_size, len(sentences)))),
-                    "doc_type": self.document_type
-                })
-        return chunks
-
-    ####################
-    # Processing uploaded files
-    ####################
-    def process_documents(self, files) -> str:
+    def process_documents(self, files: List[gr.File]) -> str:
         if not files:
-            return "No files uploaded!"
-
-        try:
-            all_text = ""
-            processed_files = []
-            for file in files:
-                file_text = self.extract_text_from_file(file.name)
-                if not file_text.startswith("Error"):
-                    all_text += " " + file_text
-                    processed_files.append(os.path.basename(file.name))
-                else:
-                    return f"{file_text}"
-
-            if not all_text.strip():
-                return "No text extracted from files!"
-
-            self.raw_text = all_text.strip()
-            self.document_type = self.detect_document_type(self.raw_text)
-            self.document_summary = self.create_document_summary(self.raw_text)
-
-            chunks = self.enhanced_chunk_text(self.raw_text)
-            if not chunks:
-                return "No valid chunks created!"
-
-            self.documents = chunks
-            embeddings = self.embedder.encode([c["text"] for c in chunks])
-            self.index = faiss.IndexFlatIP(embeddings.shape[1])
-            self.index.add(embeddings.astype('float32'))
-            self.is_indexed = True
-
-            return (f"Processed {len(processed_files)} file(s)\n"
-                    f"Summary: {self.document_summary}\n"
-                    f"Ready for Q&A!")
-        except Exception as e:
-            return f"Error processing documents: {e}"
+            return "⚠️ No files uploaded."
+        print(f"Processing {len(files)} files...")
 
+        all_text = ""
+        for file in files:
+            try:
+                # gr.File is a dict-like, get 'name' key for path
+                path = file.name if hasattr(file, 'name') else file
+                text = load_document(path)
+                all_text += text + "\n"
+            except Exception as e:
+                print(f"Error loading {file}: {e}")
+
+        all_text = clean_text(all_text)
+        chunks = chunk_text(all_text)
+
+        if not chunks:
+            return "⚠️ No text extracted from documents."
+
+        self.documents = chunks
+        print(f"Created {len(chunks)} text chunks.")
+
+        # Embed and build FAISS index
+        self.embeddings = self.embedder.encode(self.documents, convert_to_numpy=True)
+        dimension = self.embeddings.shape[1]
+        self.index = faiss.IndexFlatIP(dimension)  # Cosine similarity with normalized vectors
+        faiss.normalize_L2(self.embeddings)
+        self.index.add(self.embeddings)
+        self.is_indexed = True
+
+        # Generate summary (simple: first 3 chunks joined)
+        summary_text = " ".join(self.documents[:3])
+        self.document_summary = summary_text if summary_text else "Summary not available."
+
+        return f"Processed {len(files)} files and created index with {len(chunks)} chunks."
 
-    ####################
-    # Search & Answer
-    ####################
-    def find_relevant_content(self, query: str, top_k: int = 3) -> str:
+    def find_relevant_content(self, query: str, k: int = 5) -> str:
         if not self.is_indexed:
             return ""
 
-        query_embedding = self.embedder.encode([query])
-        k = min(top_k, len(self.documents))
-        scores, indices = self.index.search(query_embedding.astype('float32'), k)
-
-        relevant_chunks = []
-        for score, idx in zip(scores[0], indices[0]):
-            if idx < len(self.documents) and score > 0.15:
-                relevant_chunks.append(self.documents[idx])
-
-        return " ".join(c["text"] for c in relevant_chunks)
+        query_emb = self.embedder.encode([query], convert_to_numpy=True)
+        faiss.normalize_L2(query_emb)
+        k = min(k, len(self.documents))
+        distances, indices = self.index.search(query_emb, k)
+
+        relevant_chunks = []
+        for dist, idx in zip(distances[0], indices[0]):
+            if dist > 0.1 and idx < len(self.documents):
+                relevant_chunks.append(self.documents[idx])
+        context = " ".join(relevant_chunks)
+        print(f"Found {len(relevant_chunks)} relevant chunks with distances >0.1")
+        return context
 
     def answer_question(self, query: str) -> str:
         if not query.strip():
            return "Please ask a valid question."
         if not self.is_indexed:
             return "Please upload and process documents first."
+
         query_lower = query.lower()
         if any(word in query_lower for word in ['summary', 'summarize', 'overview', 'about']):
-            if self.document_summary:
-                return f"Document Summary:\n\n{self.document_summary}"
-            else:
-                return "⚠️ Summary not available. Please process documents first."
+            return f"Document Summary:\n\n{self.document_summary}"
+
         context = self.find_relevant_content(query, k=5)
         print(f"Context for query: {context[:500]}...")
+
         if not context:
             return "Sorry, no relevant information found. Try rephrasing your question."
+
         try:
             if self.model_type == "distilbert-qa":
                 result = self.qa_pipeline(question=query, context=context)
                 print(f"QA pipeline result: {result}")
                 answer = result.get('answer', '').strip()
                 score = result.get('score', 0.0)
 
                 if not answer or score < 0.05:
                     return "I couldn't find a confident answer based on the documents."
 
                 snippet = context[:300].strip()
                 if len(context) > 300:
                     snippet += "..."
                 return f"**Answer:** {answer}\n\n*Context snippet:* {snippet}"
 
             elif self.model_type == "flan-t5":
                 prompt = (
                     f"Answer the question based on the context below.\n\n"
@@ -267,98 +160,57 @@ class SmartDocumentRAG:
                     f"Question: {query}\nAnswer:"
                 )
                 result = self.qa_pipeline(prompt, max_length=200, num_return_sequences=1)
                 print(f"Generative pipeline result: {result}")
                 answer = result[0]['generated_text'].replace(prompt, '').strip()
                 if not answer:
                     return "I couldn't find a confident answer based on the documents."
                 return f"**Answer:** {answer}"
 
             else:
                 return "⚠️ Unsupported model type."
 
         except Exception as e:
+            print(f"Exception in answer_question: {e}")
+            return f"Error: {str(e)}"
 
-            if exp:
-                return f"**Experience:** {exp[0]} years"
 
-        if any(k in lower_query for k in ['skill', 'technology', 'tech']):
-            skills_regex = r'\b(Python|Java|JavaScript|React|Node|SQL|AWS|Docker|Kubernetes|Git|HTML|CSS|Angular|Vue|Spring|Django|Flask|MongoDB|PostgreSQL)\b'
-            skills_found = list(set(re.findall(skills_regex, context, re.I)))
-            if skills_found:
-                return f"**Skills mentioned:** {', '.join(skills_found)}"
 
-            if edu:
-                return f"**Education:** {edu[0]}"
 
-        sentences = re.split(r'(?<=[.!?]) +', context)
-        if sentences:
-            return f"**Answer:** {sentences[0]}"
 
-# Gradio interface creation
-def create_interface():
-    rag_system = SmartDocumentRAG()
-
-    with gr.Blocks(title="🧠 Enhanced Document Q&A", theme=gr.themes.Soft()) as demo:
-        gr.Markdown("""
-        # 🧠 Enhanced Document Q&A System
-
-        **Optimized with Better Chunking, Summaries, and Reduced Hallucination**
-
-        **Features:**
-        - 🎯 DistilBERT Q&A pipeline for accurate answers
-        - ⚡ SentenceTransformer embeddings + FAISS semantic search
-        - Improved document summaries & chunking
-        - Direct answer fallback for facts extraction
-        """)
-
-        with gr.Tab("Upload & Process"):
-            with gr.Row():
-                with gr.Column():
-                    file_upload = gr.File(label="Upload Documents", file_types=[".pdf", ".docx", ".txt"], file_count="multiple", interactive=True)
-                    process_btn = gr.Button("Process Documents", variant="primary")
-                with gr.Column():
-                    process_status = gr.Textbox(label="Processing Status", lines=8, interactive=False)
-
-            process_btn.click(fn=rag_system.process_documents, inputs=[file_upload], outputs=[process_status])
-
-        with gr.Tab("Q&A"):
-            with gr.Row():
-                with gr.Column():
-                    question_input = gr.Textbox(label="Ask Your Question", placeholder="Enter your question here...", lines=3)
-                    with gr.Row():
-                        ask_btn = gr.Button("🧠 Get Answer", variant="primary")
-                        summary_btn = gr.Button("Get Summary", variant="secondary")
-                with gr.Column():
-                    answer_output = gr.Textbox(label="💡 Answer", lines=8, interactive=False)
-
-            ask_btn.click(fn=rag_system.answer_question, inputs=[question_input], outputs=[answer_output])
-            summary_btn.click(fn=lambda: rag_system.answer_question("summary"), inputs=[], outputs=[answer_output])
+# Create Gradio UI
+def create_interface():
+    rag = SmartDocumentRAG()
+
+    with gr.Blocks(title="🧠 Enhanced Document Q&A") as demo:
+        gr.Markdown(
+            """
+            # 🧠 Enhanced Document Q&A System
+            **Features:**
+            - Semantic search with FAISS + SentenceTransformer
+            - Supports PDF, DOCX, TXT uploads
+            - Uses DistilBERT or Flan-T5 for Q&A
+            - Shows answer with context snippet
+            """
+        )
+
+        with gr.Tab("Upload & Process"):
+            file_upload = gr.File(file_types=['.pdf', '.docx', '.txt'], label="Upload Documents", file_count="multiple")
+            process_btn = gr.Button("Process Documents")
+            process_status = gr.Textbox(label="Processing Status", interactive=False, lines=4)
+
+            process_btn.click(fn=rag.process_documents, inputs=[file_upload], outputs=[process_status])
+
+        with gr.Tab("Q&A"):
+            question_input = gr.Textbox(label="Ask your question", lines=2, placeholder="Type your question here...")
+            ask_btn = gr.Button("Get Answer")
+            answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)
+
+            ask_btn.click(fn=rag.answer_question, inputs=[question_input], outputs=[answer_output])
+
+        with gr.Tab("Summary"):
+            summary_btn = gr.Button("Get Document Summary")
+            summary_output = gr.Textbox(label="Summary", lines=6, interactive=False)
+
+            summary_btn.click(fn=lambda: rag.answer_question("summary"), inputs=[], outputs=[summary_output])
 
     return demo
 
 if __name__ == "__main__":
     demo = create_interface()
     demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
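A quick way to sanity-check the new word-window chunker in isolation. The two helpers below are condensed from the diff above (clean_text collapsed to one line); the sample text and the small chunk_size/overlap values are illustrative only, not the production defaults.

import re
from typing import List

def clean_text(text: str) -> str:
    return re.sub(r'\s+', ' ', text).strip()

def chunk_text(text: str, chunk_size: int = 300, overlap: int = 50) -> List[str]:
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        end = min(start + chunk_size, len(words))
        chunks.append(clean_text(' '.join(words[start:end])))
        start += chunk_size - overlap
    return chunks

# 26 words, windows of 10 with an overlap of 3 -> window starts at 0, 7, 14, 21
sample = ' '.join(f'w{i}' for i in range(26))
for c in chunk_text(sample, chunk_size=10, overlap=3):
    print(c.split()[0], '...', c.split()[-1])

One thing worth noting in review: if a caller ever passes overlap >= chunk_size, start never advances and the loop runs forever; the defaults (300/50) are safe.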
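The new IndexFlatIP plus normalize_L2 combination relies on the identity that inner product over L2-normalized vectors equals cosine similarity, which is what makes the dist > 0.1 threshold in find_relevant_content meaningful as a similarity cutoff. A minimal self-contained check of that identity, with random vectors and illustrative sizes:

import faiss
import numpy as np

rng = np.random.default_rng(0)
dim, n = 8, 5                                  # illustrative sizes
xb = rng.random((n, dim), dtype=np.float32)    # stand-ins for document embeddings
xq = rng.random((1, dim), dtype=np.float32)    # stand-in for a query embedding

faiss.normalize_L2(xb)                         # in-place L2 normalization (float32 required)
faiss.normalize_L2(xq)

index = faiss.IndexFlatIP(dim)                 # exact inner-product search
index.add(xb)
scores, ids = index.search(xq, 3)              # top-3; scores are cosine similarities here

cosine = (xq @ xb.T)[0]                        # the same similarities computed directly
print(scores[0])
print(np.sort(cosine)[::-1][:3])               # matches the FAISS scores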
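For an end-to-end smoke test without launching the UI: process_documents falls back to treating each entry as a plain path when it has no .name attribute, so a temp file is enough to drive it. This sketch assumes the module is saved as app.py and that the embedder and QA models can be downloaded on first run.

import os
import tempfile

from app import SmartDocumentRAG  # assumes the file above is saved as app.py

# Write a small throwaway document to index.
with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as f:
    f.write("FAISS is a library for efficient similarity search on dense vectors. " * 60)
    path = f.name

rag = SmartDocumentRAG()
print(rag.process_documents([path]))           # plain path: exercises the hasattr(file, 'name') fallback
print(rag.answer_question("What is FAISS?"))   # retrieval + DistilBERT QA
print(rag.answer_question("summary"))          # hits the summary branch
os.remove(path)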