Spaces:

pradeepsengarr
/

Custom_Rag_Bot

Sleeping

App Files Files Community

pradeepsengarr committed on Jun 7, 2025

Commit

fd77b07

verified ·

1 Parent(s): dcc21e5

Update app.py

Browse files

Files changed (1) hide show

app.py +198 -638

app.py CHANGED Viewed

@@ -1,704 +1,264 @@
-import gradio as gr
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
-from sentence_transformers import SentenceTransformer
 import faiss
 import numpy as np
-import PyPDF2
-import docx
-import io
-import os
-import re
-from typing import List, Optional, Dict, Tuple
-import json
-from collections import Counter
-import warnings
-warnings.filterwarnings("ignore")
-class SmartDocumentRAG:
-    def __init__(self):
-        print("🚀 Initializing Enhanced Smart RAG System...")
-        # Initialize better embedding model
-        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')  # Faster and good quality
-        print("✅ Embedding model loaded")
-        # Initialize optimized LLM with better quantization
-        self.setup_llm()
-        # Document storage
-        self.documents = []
-        self.document_metadata = []
-        self.index = None
-        self.is_indexed = False
-        self.raw_text = ""
-        self.document_type = "general"
-        self.document_summary = ""
-        self.sentence_embeddings = []
-        self.sentences = []
-    def setup_llm(self):
-        """Setup optimized model with better quantization"""
-        try:
-            # Check CUDA availability
-            device = "cuda" if torch.cuda.is_available() else "cpu"
-            print(f"🔧 Using device: {device}")
-            if device == "cuda":
-                self.setup_gpu_model()
-            else:
-                self.setup_cpu_model()
-        except Exception as e:
-            print(f"❌ Error loading models: {e}")
-            self.setup_fallback_model()
-    def setup_gpu_model(self):
-        """Setup GPU model with proper quantization"""
-        try:
-            # Use Phi-2 - excellent for Q&A and reasoning
-            model_name = "microsoft/DialoGPT-medium"
-            # Better quantization config
-            quantization_config = BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_compute_dtype=torch.float16,
-                bnb_4bit_use_double_quant=True,
-                bnb_4bit_quant_type="nf4",
-                bnb_4bit_quant_storage=torch.uint8
-            )
-            try:
-                # Try Flan-T5 first - excellent for Q&A
-                model_name = "google/flan-t5-base"
-                print(f"🤖 Loading {model_name}...")
-                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-                self.model = AutoModelForCausalLM.from_pretrained(
-                    model_name,
-                    quantization_config=quantization_config,
-                    device_map="auto",
-                    torch_dtype=torch.float16,
-                    trust_remote_code=True
-                )
-                # Create pipeline for easier use
-                self.qa_pipeline = pipeline(
-                    "text2text-generation",
-                    model=self.model,
-                    tokenizer=self.tokenizer,
-                    max_length=512,
-                    do_sample=True,
-                    temperature=0.3,
-                    top_p=0.9
-                )
-                print("✅ Flan-T5 model loaded successfully")
-                self.model_type = "flan-t5"
-            except Exception as e:
-                print(f"Flan-T5 failed, trying Phi-2: {e}")
-                # Try Phi-2 as backup
-                model_name = "microsoft/phi-2"
-                print(f"🤖 Loading {model_name}...")
-                self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-                self.model = AutoModelForCausalLM.from_pretrained(
-                    model_name,
-                    quantization_config=quantization_config,
-                    device_map="auto",
-                    torch_dtype=torch.float16,
-                    trust_remote_code=True
-                )
-                if self.tokenizer.pad_token is None:
-                    self.tokenizer.pad_token = self.tokenizer.eos_token
-                print("✅ Phi-2 model loaded successfully")
-                self.model_type = "phi-2"
-        except Exception as e:
-            print(f"❌ GPU models failed: {e}")
-            self.setup_cpu_model()
-    def setup_cpu_model(self):
-        """Setup CPU-optimized model"""
-        try:
-            # Use DistilBERT for Q&A - much better than DialoGPT for this task
-            model_name = "distilbert-base-cased-distilled-squad"
-            print(f"🤖 Loading CPU model: {model_name}")
-            self.qa_pipeline = pipeline(
-                "question-answering",
-                model=model_name,
-                tokenizer=model_name
-            )
-            self.model_type = "distilbert-qa"
-            print("✅ DistilBERT Q&A model loaded successfully")
-        except Exception as e:
-            print(f"❌ CPU model failed: {e}")
-            self.setup_fallback_model()
-    def setup_fallback_model(self):
-        """Fallback to basic model"""
-        try:
-            print("🤖 Loading fallback model...")
-            self.qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
-            self.model_type = "fallback"
-            print("✅ Fallback model loaded")
-        except Exception as e:
-            print(f"❌ All models failed: {e}")
-            self.qa_pipeline = None
-            self.model_type = "none"
-    def detect_document_type(self, text: str) -> str:
-        """Enhanced document type detection"""
-        text_lower = text.lower()
-        resume_patterns = [
-            'experience', 'skills', 'education', 'linkedin', 'email', 'phone',
-            'work experience', 'employment', 'resume', 'cv', 'curriculum vitae',
-            'internship', 'projects', 'achievements', 'career', 'profile', 'objective'
-        ]
-        research_patterns = [
-            'abstract', 'introduction', 'methodology', 'conclusion', 'references',
-            'literature review', 'hypothesis', 'study', 'research', 'findings',
-            'data analysis', 'results', 'discussion', 'bibliography', 'journal'
-        ]
-        business_patterns = [
-            'company', 'revenue', 'market', 'strategy', 'business', 'financial',
-            'quarter', 'profit', 'sales', 'growth', 'investment', 'stakeholder',
-            'operations', 'management', 'corporate', 'enterprise', 'budget'
-        ]
-        technical_patterns = [
-            'implementation', 'algorithm', 'system', 'technical', 'specification',
-            'architecture', 'development', 'software', 'programming', 'api',
-            'database', 'framework', 'deployment', 'infrastructure', 'code'
-        ]
-        def count_matches(patterns, text):
-            score = 0
-            for pattern in patterns:
-                count = text.count(pattern)
-                score += count * (2 if len(pattern.split()) > 1 else 1)  # Weight phrases higher
-            return score
-        scores = {
-            'resume': count_matches(resume_patterns, text_lower),
-            'research': count_matches(research_patterns, text_lower),
-            'business': count_matches(business_patterns, text_lower),
-            'technical': count_matches(technical_patterns, text_lower)
-        }
-        max_score = max(scores.values())
-        if max_score > 5:  # Higher threshold
-            return max(scores, key=scores.get)
-        return 'general'
-    def create_document_summary(self, text: str) -> str:
-        """Enhanced document summary creation"""
-        try:
-            clean_text = re.sub(r'\s+', ' ', text).strip()
-            sentences = re.split(r'[.!?]+', clean_text)
-            sentences = [s.strip() for s in sentences if len(s.strip()) > 30]
-            if not sentences:
-                return "Document contains basic information."
-            # Use first few sentences and key information
-            if self.document_type == 'resume':
-                return self.extract_resume_summary(sentences, clean_text)
-            elif self.document_type == 'research':
-                return self.extract_research_summary(sentences)
-            elif self.document_type == 'business':
-                return self.extract_business_summary(sentences)
-            else:
-                return self.extract_general_summary(sentences)
-        except Exception as e:
-            print(f"Summary creation error: {e}")
-            return "Document summary not available."
-    def extract_resume_summary(self, sentences: List[str], full_text: str) -> str:
-        """Extract resume-specific summary with better name detection"""
-        summary_parts = []
-        # Extract name using multiple patterns
-        name = self.extract_name(full_text)
-        if name:
-            summary_parts.append(f"Resume of {name}")
-        # Extract role/title
-        role_patterns = [
-            r'(?:software|senior|junior|lead|principal)?\s*(?:engineer|developer|analyst|manager|designer|architect|consultant)',
-            r'(?:full stack|frontend|backend|data|ml|ai)\s*(?:engineer|developer)',
-            r'(?:product|project|technical)\s*manager'
-        ]
-        for sentence in sentences[:5]:
-            for pattern in role_patterns:
-                matches = re.findall(pattern, sentence.lower())
-                if matches:
-                    summary_parts.append(f"working as {matches[0].title()}")
-                    break
-        # Extract experience
-        exp_match = re.search(r'(\d+)[\+\-\s]*(?:years?|yrs?)\s*(?:of\s*)?(?:experience|exp)', full_text.lower())
-        if exp_match:
-            summary_parts.append(f"with {exp_match.group(1)}+ years of experience")
-        return '. '.join(summary_parts) + '.' if summary_parts else "Professional resume with career details."
-    def extract_name(self, text: str) -> str:
-        """Extract name from document using multiple strategies"""
-        # Strategy 1: Look for name patterns at the beginning
-        lines = text.split('\n')[:10]  # First 10 lines
-        for line in lines:
-            line = line.strip()
-            if len(line) < 50 and len(line) > 3:  # Likely a header line
-                # Check if it looks like a name
-                name_match = re.match(r'^([A-Z][a-z]+\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)(?:\s|$)', line)
-                if name_match:
-                    return name_match.group(1)
-        # Strategy 2: Look for "Name:" pattern
-        name_patterns = [
-            r'(?:name|full name):\s*([A-Z][a-z]+\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)',
-            r'^([A-Z][a-z]+\s+[A-Z][a-z]+)(?:\s*\n|\s*email|\s*phone|\s*linkedin)',
-        ]
-        for pattern in name_patterns:
-            match = re.search(pattern, text, re.MULTILINE | re.IGNORECASE)
-            if match:
-                return match.group(1)
         return ""
-    def extract_research_summary(self, sentences: List[str]) -> str:
-        """Extract research paper summary"""
-        # Look for abstract or introduction
-        for sentence in sentences[:5]:
-            if any(word in sentence.lower() for word in ['abstract', 'study', 'research', 'paper']):
-                return sentence[:200] + ('...' if len(sentence) > 200 else '')
-        return "Research document with academic content."
-    def extract_business_summary(self, sentences: List[str]) -> str:
-        """Extract business document summary"""
-        for sentence in sentences[:3]:
-            if any(word in sentence.lower() for word in ['company', 'business', 'organization']):
-                return sentence[:200] + ('...' if len(sentence) > 200 else '')
-        return "Business document with organizational information."
-    def extract_general_summary(self, sentences: List[str]) -> str:
-        """Extract general document summary"""
-        return sentences[0][:200] + ('...' if len(sentences[0]) > 200 else '') if sentences else "General document."
-    def extract_text_from_file(self, file_path: str) -> str:
-        """Enhanced text extraction"""
-        try:
-            file_extension = os.path.splitext(file_path)[1].lower()
-            if file_extension == '.pdf':
-                return self.extract_from_pdf(file_path)
-            elif file_extension == '.docx':
-                return self.extract_from_docx(file_path)
-            elif file_extension == '.txt':
-                return self.extract_from_txt(file_path)
-            else:
-                return f"Unsupported file format: {file_extension}"
-        except Exception as e:
-            return f"Error reading file: {str(e)}"
-    def extract_from_pdf(self, file_path: str) -> str:
-        """Enhanced PDF extraction"""
-        text = ""
-        try:
-            with open(file_path, 'rb') as file:
-                pdf_reader = PyPDF2.PdfReader(file)
-                for page in pdf_reader.pages:
-                    page_text = page.extract_text()
-                    if page_text.strip():
-                        # Better text cleaning
-                        page_text = re.sub(r'\s+', ' ', page_text)
-                        page_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', page_text)  # Fix merged words
-                        text += f"{page_text}\n"
-        except Exception as e:
-            text = f"Error reading PDF: {str(e)}"
-        return text.strip()
-    def extract_from_docx(self, file_path: str) -> str:
-        """Enhanced DOCX extraction"""
-        try:
-            doc = docx.Document(file_path)
             text = ""
-            for paragraph in doc.paragraphs:
-                if paragraph.text.strip():
-                    text += paragraph.text.strip() + "\n"
-            return text.strip()
-        except Exception as e:
-            return f"Error reading DOCX: {str(e)}"
-    def extract_from_txt(self, file_path: str) -> str:
-        """Enhanced TXT extraction"""
-        encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
-        for encoding in encodings:
-            try:
-                with open(file_path, 'r', encoding=encoding) as file:
-                    return file.read().strip()
-            except UnicodeDecodeError:
-                continue
-            except Exception as e:
-                return f"Error reading TXT: {str(e)}"
-        return "Error: Could not decode file"
-    def enhanced_chunk_text(self, text: str, max_chunk_size: int = 300, overlap: int = 50) -> list[str]:
-        """
-        Splits text into smaller overlapping chunks for better semantic search.
-        Args:
-            text (str): The full text to chunk.
-            max_chunk_size (int): Maximum tokens/words per chunk.
-            overlap (int): Number of words overlapping between consecutive chunks.
-        Returns:
-            list[str]: List of text chunks.
-        """
-        import re
-        # Clean and normalize whitespace
-        text = re.sub(r'\s+', ' ', text).strip()
-        words = text.split()
-        chunks = []
-        start = 0
-        text_len = len(words)
-        while start < text_len:
-            end = min(start + max_chunk_size, text_len)
-            chunk_words = words[start:end]
-            chunk = ' '.join(chunk_words)
-            chunks.append(chunk)
-            # Move start forward by chunk size minus overlap to create overlap
-            start += max_chunk_size - overlap
-        return chunks
-    def process_documents(self, files) -> str:
-        """Enhanced document processing"""
-        if not files:
-            return "❌ No files uploaded!"
-        try:
-            all_text = ""
-            processed_files = []
-            for file in files:
-                if file is None:
-                    continue
-                file_text = self.extract_text_from_file(file.name)
-                if not file_text.startswith("Error") and not file_text.startswith("Unsupported"):
-                    all_text += f"\n{file_text}"
-                    processed_files.append(os.path.basename(file.name))
-                else:
-                    return f"❌ {file_text}"
-            if not all_text.strip():
-                return "❌ No text extracted from files!"
-            # Store and analyze
-            self.raw_text = all_text
-            self.document_type = self.detect_document_type(all_text)
-            self.document_summary = self.create_document_summary(all_text)
-            # Enhanced chunking
-            chunk_data = self.enhanced_chunk_text(all_text)
-            if not chunk_data:
-                return "❌ No valid text chunks created!"
-            self.documents = [chunk['text'] for chunk in chunk_data]
-            self.document_metadata = chunk_data
-            # Create embeddings
-            print(f"📄 Creating embeddings for {len(self.documents)} chunks...")
-            embeddings = self.embedder.encode(self.documents, show_progress_bar=False)
-            # Build FAISS index
-            dimension = embeddings.shape[1]
-            self.index = faiss.IndexFlatIP(dimension)
-            # Normalize for cosine similarity
-            faiss.normalize_L2(embeddings)
-            self.index.add(embeddings.astype('float32'))
-            self.is_indexed = True
-            return f"✅ Successfully processed {len(processed_files)} files:\n" + \
-                   f"📄 Files: {', '.join(processed_files)}\n" + \
-                   f"📊 Document Type: {self.document_type.title()}\n" + \
-                   f"🔍 Created {len(self.documents)} chunks\n" + \
-                   f"📝 Summary: {self.document_summary}\n" + \
-                   f"🚀 Ready for Q&A!"
-        except Exception as e:
-            return f"❌ Error processing documents: {str(e)}"
-    def find_relevant_content(self, query: str, k: int = 3) -> str:
-        """Improved content retrieval with stricter relevance filter"""
-        if not self.is_indexed:
-            return ""
-        try:
-            # Semantic search
-            query_embedding = self.embedder.encode([query])
-            faiss.normalize_L2(query_embedding)
-            scores, indices = self.index.search(query_embedding.astype('float32'), min(k, len(self.documents)))
-            relevant_chunks = []
-            for i, idx in enumerate(indices[0]):
-                score = scores[0][i]
-                if idx < len(self.documents) and score > 0.4:  # ✅ stricter similarity filter
-                    relevant_chunks.append(self.documents[idx])
-            return ' '.join(relevant_chunks)
-        except Exception as e:
-            print(f"Error in content retrieval: {e}")
-            return ""
-    def answer_question(self, query: str) -> str:
-        """Enhanced question answering with better model usage and hallucination reduction."""
-        if not query.strip():
-            return "❓ Please ask a question!"
-        if not self.is_indexed:
-            return "📁 Please upload and process documents first!"
-        try:
-            query_lower = query.lower()
-            # Handle summary requests explicitly
-            if any(word in query_lower for word in ['summary', 'summarize', 'about', 'overview']):
-                return f"📄 **Document Summary:**\n\n{self.document_summary}"
-            # Retrieve relevant content chunks via semantic search
-            context = self.find_relevant_content(query, k=3)
-            if not context:
-                return "🔍 No relevant information found. Try rephrasing your question."
-            # If no QA pipeline, fall back to direct extraction
-            if self.qa_pipeline is None:
-                return self.extract_direct_answer(query, context)
-            try:
-                if self.model_type in ["distilbert-qa", "fallback"]:
-                    # Use extractive Q&A pipeline
-                    result = self.qa_pipeline(question=query, context=context)
-                    answer = result.get('answer', '').strip()
-                    confidence = result.get('score', 0)
-                    if confidence > 0.1 and answer:
-                        return f"**Answer:** {answer}\n\n**Context:** {context[:200]}..."
-                    else:
-                        return self.extract_direct_answer(query, context)
-                elif self.model_type == "flan-t5":
-                    # Use generative model with improved prompt to reduce hallucination
-                    prompt = (
-                        f"Answer concisely and strictly based on the following context.\n\n"
-                        f"Context:\n{context}\n\n"
-                        f"Question:\n{query}\n\n"
-                        f"If the answer is not contained in the context, reply with 'Not found in document.'\n"
-                        f"Answer:"
-                    )
-                    result = self.qa_pipeline(prompt, max_length=256, num_return_sequences=1)
-                    generated_text = result[0].get('generated_text', '')
-                    answer = generated_text.replace(prompt, '').strip()
-                    if answer.lower() in ["not found in document.", "no answer", "unknown", ""]:
-                        return "🔍 Sorry, the answer was not found in the documents."
-                    else:
-                        return f"**Answer:** {answer}"
-                else:
-                    # Default fallback extraction
-                    return self.extract_direct_answer(query, context)
-            except Exception as e:
-                print(f"Model inference error: {e}")
-                return self.extract_direct_answer(query, context)
-        except Exception as e:
-            return f"❌ Error processing question: {str(e)}"
     def extract_direct_answer(self, query: str, context: str) -> str:
-        """Direct answer extraction as fallback"""
-        query_lower = query.lower()
-        # Name extraction
-        if any(word in query_lower for word in ['name', 'who is', 'who']):
             names = re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', context)
             if names:
                 return f"**Name:** {names[0]}"
-        # Experience extraction
-        if any(word in query_lower for word in ['experience', 'years']):
-            exp_matches = re.findall(r'(\d+)[\+\-\s]*(?:years?|yrs?)', context.lower())
-            if exp_matches:
-                return f"**Experience:** {exp_matches[0]} years"
-        # Skills extraction
-        if any(word in query_lower for word in ['skill', 'technology', 'tech']):
-            # Common tech skills
-            tech_patterns = [
-                r'\b(?:Python|Java|JavaScript|React|Node|SQL|AWS|Docker|Kubernetes|Git)\b',
-                r'\b(?:HTML|CSS|Angular|Vue|Spring|Django|Flask|MongoDB|PostgreSQL)\b'
-            ]
-            skills = []
-            for pattern in tech_patterns:
-                skills.extend(re.findall(pattern, context, re.IGNORECASE))
             if skills:
-                return f"**Skills mentioned:** {', '.join(set(skills))}"
-        # Education extraction
-        if any(word in query_lower for word in ['education', 'degree', 'university']):
-            edu_matches = re.findall(r'(?:Bachelor|Master|PhD|B\.?S\.?|M\.?S\.?|B\.?A\.?|M\.?A\.?).*?(?:in|of)\s+([^.]+)', context)
-            if edu_matches:
-                return f"**Education:** {edu_matches[0]}"
-        # Return first relevant sentence
         sentences = [s.strip() for s in context.split('.') if s.strip()]
         if sentences:
             return f"**Answer:** {sentences[0]}"
-        return "I found relevant content but couldn't extract a specific answer."
-    def clean_text(self, text: str) -> str:
-        """
-        Clean and normalize raw text by:
-        - Removing excessive whitespace
-        - Fixing merged words (camel case separation)
-        - Removing unwanted characters (optional)
-        - Lowercasing or preserving case (optional)
-        """
-        import re
-        # Replace multiple whitespace/newlines/tabs with single space
-        text = re.sub(r'\s+', ' ', text).strip()
-        # Fix merged words like 'wordAnotherWord' -> 'word Another Word'
-        text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
-        # Optional: remove special characters except basic punctuation
-        # text = re.sub(r'[^a-zA-Z0-9,.!?;:\'\"()\-\s]', '', text)
-        return text
-# Initialize the system
-print("Initializing Enhanced Smart RAG System...")
-rag_system = SmartDocumentRAG()
-# Create the interface
-def create_interface():
     with gr.Blocks(title="🧠 Enhanced Document Q&A", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
         # 🧠 Enhanced Document Q&A System
-        **Optimized with Better Models & Quantization!**
-        **Features:**
-        - 🎯 Flan-T5 or DistilBERT for accurate Q&A
-        - ⚡ 4-bit quantization for GPU efficiency
-        - 📊 Direct answer extraction
-        - 🔍 Enhanced semantic search
         """)
         with gr.Tab("📤 Upload & Process"):
             with gr.Row():
                 with gr.Column():
-                    file_upload = gr.File(
-                        label="📁 Upload Documents",
-                        file_count="multiple",
-                        file_types=[".pdf", ".docx", ".txt"],
-                        height=150
-                    )
                     process_btn = gr.Button("🔄 Process Documents", variant="primary", size="lg")
                 with gr.Column():
-                    process_status = gr.Textbox(
-                        label="📋 Processing Status",
-                        lines=10,
-                        interactive=False
-                    )
-            process_btn.click(
-                fn=rag_system.process_documents,
-                inputs=[file_upload],
-                outputs=[process_status]
-            )
         with gr.Tab("❓ Q&A"):
             with gr.Row():
                 with gr.Column():
-                    question_input = gr.Textbox(
-                        label="🤔 Ask Your Question",
-                        placeholder="What is the person's name? / How many years of experience? / What skills do they have?",
-                        lines=3
-                    )
                     with gr.Row():
                         ask_btn = gr.Button("🧠 Get Answer", variant="primary")
                         summary_btn = gr.Button("📊 Get Summary", variant="secondary")
                 with gr.Column():
-                    answer_output = gr.Textbox(
-                        label="💡 Answer",
-                        lines=8,
-                        interactive=False
-                    )
-            ask_btn.click(
-                fn=rag_system.answer_question,
-                inputs=[question_input],
-                outputs=[answer_output]
-            )
-            summary_btn.click(
-                fn=lambda: rag_system.answer_question("summary"),
-                inputs=[],
-                outputs=[answer_output]
-            )
-    return demo
-# Launch the app
 if __name__ == "__main__":
-    demo = create_interface()
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=True
-    )

+import re
+import os
 import faiss
 import numpy as np
+import gradio as gr
+from typing import List
+from sentence_transformers import SentenceTransformer
+from transformers import pipeline
+from PyPDF2 import PdfReader
+import docx2txt
+# === Helper functions ===
+def clean_text(text: str) -> str:
+    """Collapse every run of whitespace into one space and trim both ends."""
+    return re.sub(r'\s+', ' ', text).strip()
+def chunk_text(text: str, max_chunk_size: int = 300, overlap: int = 50) -> List[str]:
+    """Split text into sentence-aligned chunks with a character overlap.
+
+    Sentences (split after '.', '?' or '!') are packed greedily into chunks of
+    at most ``max_chunk_size`` characters. A single sentence longer than the
+    limit is kept whole as its own chunk. Every chunk after the first is then
+    prefixed with the last ``overlap`` characters of the preceding chunk.
+
+    Args:
+        text: Text to split.
+        max_chunk_size: Character budget per chunk, measured before overlap.
+        overlap: Number of trailing characters of the previous chunk to
+            prepend; zero or a negative value disables the overlap.
+
+    Returns:
+        Whitespace-normalized, non-empty chunks; an empty list for blank text.
+    """
+    sentences = re.split(r'(?<=[.?!])\s+', text)
+    chunks = []
+    chunk = ""
+    for sentence in sentences:
+        if len(chunk) + len(sentence) <= max_chunk_size:
+            chunk += sentence + " "
+        else:
+            # Flush only real content: when the very first sentence is already
+            # over budget the buffer is still empty and must not become a chunk.
+            if chunk.strip():
+                chunks.append(chunk.strip())
+            chunk = sentence + " "
+    if chunk.strip():
+        chunks.append(chunk.strip())
+    # Add overlapping between chunks to retain context
+    overlapped_chunks = []
+    for i, combined in enumerate(chunks):
+        # `s[-0:]` is the whole string, so a zero overlap must skip the slice
+        # entirely instead of duplicating the previous chunk.
+        if i > 0 and overlap > 0:
+            combined = chunks[i - 1][-overlap:] + " " + combined
+        overlapped_chunks.append(clean_text(combined))
+    return overlapped_chunks
+def extract_text_from_pdf(file_path: str) -> str:
+    """Extract text from a PDF file, page by page.
+
+    Best effort: if reading fails part-way, the error is printed and the text
+    gathered from the pages read so far is still returned.
+
+    Args:
+        file_path: Path of the PDF on disk.
+
+    Returns:
+        The whitespace-normalized text of all readable pages ("" if none).
+    """
+    page_texts = []
+    try:
+        reader = PdfReader(file_path)
+        for page in reader.pages:
+            # Treat a page without extractable text as empty, in case the PDF
+            # library hands back None for it; `None + " "` would otherwise
+            # raise and drop every following page.
+            page_texts.append(page.extract_text() or "")
+    except Exception as e:
+        print(f"Error reading PDF {file_path}: {e}")
+    return clean_text(" ".join(page_texts))
+def extract_text_from_docx(file_path: str) -> str:
+    """Return the whitespace-normalized text of a DOCX file.
+
+    Any failure is printed and reported to the caller as an empty string.
+    """
+    try:
+        return clean_text(docx2txt.process(file_path))
+    except Exception as err:
+        # Best effort: log the problem and fall through to the empty result.
+        print(f"Error reading DOCX {file_path}: {err}")
         return ""
+def extract_text_from_txt(file_path: str) -> str:
+    """Extract text from a TXT file, tolerating non-UTF-8 encodings.
+
+    The file is read once as bytes and decoded with the first encoding that
+    succeeds: UTF-8 (a leading BOM is dropped), then cp1252, then latin-1.
+
+    Args:
+        file_path: Path of the text file on disk.
+
+    Returns:
+        The whitespace-normalized file content, or "" if the file cannot be
+        read (the error is printed).
+    """
+    try:
+        with open(file_path, 'rb') as f:
+            raw = f.read()
+    except Exception as e:
+        print(f"Error reading TXT {file_path}: {e}")
+        return ""
+    # latin-1 maps every byte value, so the loop always ends with a result;
+    # the stricter encodings are tried first because they decode more faithfully.
+    for encoding in ('utf-8-sig', 'cp1252', 'latin-1'):
+        try:
+            return clean_text(raw.decode(encoding))
+        except UnicodeDecodeError:
+            continue
+    return ""
+# === Main RAG System ===
+class SmartDocumentRAG:
+    def __init__(self):
+        """Load the embedding and question-answering models; start un-indexed."""
+        # Models (loaded in this order): sentence embedder, then QA pipeline.
+        self.embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+        self.qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
+        # Corpus state, filled in by process_documents().
+        self.documents, self.chunks = [], []
+        self.document_summary = ""
+        # Search state: there is no index until documents have been processed.
+        self.index, self.is_indexed = None, False
+    def process_documents(self, uploaded_files) -> str:
+        """Extract, chunk, embed, and index the uploaded documents.
+
+        Everything indexed by an earlier call is discarded first, so a run
+        that finds no text never leaves the old index or summary in place.
+
+        Args:
+            uploaded_files: Iterable of uploads. Each item is either an object
+                whose ``name`` attribute holds the file path or a plain path
+                string. ``None`` entries and unsupported extensions are
+                skipped.
+
+        Returns:
+            A status message describing the outcome.
+        """
+        if not uploaded_files:
+            return "⚠️ No files uploaded."
+        # Reset all state up front (not only the lists): otherwise a failed
+        # re-upload would keep serving the previous index and summary.
+        self.documents.clear()
+        self.chunks.clear()
+        self.index = None
+        self.is_indexed = False
+        self.document_summary = ""
+        all_text = ""
+        # Extract text from each uploaded file
+        for file_obj in uploaded_files:
+            if file_obj is None:
+                continue
+            # The upload already lives on disk; only its path is needed.
+            file_path = getattr(file_obj, "name", file_obj)
+            ext = os.path.splitext(file_path)[1].lower()
             text = ""
+            if ext == ".pdf":
+                text = extract_text_from_pdf(file_path)
+            elif ext == ".docx":
+                text = extract_text_from_docx(file_path)
+            elif ext == ".txt":
+                text = extract_text_from_txt(file_path)
+            else:
+                continue  # skip unsupported
+            if text:
+                self.documents.append(text)
+                all_text += text + " "
+        if not all_text.strip():
+            return "⚠️ No extractable text found in uploaded files."
+        # Create chunks for semantic search (empty chunks carry no signal)
+        self.chunks = [chunk for chunk in chunk_text(all_text) if chunk]
+        # Create unit-length embeddings so inner product equals cosine similarity
+        embeddings = self.embedder.encode(self.chunks, convert_to_numpy=True)
+        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
+        embeddings = embeddings / np.maximum(norms, 1e-12)  # floor avoids 0/0 -> NaN
+        # Create FAISS index
+        dim = embeddings.shape[1]
+        self.index = faiss.IndexFlatIP(dim)
+        self.index.add(embeddings.astype('float32'))
+        self.is_indexed = True
+        # Create simple summary
+        self.document_summary = self.generate_summary(all_text)
+        return f"✅ Processed {len(self.documents)} document(s), {len(self.chunks)} chunks indexed."
+    def generate_summary(self, text: str) -> str:
+        """Generate a simple summary using top sentences."""
+        sentences = re.split(r'(?<=[.?!])\s+', text)
+        summary = ' '.join(sentences[:5])  # first 5 sentences as naive summary
+        return summary
+    def find_relevant_content(self, query: str, top_k: int = 3) -> str:
+        """Perform semantic search to find relevant content chunks."""
+        if not self.is_indexed or not self.chunks:
+            return ""
+        query_emb = self.embedder.encode([query], convert_to_numpy=True)
+        query_emb = query_emb / np.linalg.norm(query_emb, axis=1, keepdims=True)
+        scores, indices = self.index.search(query_emb.astype('float32'), min(top_k, len(self.chunks)))
+        relevant_chunks = []
+        for i, idx in enumerate(indices[0]):
+            if scores[0][i] > 0.1:
+                relevant_chunks.append(self.chunks[idx])
+        return " ".join(relevant_chunks)
     def extract_direct_answer(self, query: str, context: str) -> str:
+        """Simple regex-based fallback extraction."""
+        q = query.lower()
+        if any(word in q for word in ['name', 'who is', 'who']):
             names = re.findall(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', context)
             if names:
                 return f"**Name:** {names[0]}"
+        if any(word in q for word in ['experience', 'years']):
+            years = re.findall(r'(\d+)[\+\-\s]*(?:years?|yrs?)', context.lower())
+            if years:
+                return f"**Experience:** {years[0]} years"
+        if any(word in q for word in ['skill', 'technology', 'tech']):
+            skills = re.findall(r'\b(?:Python|Java|JavaScript|React|Node|SQL|AWS|Docker|Kubernetes|Git|HTML|CSS|Angular|Vue|Spring|Django|Flask|MongoDB|PostgreSQL)\b', context, re.I)
             if skills:
+                unique_skills = sorted(set(skills), key=skills.index)
+                return f"**Skills:** {', '.join(unique_skills)}"
+        if any(word in q for word in ['education', 'degree', 'university']):
+            edu = re.findall(r'(?:Bachelor|Master|PhD|B\.?S\.?|M\.?S\.?|B\.?A\.?|M\.?A\.?).*?(?:in|of)\s+([^.]+)', context, re.I)
+            if edu:
+                return f"**Education:** {edu[0]}"
+        # Fallback: first sentence from context
         sentences = [s.strip() for s in context.split('.') if s.strip()]
         if sentences:
             return f"**Answer:** {sentences[0]}"
+        return "I found relevant content but could not extract a specific answer."
+    def answer_question(self, query: str) -> str:
+        if not query.strip():
+            return "❓ Please ask a question."
+        if not self.is_indexed:
+            return "📁 Please upload and process documents first."
+        q_lower = query.lower()
+        if any(word in q_lower for word in ['summary', 'summarize', 'overview', 'about']):
+            return f"📄 **Document Summary:**\n\n{self.document_summary}"
+        context = self.find_relevant_content(query, top_k=3)
+        if not context:
+            return "🔍 No relevant information found. Try rephrasing your question."
+        try:
+            # Use model for QA
+            result = self.qa_pipeline(question=query, context=context)
+            answer = result.get('answer', '').strip()
+            score = result.get('score', 0)
+            # Confidence threshold to fallback to regex extraction
+            if score < 0.1 or not answer:
+                return self.extract_direct_answer(query, context)
+            return f"**Answer:** {answer}\n\n**Context:** {context[:200]}..."
+        except Exception as e:
+            print(f"QA model error: {e}")
+            return self.extract_direct_answer(query, context)
+# === Gradio UI ===
def main():
    """Build the two-tab Gradio interface around one RAG instance and launch it.

    The "Upload & Process" tab feeds uploaded files to
    ``SmartDocumentRAG.process_documents``; the "Q&A" tab sends questions (or
    a fixed "summary" request) to ``SmartDocumentRAG.answer_question``.
    """
    assistant = SmartDocumentRAG()

    # Named wrappers around the shared instance, used as event callbacks.
    def process_files(files):
        return assistant.process_documents(files)

    def ask_question(question):
        return assistant.answer_question(question)

    def get_summary():
        # The literal "summary" triggers the summary branch of answer_question.
        return assistant.answer_question("summary")

    with gr.Blocks(title="🧠 Enhanced Document Q&A", theme=gr.themes.Soft()) as ui:
        gr.Markdown("""
        # 🧠 Enhanced Document Q&A System
        **Optimized with Better Models & Semantic Search**
        - Upload PDF, DOCX, TXT files
        - Semantic search + QA pipeline
        - Direct answer extraction fallback
        """)

        # --- Tab 1: upload files and build the index ---
        with gr.Tab("📤 Upload & Process"):
            with gr.Row():
                with gr.Column():
                    file_upload = gr.File(
                        label="📁 Upload Documents",
                        file_types=['.pdf', '.docx', '.txt'],
                        file_count="multiple",
                        height=150,
                    )
                    process_btn = gr.Button(
                        "🔄 Process Documents",
                        variant="primary",
                        size="lg",
                    )
                with gr.Column():
                    process_status = gr.Textbox(
                        label="📋 Processing Status",
                        lines=10,
                        interactive=False,
                    )
            process_btn.click(
                fn=process_files,
                inputs=file_upload,
                outputs=process_status,
            )

        # --- Tab 2: ask questions or request the summary ---
        with gr.Tab("❓ Q&A"):
            with gr.Row():
                with gr.Column():
                    question_input = gr.Textbox(
                        label="🤔 Ask Your Question",
                        lines=3,
                        placeholder="Name? Experience? Skills? Education?",
                    )
                    with gr.Row():
                        ask_btn = gr.Button("🧠 Get Answer", variant="primary")
                        summary_btn = gr.Button("📊 Get Summary", variant="secondary")
                with gr.Column():
                    answer_output = gr.Textbox(
                        label="💡 Answer",
                        lines=8,
                        interactive=False,
                    )
            ask_btn.click(
                fn=ask_question,
                inputs=question_input,
                outputs=answer_output,
            )
            summary_btn.click(
                fn=get_summary,
                inputs=None,
                outputs=answer_output,
            )

    ui.launch(server_name="0.0.0.0", server_port=7860, share=True)
# Script entry point: build and launch the UI only when this file is run
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()