Spaces:

SyedZainAliShah
/

Enhanced_RAG_Based_Chatbot

Sleeping

App Files Files Community

SyedZainAliShah committed on Jan 10

Commit

e9eb5ef

verified ·

1 Parent(s): acf338b

Create app.py

Browse files

Files changed (1) hide show

app.py +315 -0

app.py ADDED Viewed

	@@ -0,0 +1,315 @@

+import gradio as gr
+import os
+from groq import Groq
+import PyPDF2
+from sentence_transformers import SentenceTransformer
+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
+import json
+from datetime import datetime
+import docx
+# Initialize Groq client
+client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
+# Initialize sentence transformer model for embeddings
+embedder = SentenceTransformer('all-MiniLM-L6-v2')
+# Global storage for documents and conversation history
+document_store = {
+    'chunks': [],
+    'embeddings': [],
+    'metadata': [],
+    'conversation_history': []
+}
+def extract_text_from_pdf(pdf_file):
+    """Extract text from PDF file"""
+    try:
+        pdf_reader = PyPDF2.PdfReader(pdf_file)
+        text_data = []
+        for page_num, page in enumerate(pdf_reader.pages):
+            text = page.extract_text()
+            text_data.append({
+                'text': text,
+                'page': page_num + 1,
+                'filename': os.path.basename(pdf_file.name)
+            })
+        return text_data
+    except Exception as e:
+        return [{'text': f"Error reading PDF: {str(e)}", 'page': 0, 'filename': pdf_file.name}]
+def extract_text_from_docx(docx_file):
+    """Extract text from DOCX file (Enhancement 5)"""
+    try:
+        doc = docx.Document(docx_file)
+        text = '\n'.join([paragraph.text for paragraph in doc.paragraphs])
+        return [{
+            'text': text,
+            'page': 1,
+            'filename': os.path.basename(docx_file.name)
+        }]
+    except Exception as e:
+        return [{'text': f"Error reading DOCX: {str(e)}", 'page': 0, 'filename': docx_file.name}]
+def chunk_text(text_data, chunk_size=500, overlap=50):
+    """Split text into semantic chunks with overlap (Enhancement 6)"""
+    chunks = []
+    metadata = []
+    for data in text_data:
+        text = data['text']
+        words = text.split()
+        for i in range(0, len(words), chunk_size - overlap):
+            chunk = ' '.join(words[i:i + chunk_size])
+            if len(chunk.strip()) > 50:  # Only keep meaningful chunks
+                chunks.append(chunk)
+                metadata.append({
+                    'page': data['page'],
+                    'filename': data['filename'],
+                    'chunk_id': len(chunks)
+                })
+    return chunks, metadata
+def create_embeddings(chunks):
+    """Create embeddings using sentence-transformers (Enhancement 1)"""
+    embeddings = embedder.encode(chunks)
+    return embeddings
+def process_files(files):
+    """Process uploaded files and create vector store"""
+    global document_store
+    if not files:
+        return "❌ Please upload at least one file."
+    document_store = {
+        'chunks': [],
+        'embeddings': [],
+        'metadata': [],
+        'conversation_history': []
+    }
+    all_text_data = []
+    file_summaries = []
+    for file in files:
+        file_ext = os.path.splitext(file.name)[1].lower()
+        if file_ext == '.pdf':
+            text_data = extract_text_from_pdf(file)
+        elif file_ext == '.docx':
+            text_data = extract_text_from_docx(file)
+        else:
+            continue
+        all_text_data.extend(text_data)
+        # Generate file summary (Enhancement 2)
+        total_text = ' '.join([d['text'] for d in text_data])
+        file_summaries.append(f"📄 **{os.path.basename(file.name)}** - {len(text_data)} pages, {len(total_text)} characters")
+    # Chunk and embed
+    chunks, metadata = chunk_text(all_text_data)
+    embeddings = create_embeddings(chunks)
+    document_store['chunks'] = chunks
+    document_store['embeddings'] = embeddings
+    document_store['metadata'] = metadata
+    summary = f"✅ **Processed {len(files)} file(s)**\n\n" + "\n".join(file_summaries)
+    summary += f"\n\n📊 Created {len(chunks)} text chunks for retrieval."
+    return summary
+def retrieve_relevant_chunks(query, top_k=3):
+    """Retrieve most relevant chunks using cosine similarity"""
+    if not document_store['chunks']:
+        return [], []
+    query_embedding = embedder.encode([query])
+    similarities = cosine_similarity(query_embedding, document_store['embeddings'])[0]
+    top_indices = np.argsort(similarities)[-top_k:][::-1]
+    relevant_chunks = [document_store['chunks'][i] for i in top_indices]
+    relevant_metadata = [document_store['metadata'][i] for i in top_indices]
+    return relevant_chunks, relevant_metadata
+def generate_answer(query, history):
+    """Generate answer using Groq LLM with RAG (Enhancement 3 - Conversational Memory)"""
+    if not document_store['chunks']:
+        return "⚠️ Please upload and process documents first."
+    # Retrieve relevant context
+    relevant_chunks, metadata = retrieve_relevant_chunks(query, top_k=3)
+    if not relevant_chunks:
+        return "❌ No relevant information found in the documents."
+    # Build context with source references (Enhancement 4)
+    context = "\n\n".join([
+        f"[Source: {meta['filename']}, Page {meta['page']}]\n{chunk}"
+        for chunk, meta in zip(relevant_chunks, metadata)
+    ])
+    # Build conversation history for context
+    history_context = ""
+    if history:
+        history_context = "\n".join([
+            f"User: {h[0]}\nAssistant: {h[1]}"
+            for h in history[-3:]  # Last 3 exchanges
+        ])
+    # Create prompt
+    prompt = f"""You are a helpful assistant that answers questions based on the provided document context.
+Previous conversation:
+{history_context}
+Context from documents:
+{context}
+Current question: {query}
+Instructions:
+- Answer based strictly on the provided context
+- If the answer isn't in the context, say so
+- Be concise and accurate
+- Reference specific sources when relevant
+Answer:"""
+    try:
+        # Call Groq API
+        chat_completion = client.chat.completions.create(
+            messages=[
+                {
+                    "role": "user",
+                    "content": prompt,
+                }
+            ],
+            model="llama3-8b-8192",
+            temperature=0.3,
+            max_tokens=1024,
+        )
+        answer = chat_completion.choices[0].message.content
+        # Add source references to answer (Enhancement 4)
+        sources = "\n\n📚 **Sources:**\n" + "\n".join([
+            f"- {meta['filename']} (Page {meta['page']})"
+            for meta in metadata
+        ])
+        full_answer = answer + sources
+        # Log query (Enhancement 8)
+        document_store['conversation_history'].append({
+            'timestamp': datetime.now().isoformat(),
+            'query': query,
+            'answer': answer,
+            'sources': [f"{m['filename']}_p{m['page']}" for m in metadata]
+        })
+        return full_answer
+    except Exception as e:
+        return f"❌ Error generating answer: {str(e)}"
+def download_chat_history():
+    """Download conversation history as JSON (Enhancement 7)"""
+    if not document_store['conversation_history']:
+        return None
+    history_file = "chat_history.json"
+    with open(history_file, 'w') as f:
+        json.dump(document_store['conversation_history'], f, indent=2)
+    return history_file
+def clear_history():
+    """Clear conversation history"""
+    document_store['conversation_history'] = []
+    return None, "🗑️ History cleared!"
+# Build Gradio Interface
+with gr.Blocks(title="Enhanced RAG Chatbot", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("""
+    # 🤖 Enhanced RAG-Based Chatbot
+    Upload PDF/DOCX files and ask questions about their content!
+    **Features:**
+    - ✅ Multiple file support (PDF & DOCX)
+    - ✅ Semantic embeddings with sentence-transformers
+    - ✅ Document preview & summaries
+    - ✅ Conversational memory
+    - ✅ Source references with page numbers
+    - ✅ Download chat history
+    """)
+    with gr.Row():
+        with gr.Column(scale=1):
+            file_upload = gr.File(
+                label="Upload Documents (PDF/DOCX)",
+                file_count="multiple",
+                file_types=[".pdf", ".docx"]
+            )
+            process_btn = gr.Button("📂 Process Documents", variant="primary")
+            process_output = gr.Markdown(label="Processing Status")
+            gr.Markdown("### 💾 Chat History")
+            download_btn = gr.Button("⬇️ Download History")
+            download_file = gr.File(label="Download")
+            clear_btn = gr.Button("🗑️ Clear History")
+            clear_msg = gr.Textbox(label="Status", interactive=False)
+        with gr.Column(scale=2):
+            chatbot = gr.Chatbot(label="Conversation", height=500)
+            query_input = gr.Textbox(
+                label="Ask a question",
+                placeholder="Type your question here...",
+                lines=2
+            )
+            submit_btn = gr.Button("🚀 Ask", variant="primary")
+    # Event handlers
+    process_btn.click(
+        fn=process_files,
+        inputs=[file_upload],
+        outputs=[process_output]
+    )
+    submit_btn.click(
+        fn=generate_answer,
+        inputs=[query_input, chatbot],
+        outputs=[chatbot]
+    ).then(
+        lambda q, h: (h + [[q, generate_answer(q, h)]], ""),
+        inputs=[query_input, chatbot],
+        outputs=[chatbot, query_input]
+    )
+    download_btn.click(
+        fn=download_chat_history,
+        outputs=[download_file]
+    )
+    clear_btn.click(
+        fn=clear_history,
+        outputs=[chatbot, clear_msg]
+    )
+    gr.Markdown("""
+    ---
+    ### 📖 How RAG Works:
+    1. **Retrieval**: Finds relevant text chunks from uploaded documents using semantic similarity
+    2. **Augmentation**: Combines retrieved context with your question
+    3. **Generation**: Uses Groq LLM to generate accurate answers based on the context
+    """)
+if __name__ == "__main__":
+    demo.launch()