Spaces:

SyedZainAliShah
/

Enhanced_RAG_Based_Chatbot

Sleeping

App Files Community

SyedZainAliShah committed on Jan 10

Commit

bc268dd

verified ·

1 Parent(s): dfcf54f

Update app.py

Browse files

Files changed (1) hide show

app.py +105 -180

app.py CHANGED Viewed

@@ -16,7 +16,6 @@ try:
     if not api_key:
         print("WARNING: GROQ_API_KEY not found in environment variables")
     else:
-        # Initialize without proxies parameter
         import httpx
         client = Groq(
             api_key=api_key,
@@ -47,7 +46,6 @@ document_store = {
 def extract_text_from_pdf(pdf_file):
     """Extract text from PDF file"""
     try:
-        # Handle both file path (string) and file object
         if isinstance(pdf_file, str):
             pdf_reader = PyPDF2.PdfReader(pdf_file)
             filename = os.path.basename(pdf_file)
@@ -58,7 +56,7 @@ def extract_text_from_pdf(pdf_file):
         text_data = []
         for page_num, page in enumerate(pdf_reader.pages):
             text = page.extract_text()
-            if text and text.strip():  # Only add non-empty pages
                 text_data.append({
                     'text': text,
                     'page': page_num + 1,
@@ -73,7 +71,6 @@ def extract_text_from_pdf(pdf_file):
 def extract_text_from_docx(docx_file):
     """Extract text from DOCX file (Enhancement 5)"""
     try:
-        # Handle both file path and file object
         if isinstance(docx_file, str):
             doc = docx.Document(docx_file)
             filename = os.path.basename(docx_file)
@@ -103,7 +100,7 @@ def chunk_text(text_data, chunk_size=500, overlap=50):
         for i in range(0, len(words), chunk_size - overlap):
             chunk = ' '.join(words[i:i + chunk_size])
-            if len(chunk.strip()) > 50:  # Only keep meaningful chunks
                 chunks.append(chunk)
                 metadata.append({
                     'page': data['page'],
@@ -139,7 +136,6 @@ def process_files(files):
         file_summaries = []
         for file in files:
-            # Get file extension
             if isinstance(file, str):
                 file_path = file
                 file_ext = os.path.splitext(file)[1].lower()
@@ -159,7 +155,6 @@ def process_files(files):
             all_text_data.extend(text_data)
-            # Generate file summary (Enhancement 2)
             total_text = ' '.join([d['text'] for d in text_data if d['text']])
             filename = os.path.basename(file_path)
             file_summaries.append(f"- **{filename}**: {len(text_data)} pages, {len(total_text)} characters")
@@ -167,7 +162,6 @@ def process_files(files):
         if not all_text_data:
             return "[ERROR] No valid text extracted from uploaded files."
-        # Chunk and embed
         chunks, metadata = chunk_text(all_text_data)
         if not chunks:
@@ -208,11 +202,11 @@ def retrieve_relevant_chunks(query, top_k=3):
         print(f"Error retrieving chunks: {e}")
         return [], []
-def generate_answer(query, history):
-    """Generate answer using Groq LLM with RAG (Enhancement 3 - Conversational Memory)"""
     global client
-    # Try to reinitialize client if it's None
     if client is None:
         try:
             api_key = os.environ.get("GROQ_API_KEY")
@@ -224,77 +218,57 @@ def generate_answer(query, history):
                 )
                 print("Groq client reinitialized successfully")
             else:
-                return "[ERROR] Groq API client not initialized. Please set GROQ_API_KEY in your Space settings (Settings > Variables > Add 'GROQ_API_KEY')."
         except Exception as e:
             return f"[ERROR] Failed to initialize Groq client: {str(e)}"
     if not document_store['chunks']:
-        return "[WARNING] Please upload and process documents first using the 'Process Documents' button."
     try:
         # Retrieve relevant context
-        relevant_chunks, metadata = retrieve_relevant_chunks(query, top_k=3)
         if not relevant_chunks:
             return "[ERROR] No relevant information found in the documents."
-        # Build context with source references (Enhancement 4)
         context = "\n\n".join([
             f"[Source: {meta['filename']}, Page {meta['page']}]\n{chunk}"
             for chunk, meta in zip(relevant_chunks, metadata)
         ])
-        # Build messages array for Groq API
-        messages = []
-        # Add system message
-        system_prompt = """You are a helpful assistant that answers questions based on the provided document context.
-Instructions:
-- Answer based strictly on the provided context
-- If the answer isn't in the context, say so clearly
-- Be concise and accurate
-- Reference specific sources when relevant"""
-        messages.append({
-            "role": "system",
-            "content": system_prompt
-        })
-        # Add conversation history (last 3 exchanges for context)
-        if history and len(history) > 0:
-            # Get last 3 user messages (skip current one which isn't in history yet)
-            recent_history = history[-3:] if len(history) > 3 else history
-            for msg in recent_history:
-                # History format from Gradio Chatbot with type="messages"
-                if isinstance(msg, dict) and "role" in msg and "content" in msg:
-                    messages.append({
-                        "role": msg["role"],
-                        "content": msg["content"]
-                    })
         # Add current query with context
-        user_message = f"""Context from documents:
-{context}
-Question: {query}"""
         messages.append({
             "role": "user",
-            "content": user_message
         })
-        # Call Groq API with updated model
         chat_completion = client.chat.completions.create(
             messages=messages,
-            model="llama-3.1-8b-instant",  # Updated model
             temperature=0.3,
             max_tokens=1024,
         )
         answer = chat_completion.choices[0].message.content
-        # Add source references to answer (Enhancement 4)
         sources = "\n\n**Sources:**\n" + "\n".join([
             f"- {meta['filename']} (Page {meta['page']})"
             for meta in metadata
@@ -302,10 +276,10 @@ Question: {query}"""
         full_answer = answer + sources
-        # Log query (Enhancement 8)
         document_store['conversation_history'].append({
             'timestamp': datetime.now().isoformat(),
-            'query': query,
             'answer': answer,
             'sources': [f"{m['filename']}_p{m['page']}" for m in metadata]
         })
@@ -315,12 +289,10 @@ Question: {query}"""
     except Exception as e:
         error_msg = str(e)
         print(f"Error generating answer: {error_msg}")
-        if "api_key" in error_msg.lower() or "authentication" in error_msg.lower():
-            return "[ERROR] Invalid or missing GROQ_API_KEY. Please set it in your Space settings (Settings > Variables)."
-        return f"[ERROR] Failed to generate answer: {error_msg}"
 def download_chat_history():
-    """Download conversation history as JSON (Enhancement 7)"""
     if not document_store['conversation_history']:
         return None
@@ -328,136 +300,89 @@ def download_chat_history():
         history_file = "chat_history.json"
         with open(history_file, 'w', encoding='utf-8') as f:
             json.dump(document_store['conversation_history'], f, indent=2)
         return history_file
     except Exception as e:
         print(f"Error downloading history: {e}")
         return None
-def clear_history():
-    """Clear conversation history"""
-    document_store['conversation_history'] = []
-    return None, "History cleared successfully!"
 # Build Gradio Interface
-def create_demo():
-    with gr.Blocks(title="Enhanced RAG Chatbot") as demo:
-        gr.Markdown("""
-        # Enhanced RAG-Based Chatbot
-        Upload PDF/DOCX files and ask questions about their content!
-        **Features:**
-        - Multiple file support (PDF & DOCX)
-        - Semantic embeddings with sentence-transformers
-        - Document preview & summaries
-        - Conversational memory
-        - Source references with page numbers
-        - Download chat history
-        """)
-        with gr.Row():
-            with gr.Column(scale=1):
-                file_upload = gr.File(
-                    label="Upload Documents (PDF/DOCX)",
-                    file_count="multiple",
-                    file_types=[".pdf", ".docx"]
-                )
-                process_btn = gr.Button("Process Documents", variant="primary")
-                process_output = gr.Markdown(label="Processing Status")
-                gr.Markdown("### Chat History Options")
-                download_btn = gr.Button("Download History (JSON)")
-                download_file = gr.File(label="Download", visible=True)
-                clear_btn = gr.Button("Clear History")
-                clear_msg = gr.Textbox(label="Status", interactive=False, visible=False)
-            with gr.Column(scale=2):
-                chatbot = gr.Chatbot(
-                    label="Conversation",
-                    height=500,
-                    type="messages"
-                )
-                query_input = gr.Textbox(
-                    label="Ask a question",
-                    placeholder="Type your question here and press Enter...",
-                    lines=2
-                )
-                submit_btn = gr.Button("Ask Question", variant="primary")
-        # Event handlers
-        process_btn.click(
-            fn=process_files,
-            inputs=[file_upload],
-            outputs=[process_output]
-        )
-        def respond(message, chat_history):
-            """Handle user message and generate response"""
-            if not message or not message.strip():
-                return chat_history
-            # Ensure chat_history is a list
-            if chat_history is None:
-                chat_history = []
-            # Generate answer
-            bot_response = generate_answer(message, chat_history)
-            # Append user message and bot response in Gradio messages format
-            chat_history.append({"role": "user", "content": message})
-            chat_history.append({"role": "assistant", "content": bot_response})
-            return chat_history
-        # Submit button and enter key
-        submit_btn.click(
-            fn=respond,
-            inputs=[query_input, chatbot],
-            outputs=[chatbot]
-        ).then(
-            lambda: "",
-            outputs=[query_input]
-        )
-        query_input.submit(
-            fn=respond,
-            inputs=[query_input, chatbot],
-            outputs=[chatbot]
-        ).then(
-            lambda: "",
-            outputs=[query_input]
-        )
-        # Download history
-        download_btn.click(
-            fn=download_chat_history,
-            outputs=[download_file]
-        )
-        # Clear history
-        clear_btn.click(
-            fn=clear_history,
-            outputs=[chatbot, clear_msg]
-        )
-        gr.Markdown("""
-        ---
-        ### How RAG Works:
-        1. **Retrieval**: Finds relevant text chunks from uploaded documents using semantic similarity
-        2. **Augmentation**: Combines retrieved context with your question
-        3. **Generation**: Uses Groq LLM to generate accurate answers based on the context
-        ### Usage Instructions:
-        1. Upload one or more PDF/DOCX files
-        2. Click "Process Documents" and wait for confirmation
-        3. Ask questions about the content
-        4. Download chat history anytime as JSON
-        """)
-    return demo
-# Launch the app
 if __name__ == "__main__":
-    demo = create_demo()
-    demo.launch(ssr_mode=False)

     if not api_key:
         print("WARNING: GROQ_API_KEY not found in environment variables")
     else:
         import httpx
         client = Groq(
             api_key=api_key,
 def extract_text_from_pdf(pdf_file):
     """Extract text from PDF file"""
     try:
         if isinstance(pdf_file, str):
             pdf_reader = PyPDF2.PdfReader(pdf_file)
             filename = os.path.basename(pdf_file)
         text_data = []
         for page_num, page in enumerate(pdf_reader.pages):
             text = page.extract_text()
+            if text and text.strip():
                 text_data.append({
                     'text': text,
                     'page': page_num + 1,
 def extract_text_from_docx(docx_file):
     """Extract text from DOCX file (Enhancement 5)"""
     try:
         if isinstance(docx_file, str):
             doc = docx.Document(docx_file)
             filename = os.path.basename(docx_file)
         for i in range(0, len(words), chunk_size - overlap):
             chunk = ' '.join(words[i:i + chunk_size])
+            if len(chunk.strip()) > 50:
                 chunks.append(chunk)
                 metadata.append({
                     'page': data['page'],
         file_summaries = []
         for file in files:
             if isinstance(file, str):
                 file_path = file
                 file_ext = os.path.splitext(file)[1].lower()
             all_text_data.extend(text_data)
             total_text = ' '.join([d['text'] for d in text_data if d['text']])
             filename = os.path.basename(file_path)
             file_summaries.append(f"- **{filename}**: {len(text_data)} pages, {len(total_text)} characters")
         if not all_text_data:
             return "[ERROR] No valid text extracted from uploaded files."
         chunks, metadata = chunk_text(all_text_data)
         if not chunks:
         print(f"Error retrieving chunks: {e}")
         return [], []
+def chat(message, history):
+    """Main chat function that handles RAG pipeline"""
     global client
+    # Reinitialize client if needed
     if client is None:
         try:
             api_key = os.environ.get("GROQ_API_KEY")
                 )
                 print("Groq client reinitialized successfully")
             else:
+                return "[ERROR] Groq API client not initialized. Please set GROQ_API_KEY in your Space settings."
         except Exception as e:
             return f"[ERROR] Failed to initialize Groq client: {str(e)}"
     if not document_store['chunks']:
+        return "[WARNING] Please upload and process documents first."
     try:
         # Retrieve relevant context
+        relevant_chunks, metadata = retrieve_relevant_chunks(message, top_k=3)
         if not relevant_chunks:
             return "[ERROR] No relevant information found in the documents."
+        # Build context
         context = "\n\n".join([
             f"[Source: {meta['filename']}, Page {meta['page']}]\n{chunk}"
             for chunk, meta in zip(relevant_chunks, metadata)
         ])
+        # Build messages for Groq API
+        messages = [
+            {
+                "role": "system",
+                "content": "You are a helpful assistant that answers questions based on provided document context. Be concise and accurate."
+            }
+        ]
+        # Add conversation history
+        if history:
+            for user_msg, bot_msg in history[-3:]:  # Last 3 exchanges
+                messages.append({"role": "user", "content": user_msg})
+                messages.append({"role": "assistant", "content": bot_msg})
         # Add current query with context
         messages.append({
             "role": "user",
+            "content": f"Context from documents:\n{context}\n\nQuestion: {message}"
         })
+        # Call Groq API
         chat_completion = client.chat.completions.create(
             messages=messages,
+            model="llama-3.1-8b-instant",
             temperature=0.3,
             max_tokens=1024,
         )
         answer = chat_completion.choices[0].message.content
+        # Add sources
         sources = "\n\n**Sources:**\n" + "\n".join([
             f"- {meta['filename']} (Page {meta['page']})"
             for meta in metadata
         full_answer = answer + sources
+        # Log query
         document_store['conversation_history'].append({
             'timestamp': datetime.now().isoformat(),
+            'query': message,
             'answer': answer,
             'sources': [f"{m['filename']}_p{m['page']}" for m in metadata]
         })
     except Exception as e:
         error_msg = str(e)
         print(f"Error generating answer: {error_msg}")
+        return f"[ERROR] {error_msg}"
 def download_chat_history():
+    """Download conversation history as JSON"""
     if not document_store['conversation_history']:
         return None
         history_file = "chat_history.json"
         with open(history_file, 'w', encoding='utf-8') as f:
             json.dump(document_store['conversation_history'], f, indent=2)
         return history_file
     except Exception as e:
         print(f"Error downloading history: {e}")
         return None
 # Build Gradio Interface
+with gr.Blocks(title="Enhanced RAG Chatbot") as demo:
+    gr.Markdown("""
+    # Enhanced RAG-Based Chatbot
+    Upload PDF/DOCX files and ask questions about their content!
+    **Features:**
+    - Multiple file support (PDF & DOCX)
+    - Semantic embeddings with sentence-transformers
+    - Document preview & summaries
+    - Conversational memory
+    - Source references with page numbers
+    - Download chat history
+    """)
+    with gr.Row():
+        with gr.Column(scale=1):
+            file_upload = gr.File(
+                label="Upload Documents (PDF/DOCX)",
+                file_count="multiple",
+                file_types=[".pdf", ".docx"]
+            )
+            process_btn = gr.Button("Process Documents", variant="primary")
+            process_output = gr.Markdown(label="Processing Status")
+            gr.Markdown("### Chat History Options")
+            download_btn = gr.Button("Download History (JSON)")
+            download_file = gr.File(label="Download", visible=True)
+            clear_btn = gr.Button("Clear Chat")
+        with gr.Column(scale=2):
+            chatbot = gr.Chatbot(label="Conversation", height=500)
+            msg = gr.Textbox(
+                label="Ask a question",
+                placeholder="Type your question here...",
+                lines=2
+            )
+            submit = gr.Button("Ask Question", variant="primary")
+    # Event handlers
+    process_btn.click(
+        fn=process_files,
+        inputs=[file_upload],
+        outputs=[process_output]
+    )
+    # Chat interactions
+    msg.submit(chat, [msg, chatbot], [chatbot]).then(
+        lambda: gr.update(value=""), None, [msg]
+    )
+    submit.click(chat, [msg, chatbot], [chatbot]).then(
+        lambda: gr.update(value=""), None, [msg]
+    )
+    # Clear chat
+    clear_btn.click(lambda: None, None, chatbot)
+    # Download history
+    download_btn.click(
+        fn=download_chat_history,
+        outputs=[download_file]
+    )
+    gr.Markdown("""
+    ---
+    ### How RAG Works:
+    1. **Retrieval**: Finds relevant text chunks from uploaded documents using semantic similarity
+    2. **Augmentation**: Combines retrieved context with your question
+    3. **Generation**: Uses Groq LLM to generate accurate answers based on the context
+    ### Usage Instructions:
+    1. Upload one or more PDF/DOCX files
+    2. Click "Process Documents" and wait for confirmation
+    3. Ask questions about the content
+    4. Download chat history anytime as JSON
+    """)
 if __name__ == "__main__":
+    demo.launch()