Spaces:

prernajeet01
/

Reasoning_AI_Agent

Sleeping

App Files Files Community

prernajeet01 committed on Feb 25, 2025

Commit

898186b

verified ·

1 Parent(s): 8079a62

Update app.py

Browse files

Files changed (1) hide show

app.py +209 -106

app.py CHANGED Viewed

@@ -3,7 +3,7 @@ import os
 import tempfile
 import pandas as pd
 import boto3
-from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, UnstructuredPowerPointLoader, UnstructuredExcelLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.embeddings import OpenAIEmbeddings
 from langchain_community.vectorstores import FAISS
@@ -11,9 +11,18 @@ from langchain.chains import RetrievalQA
 from langchain_community.chat_models import BedrockChat
 from langchain_openai import ChatOpenAI
 from langchain_community.llms import Ollama
 import logging
-from huggingface_hub import HfApi
-from huggingface_hub.utils import RepositoryNotFoundError
 # Set up logging
 logging.basicConfig(
@@ -48,11 +57,20 @@ class AuditAgent:
         self.provider = provider
         self.document_store = None
         # Get API keys
         api_keys = get_api_keys()
         if api_keys["status"] == "error":
             raise ValueError(api_keys["message"])
         if provider == "bedrock":
             # Initialize AWS Bedrock client
             try:
@@ -117,58 +135,116 @@ class AuditAgent:
         except Exception as e:
             return f"Error processing query: {str(e)}"
-    def process_documents(self, file_path, file_name):
-        """Process uploaded documents and create a vector store."""
-        if not file_path or not file_name:
-            return "Please upload a file"
-        try:
-            documents = []
-            # Get file extension and check it's supported
-            file_ext = os.path.splitext(file_name.lower())[1]
-            supported_exts = ['.pdf', '.docx', '.pptx', '.xlsx', '.xls']
-            if file_ext not in supported_exts:
-                return f"Unsupported file type: {file_ext}. Please upload one of: {', '.join(supported_exts)}"
-            # Select appropriate loader
             try:
-                if file_ext == '.pdf':
-                    loader = PyPDFLoader(file_path)
-                elif file_ext == '.docx':
-                    loader = Docx2txtLoader(file_path)
-                elif file_ext == '.pptx':
-                    loader = UnstructuredPowerPointLoader(file_path)
-                elif file_ext in ['.xlsx', '.xls']:
-                    loader = UnstructuredExcelLoader(file_path)
-                # Load and process document
-                documents.extend(loader.load())
-            except Exception as e:
-                return f"Error loading document content: {str(e)}"
-            # Split documents
-            if not documents:
-                return "No content could be extracted from the document."
-            text_splitter = RecursiveCharacterTextSplitter(
-                chunk_size=1000,
-                chunk_overlap=200
-            )
-            splits = text_splitter.split_documents(documents)
-            if not splits:
-                return "Document was processed but no text content was found."
-            # Create vector store
-            api_keys = get_api_keys()
-            embeddings = OpenAIEmbeddings(openai_api_key=api_keys["openai_key"])
-            self.document_store = FAISS.from_documents(splits, embeddings)
-            return f"Document '{file_name}' processed successfully with {len(splits)} text chunks."
-        except Exception as e:
-            return f"Error processing document: {str(e)}"
     def query_documents(self, query):
         """Query the processed documents."""
@@ -252,16 +328,6 @@ def create_interface():
         # Status indicator for initialization and operations
         status_message = gr.Textbox(label="Status", value="Ready")
-        with gr.Row():
-            with gr.Column(scale=1):
-                # Updated file upload component - using file type instead of binary
-                file_upload = gr.File(
-                    label="Upload Audit Documents",
-                    file_types=["pdf", "docx", "pptx", "xlsx", "xls"],
-                    type="filepath"  # Changed from "binary" to "filepath"
-                )
-                gr.Markdown("Supported formats: PDF, DOCX, PPTX, XLSX, XLS")
         # Use tabs for model selection instead of dropdown
         with gr.Tabs() as model_tabs:
             model_tab_dict = {}
@@ -270,14 +336,16 @@ def create_interface():
                     model_tab_dict[model_id] = tab
         with gr.Tabs() as feature_tabs:
-            with gr.Tab("💬 General Chat"):
                 chat_input = gr.Textbox(
-                    lines=3,
                     label="Ask your audit question",
                     placeholder="Enter your question here..."
                 )
                 chat_button = gr.Button("Send")
-                chat_output = gr.Markdown(label="Response")
             with gr.Tab("🔢 Numerical Problem"):
                 problem_input = gr.Textbox(
@@ -288,7 +356,20 @@ def create_interface():
                 solve_button = gr.Button("Solve")
                 solution_output = gr.Markdown(label="Solution")
-            with gr.Tab("📑 Document Query"):
                 query_input = gr.Textbox(
                     lines=3,
                     label="Query Documents",
@@ -331,29 +412,39 @@ def create_interface():
                 error_message = f"Error initializing {model_name}: {str(e)}"
                 logging.error(error_message)
                 return None, error_message
-        # Handle chat separately
-        def handle_chat(query, model_name):
-            # First update status message
-            status = f"Processing query with {model_name}..."
             # Get or initialize agent
             agent, init_status = get_or_initialize_agent(model_name)
             # If initialization failed
             if agent is None:
-                return f"Could not initialize {model_name}. {init_status}", init_status
             # Process the query
             try:
-                result = agent.process_query(query)
-                return result, f"Query processed with {model_name}"
             except Exception as e:
-                error_msg = f"Error processing query: {str(e)}"
-                return error_msg, error_msg
         # Handle numerical problem
         def handle_problem(problem, model_name):
             status = f"Solving problem with {model_name}..."
             # Get or initialize agent
@@ -371,39 +462,46 @@ def create_interface():
                 error_msg = f"Error solving problem: {str(e)}"
                 return error_msg, error_msg
-        # Updated file upload handler for filepath type
-        def handle_file_upload(file_path, model_name):
-            if file_path is None:
-                return "No file uploaded. Please upload a file."
-            try:
-                # Extract the filename from the path
-                file_name = os.path.basename(file_path)
-                # Check file extension
-                file_ext = os.path.splitext(file_name.lower())[1]
-                supported_exts = ['.pdf', '.docx', '.pptx', '.xlsx', '.xls']
-                if file_ext not in supported_exts:
-                    return f"Invalid file type: {file_ext}. Please upload a file with one of these extensions: {', '.join(supported_exts)}"
-                status = f"Processing document with {model_name}..."
-                # Get or initialize agent
-                agent, init_status = get_or_initialize_agent(model_name)
-                # If initialization failed
-                if agent is None:
-                    return init_status
-                # Process the document
-                result = agent.process_documents(file_path, file_name)
-                return result
             except Exception as e:
-                return f"Error processing document: {str(e)}"
         # Handle document query
         def handle_query(query, model_name):
             status = f"Querying documents with {model_name}..."
             # Get or initialize agent
@@ -423,9 +521,14 @@ def create_interface():
         # Set up event handlers
         chat_button.click(
-            handle_chat,
-            inputs=[chat_input, selected_model],
-            outputs=[chat_output, status_message]
         )
         solve_button.click(
@@ -434,10 +537,10 @@ def create_interface():
             outputs=[solution_output, status_message]
         )
-        file_upload.upload(
             handle_file_upload,
             inputs=[file_upload, selected_model],
-            outputs=[status_message]
         )
         query_button.click(

 import tempfile
 import pandas as pd
 import boto3
+from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, UnstructuredPowerPointLoader, UnstructuredExcelLoader, TextLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.embeddings import OpenAIEmbeddings
 from langchain_community.vectorstores import FAISS
 from langchain_community.chat_models import BedrockChat
 from langchain_openai import ChatOpenAI
 from langchain_community.llms import Ollama
+from langchain.schema import Document
+from pathlib import Path
+from typing import List, Union
 import logging
+# Optional OCR support
+try:
+    from pdf2image import convert_from_path
+    import pytesseract
+    OCR_AVAILABLE = True
+except ImportError:
+    OCR_AVAILABLE = False
 # Set up logging
 logging.basicConfig(
         self.provider = provider
         self.document_store = None
+        # Initialize text splitter
+        self.text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1000,
+            chunk_overlap=200
+        )
         # Get API keys
         api_keys = get_api_keys()
         if api_keys["status"] == "error":
             raise ValueError(api_keys["message"])
+        # Initialize embeddings
+        self.embeddings = OpenAIEmbeddings(openai_api_key=api_keys["openai_key"])
         if provider == "bedrock":
             # Initialize AWS Bedrock client
             try:
         except Exception as e:
             return f"Error processing query: {str(e)}"
+    def process_documents(self, file_paths):
+        """Load, chunk and index one or more files into the vector store.
+
+        Each file is read as bytes and handed to ``process_document``; the
+        resulting chunks create ``self.document_store`` on first use and are
+        appended to it afterwards. A failure on one file is logged and
+        recorded without aborting the remaining files.
+
+        Args:
+            file_paths: Iterable of file paths. A single ``str``/``Path`` is
+                treated as a one-element list and ``None`` as no files.
+
+        Returns:
+            dict mapping each path (as ``str``) to a status string:
+            ``"Success (N chunks extracted)"``, an unsupported-type or
+            no-content message, or the text of the raised exception.
+        """
+        # A bare string would otherwise be iterated character by character.
+        if file_paths is None:
+            file_paths = []
+        elif isinstance(file_paths, (str, Path)):
+            file_paths = [file_paths]
+        # Loop-invariant, so built once rather than per file.
+        supported_exts = ['.pdf', '.docx', '.pptx', '.xlsx', '.xls', '.txt']
+        results = {}
+        for file_path in file_paths:
+            file_path = str(file_path)
             try:
+                # Get file extension
+                file_ext = os.path.splitext(file_path.lower())[1]
+                # Validate file extension
+                if file_ext not in supported_exts:
+                    results[file_path] = f"Unsupported file type: {file_ext}"
+                    continue
+                # Read file content
+                with open(file_path, 'rb') as f:
+                    content = f.read()
+                # Process document based on type
+                documents = self.process_document(content, file_ext)
+                # Create vector store with the documents
+                if documents:
+                    # Explicit None test: the store starts as None and should
+                    # not depend on the truthiness of the store object.
+                    if self.document_store is None:
+                        self.document_store = FAISS.from_documents(documents, self.embeddings)
+                    else:
+                        # Add to existing store
+                        self.document_store.add_documents(documents)
+                    num_chunks = len(documents)
+                    results[file_path] = f"Success ({num_chunks} chunks extracted)"
+                else:
+                    results[file_path] = "No content could be extracted"
+            except Exception as e:
+                logging.error(f"Error processing document {file_path}: {str(e)}")
+                results[file_path] = str(e)
+        return results
+    def process_document(self, content, doc_type):
+        """Turn raw file bytes into split document chunks.
+
+        The bytes are written to a temporary file whose suffix is
+        ``doc_type`` so that ``load_document`` can pick a loader from the
+        file extension. The temporary file is always removed afterwards.
+
+        Args:
+            content: Raw file content as bytes.
+            doc_type: File extension including the leading dot (e.g. ``".pdf"``).
+
+        Returns:
+            The chunks produced by ``split_documents`` for the loaded file.
+        """
+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=doc_type)
+        temp_file_path = temp_file.name
+        try:
+            # Write inside the try so a failed write still reaches the cleanup
+            # below; the with-block closes the handle before loaders reopen
+            # the file by path.
+            with temp_file:
+                temp_file.write(content)
+            documents = self.load_document(temp_file_path)
+            return self.split_documents(documents)
+        finally:
+            if os.path.exists(temp_file_path):
+                os.unlink(temp_file_path)
+    def load_document(self, file_path):
+        """Load a file with the loader matching its extension.
+
+        PDFs are read with ``PyPDFLoader`` first; if that raises or yields no
+        text on any page, OCR is attempted when available, otherwise a
+        ``ValueError`` is raised. Other supported extensions map directly to
+        one loader each; anything else raises ``ValueError``.
+        """
+        path = Path(file_path)
+        ext = path.suffix.lower()
+        if ext != '.pdf':
+            # One loader class per non-PDF extension.
+            simple_loaders = {
+                '.docx': Docx2txtLoader,
+                '.pptx': UnstructuredPowerPointLoader,
+                '.xlsx': UnstructuredExcelLoader,
+                '.xls': UnstructuredExcelLoader,
+                '.txt': TextLoader,
+            }
+            loader_cls = simple_loaders.get(ext)
+            if loader_cls is None:
+                raise ValueError(f"Unsupported file type: {ext}")
+            return loader_cls(str(path)).load()
+        # PDF: regular text extraction first, OCR as the fallback.
+        try:
+            pages = PyPDFLoader(str(path)).load()
+            has_text = any(page.page_content.strip() for page in pages)
+            if not has_text:
+                raise ValueError("No text content found")
+            return pages
+        except Exception as exc:
+            logging.warning(f"Standard PDF extraction failed: {str(exc)}")
+            if not OCR_AVAILABLE:
+                raise ValueError("PDF extraction failed and OCR is not available")
+            logging.info("Attempting PDF extraction with OCR")
+            return self._process_pdf_with_ocr(path)
+    def _process_pdf_with_ocr(self, file_path):
+        """Rasterise a PDF and OCR each page with Tesseract.
+
+        Returns one ``Document`` per page that produced non-blank text, with
+        the source path and 1-based page number in its metadata. Raises
+        ``ImportError`` when the optional OCR packages were not importable.
+        """
+        if not OCR_AVAILABLE:
+            raise ImportError("pdf2image and pytesseract required for OCR processing")
+        source = str(file_path)
+        ocr_pages = []
+        for page_number, page_image in enumerate(convert_from_path(source), start=1):
+            page_text = pytesseract.image_to_string(page_image)
+            if not page_text.strip():
+                # Blank pages contribute nothing to the index.
+                continue
+            ocr_pages.append(Document(
+                page_content=page_text,
+                metadata={"source": source, "page": page_number}
+            ))
+        return ocr_pages
+    def split_documents(self, documents):
+        """Return *documents* chunked by the agent's configured text splitter."""
+        chunks = self.text_splitter.split_documents(documents)
+        return chunks
     def query_documents(self, query):
         """Query the processed documents."""
         # Status indicator for initialization and operations
         status_message = gr.Textbox(label="Status", value="Ready")
         # Use tabs for model selection instead of dropdown
         with gr.Tabs() as model_tabs:
             model_tab_dict = {}
                     model_tab_dict[model_id] = tab
         with gr.Tabs() as feature_tabs:
+            # Chat interface with history
+            with gr.Tab("💬 Conversation"):
+                chat_history = gr.Chatbot(height=400)
                 chat_input = gr.Textbox(
+                    lines=3,
                     label="Ask your audit question",
                     placeholder="Enter your question here..."
                 )
+                chat_clear = gr.Button("Clear Chat")
                 chat_button = gr.Button("Send")
             with gr.Tab("🔢 Numerical Problem"):
                 problem_input = gr.Textbox(
                 solve_button = gr.Button("Solve")
                 solution_output = gr.Markdown(label="Solution")
+            # Document processing tab
+            with gr.Tab("📑 Document Processing"):
+                with gr.Row():
+                    file_upload = gr.File(
+                        file_count="multiple",
+                        label="Upload Audit Documents (PDF, DOCX, PPTX, TXT, XLSX)",
+                        # Let's not restrict file types in the UI to avoid validation errors
+                        type="filepath"
+                    )
+                upload_button = gr.Button("Process Documents")
+                upload_output = gr.Textbox(label="Processing Status", lines=10)
+            # Document query tab
+            with gr.Tab("🔍 Document Query"):
                 query_input = gr.Textbox(
                     lines=3,
                     label="Query Documents",
                 error_message = f"Error initializing {model_name}: {str(e)}"
                 logging.error(error_message)
                 return None, error_message
+        # Handle chat with history
+        def respond_to_chat(message, history, model_name):
+            """Answer one chat message and append the exchange to the history.
+
+            Returns a 3-tuple matching the click handler's outputs: the new
+            value for the input box (always empty), the updated history, and
+            a status string.
+            """
+            # Defensive: history may be None if the Chatbot has no value yet
+            # (assumption; confirm against the Gradio version in use).
+            history = list(history or [])
+            if not message or not message.strip():
+                # Every branch must return one value per wired output.
+                return "", history, "No message entered"
             # Get or initialize agent
             agent, init_status = get_or_initialize_agent(model_name)
             # If initialization failed
             if agent is None:
+                history.append((message, f"Could not initialize {model_name}. {init_status}"))
+                return "", history, f"Error: {init_status}"
             # Process the query
             try:
+                result = agent.process_query(message)
+                history.append((message, result))
+                return "", history, f"Response from {model_name}"
             except Exception as e:
+                error_msg = f"Error: {str(e)}"
+                history.append((message, error_msg))
+                return "", history, error_msg
+        # Clear chat history
+        def clear_chat_history():
+            """Return an empty chat history and a confirmation status message."""
+            return [], "Chat history cleared"
         # Handle numerical problem
         def handle_problem(problem, model_name):
+            if not problem.strip():
+                return "Please provide a problem description", "No problem entered"
             status = f"Solving problem with {model_name}..."
             # Get or initialize agent
                 error_msg = f"Error solving problem: {str(e)}"
                 return error_msg, error_msg
+        # Improved file upload handler for multiple files
+        def handle_file_upload(file_paths, model_name):
+            """Process the uploaded files with the selected model's agent.
+
+            Returns a single text report: one line per file with its status,
+            plus a closing line when at least one file was indexed. If no
+            files were given or the agent cannot be initialised, the
+            corresponding message is returned instead.
+            """
+            if not file_paths:
+                return "No files uploaded. Please upload files."
+            # Get or initialize agent
+            agent, init_status = get_or_initialize_agent(model_name)
+            # If initialization failed
+            if agent is None:
+                return init_status
+            logging.info(f"Processing {len(file_paths)} files")
+            # Process all documents
+            try:
+                results = agent.process_documents(file_paths)
+                # Format results
+                output_lines = ["## Document Processing Results"]
+                any_succeeded = False
+                for file_path, status in results.items():
+                    file_name = os.path.basename(file_path)
+                    # Prefix match: an error text may itself contain the word
+                    # "Success" (e.g. inside a file name), so a substring
+                    # test would misreport failures as successes.
+                    if status.startswith("Success"):
+                        any_succeeded = True
+                        output_lines.append(f"✓ {file_name}: {status}")
+                    else:
+                        output_lines.append(f"❌ {file_name}: {status}")
+                if any_succeeded:
+                    output_lines.append("\n✅ Documents are ready for querying!")
+                return "\n".join(output_lines)
             except Exception as e:
+                logging.error(f"File upload error: {str(e)}")
+                return f"Error processing files: {str(e)}"
         # Handle document query
         def handle_query(query, model_name):
+            if not query.strip():
+                return "Please provide a query", "No query entered"
             status = f"Querying documents with {model_name}..."
             # Get or initialize agent
         # Set up event handlers
         chat_button.click(
+            respond_to_chat,
+            inputs=[chat_input, chat_history, selected_model],
+            outputs=[chat_input, chat_history, status_message]
+        )
+        chat_clear.click(
+            clear_chat_history,
+            outputs=[chat_history, status_message]
         )
         solve_button.click(
             outputs=[solution_output, status_message]
         )
+        upload_button.click(
             handle_file_upload,
             inputs=[file_upload, selected_model],
+            outputs=[upload_output]
         )
         query_button.click(