prernajeet01 committed on
Commit
898186b
·
verified ·
1 Parent(s): 8079a62

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +209 -106
app.py CHANGED
@@ -3,7 +3,7 @@ import os
3
  import tempfile
4
  import pandas as pd
5
  import boto3
6
- from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, UnstructuredPowerPointLoader, UnstructuredExcelLoader
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
  from langchain_community.embeddings import OpenAIEmbeddings
9
  from langchain_community.vectorstores import FAISS
@@ -11,9 +11,18 @@ from langchain.chains import RetrievalQA
11
  from langchain_community.chat_models import BedrockChat
12
  from langchain_openai import ChatOpenAI
13
  from langchain_community.llms import Ollama
 
 
 
14
  import logging
15
- from huggingface_hub import HfApi
16
- from huggingface_hub.utils import RepositoryNotFoundError
 
 
 
 
 
 
17
 
18
  # Set up logging
19
  logging.basicConfig(
@@ -48,11 +57,20 @@ class AuditAgent:
48
  self.provider = provider
49
  self.document_store = None
50
 
 
 
 
 
 
 
51
  # Get API keys
52
  api_keys = get_api_keys()
53
  if api_keys["status"] == "error":
54
  raise ValueError(api_keys["message"])
55
 
 
 
 
56
  if provider == "bedrock":
57
  # Initialize AWS Bedrock client
58
  try:
@@ -117,58 +135,116 @@ class AuditAgent:
117
  except Exception as e:
118
  return f"Error processing query: {str(e)}"
119
 
120
- def process_documents(self, file_path, file_name):
121
- """Process uploaded documents and create a vector store."""
122
- if not file_path or not file_name:
123
- return "Please upload a file"
124
-
125
- try:
126
- documents = []
127
-
128
- # Get file extension and check it's supported
129
- file_ext = os.path.splitext(file_name.lower())[1]
130
- supported_exts = ['.pdf', '.docx', '.pptx', '.xlsx', '.xls']
131
-
132
- if file_ext not in supported_exts:
133
- return f"Unsupported file type: {file_ext}. Please upload one of: {', '.join(supported_exts)}"
134
-
135
- # Select appropriate loader
136
  try:
137
- if file_ext == '.pdf':
138
- loader = PyPDFLoader(file_path)
139
- elif file_ext == '.docx':
140
- loader = Docx2txtLoader(file_path)
141
- elif file_ext == '.pptx':
142
- loader = UnstructuredPowerPointLoader(file_path)
143
- elif file_ext in ['.xlsx', '.xls']:
144
- loader = UnstructuredExcelLoader(file_path)
145
 
146
- # Load and process document
147
- documents.extend(loader.load())
148
- except Exception as e:
149
- return f"Error loading document content: {str(e)}"
150
-
151
- # Split documents
152
- if not documents:
153
- return "No content could be extracted from the document."
154
 
155
- text_splitter = RecursiveCharacterTextSplitter(
156
- chunk_size=1000,
157
- chunk_overlap=200
158
- )
159
- splits = text_splitter.split_documents(documents)
160
-
161
- if not splits:
162
- return "Document was processed but no text content was found."
163
-
164
- # Create vector store
165
- api_keys = get_api_keys()
166
- embeddings = OpenAIEmbeddings(openai_api_key=api_keys["openai_key"])
167
- self.document_store = FAISS.from_documents(splits, embeddings)
168
-
169
- return f"Document '{file_name}' processed successfully with {len(splits)} text chunks."
170
- except Exception as e:
171
- return f"Error processing document: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
  def query_documents(self, query):
174
  """Query the processed documents."""
@@ -252,16 +328,6 @@ def create_interface():
252
  # Status indicator for initialization and operations
253
  status_message = gr.Textbox(label="Status", value="Ready")
254
 
255
- with gr.Row():
256
- with gr.Column(scale=1):
257
- # Updated file upload component - using file type instead of binary
258
- file_upload = gr.File(
259
- label="Upload Audit Documents",
260
- file_types=["pdf", "docx", "pptx", "xlsx", "xls"],
261
- type="filepath" # Changed from "binary" to "filepath"
262
- )
263
- gr.Markdown("Supported formats: PDF, DOCX, PPTX, XLSX, XLS")
264
-
265
  # Use tabs for model selection instead of dropdown
266
  with gr.Tabs() as model_tabs:
267
  model_tab_dict = {}
@@ -270,14 +336,16 @@ def create_interface():
270
  model_tab_dict[model_id] = tab
271
 
272
  with gr.Tabs() as feature_tabs:
273
- with gr.Tab("💬 General Chat"):
 
 
274
  chat_input = gr.Textbox(
275
- lines=3,
276
  label="Ask your audit question",
277
  placeholder="Enter your question here..."
278
  )
 
279
  chat_button = gr.Button("Send")
280
- chat_output = gr.Markdown(label="Response")
281
 
282
  with gr.Tab("🔢 Numerical Problem"):
283
  problem_input = gr.Textbox(
@@ -288,7 +356,20 @@ def create_interface():
288
  solve_button = gr.Button("Solve")
289
  solution_output = gr.Markdown(label="Solution")
290
 
291
- with gr.Tab("📑 Document Query"):
 
 
 
 
 
 
 
 
 
 
 
 
 
292
  query_input = gr.Textbox(
293
  lines=3,
294
  label="Query Documents",
@@ -331,29 +412,39 @@ def create_interface():
331
  error_message = f"Error initializing {model_name}: {str(e)}"
332
  logging.error(error_message)
333
  return None, error_message
334
-
335
- # Handle chat separately
336
- def handle_chat(query, model_name):
337
- # First update status message
338
- status = f"Processing query with {model_name}..."
339
-
340
  # Get or initialize agent
341
  agent, init_status = get_or_initialize_agent(model_name)
342
 
343
  # If initialization failed
344
  if agent is None:
345
- return f"Could not initialize {model_name}. {init_status}", init_status
 
346
 
347
  # Process the query
348
  try:
349
- result = agent.process_query(query)
350
- return result, f"Query processed with {model_name}"
 
351
  except Exception as e:
352
- error_msg = f"Error processing query: {str(e)}"
353
- return error_msg, error_msg
 
 
 
 
 
354
 
355
  # Handle numerical problem
356
  def handle_problem(problem, model_name):
 
 
 
357
  status = f"Solving problem with {model_name}..."
358
 
359
  # Get or initialize agent
@@ -371,39 +462,46 @@ def create_interface():
371
  error_msg = f"Error solving problem: {str(e)}"
372
  return error_msg, error_msg
373
 
374
- # Updated file upload handler for filepath type
375
- def handle_file_upload(file_path, model_name):
376
- if file_path is None:
377
- return "No file uploaded. Please upload a file."
378
-
379
- try:
380
- # Extract the filename from the path
381
- file_name = os.path.basename(file_path)
382
 
383
- # Check file extension
384
- file_ext = os.path.splitext(file_name.lower())[1]
385
- supported_exts = ['.pdf', '.docx', '.pptx', '.xlsx', '.xls']
 
 
 
386
 
387
- if file_ext not in supported_exts:
388
- return f"Invalid file type: {file_ext}. Please upload a file with one of these extensions: {', '.join(supported_exts)}"
389
-
390
- status = f"Processing document with {model_name}..."
 
391
 
392
- # Get or initialize agent
393
- agent, init_status = get_or_initialize_agent(model_name)
 
 
 
 
 
 
394
 
395
- # If initialization failed
396
- if agent is None:
397
- return init_status
398
 
399
- # Process the document
400
- result = agent.process_documents(file_path, file_name)
401
- return result
402
  except Exception as e:
403
- return f"Error processing document: {str(e)}"
 
404
 
405
  # Handle document query
406
  def handle_query(query, model_name):
 
 
 
407
  status = f"Querying documents with {model_name}..."
408
 
409
  # Get or initialize agent
@@ -423,9 +521,14 @@ def create_interface():
423
 
424
  # Set up event handlers
425
  chat_button.click(
426
- handle_chat,
427
- inputs=[chat_input, selected_model],
428
- outputs=[chat_output, status_message]
 
 
 
 
 
429
  )
430
 
431
  solve_button.click(
@@ -434,10 +537,10 @@ def create_interface():
434
  outputs=[solution_output, status_message]
435
  )
436
 
437
- file_upload.upload(
438
  handle_file_upload,
439
  inputs=[file_upload, selected_model],
440
- outputs=[status_message]
441
  )
442
 
443
  query_button.click(
 
3
  import tempfile
4
  import pandas as pd
5
  import boto3
6
+ from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, UnstructuredPowerPointLoader, UnstructuredExcelLoader, TextLoader
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
  from langchain_community.embeddings import OpenAIEmbeddings
9
  from langchain_community.vectorstores import FAISS
 
11
  from langchain_community.chat_models import BedrockChat
12
  from langchain_openai import ChatOpenAI
13
  from langchain_community.llms import Ollama
14
+ from langchain.schema import Document
15
+ from pathlib import Path
16
+ from typing import List, Union
17
  import logging
18
+
19
+ # Optional OCR support
20
+ try:
21
+ from pdf2image import convert_from_path
22
+ import pytesseract
23
+ OCR_AVAILABLE = True
24
+ except ImportError:
25
+ OCR_AVAILABLE = False
26
 
27
  # Set up logging
28
  logging.basicConfig(
 
57
  self.provider = provider
58
  self.document_store = None
59
 
60
+ # Initialize text splitter
61
+ self.text_splitter = RecursiveCharacterTextSplitter(
62
+ chunk_size=1000,
63
+ chunk_overlap=200
64
+ )
65
+
66
  # Get API keys
67
  api_keys = get_api_keys()
68
  if api_keys["status"] == "error":
69
  raise ValueError(api_keys["message"])
70
 
71
+ # Initialize embeddings
72
+ self.embeddings = OpenAIEmbeddings(openai_api_key=api_keys["openai_key"])
73
+
74
  if provider == "bedrock":
75
  # Initialize AWS Bedrock client
76
  try:
 
135
  except Exception as e:
136
  return f"Error processing query: {str(e)}"
137
 
138
def process_documents(self, file_paths):
    """Load, chunk and index each file, returning a per-file status map.

    Every file is handled independently: a failure on one file is recorded
    in the result and does not stop the remaining files from being processed.

    Args:
        file_paths: Iterable of file paths. A single path given as a string
            is treated as a one-element list; ``None`` or an empty iterable
            yields an empty result.

    Returns:
        dict mapping each input path to a status string, one of
        ``"Success (N chunks extracted)"``, ``"Unsupported file type: <ext>"``,
        ``"No content could be extracted"`` or the text of the exception
        raised while processing that file.
    """
    if not file_paths:
        return {}
    # A bare string would otherwise be iterated character by character.
    if isinstance(file_paths, (str, os.PathLike)):
        file_paths = [file_paths]

    # Loop-invariant, so built once rather than per file.
    supported_exts = ('.pdf', '.docx', '.pptx', '.xlsx', '.xls', '.txt')
    results = {}

    for file_path in file_paths:
        try:
            file_ext = os.path.splitext(os.fspath(file_path).lower())[1]

            if file_ext not in supported_exts:
                results[file_path] = f"Unsupported file type: {file_ext}"
                continue

            with open(file_path, 'rb') as f:
                content = f.read()

            # process_document returns the already-split chunks.
            documents = self.process_document(content, file_ext)
            if not documents:
                results[file_path] = "No content could be extracted"
                continue

            # The first successful file creates the store; later files
            # are appended to it.
            if self.document_store is None:
                self.document_store = FAISS.from_documents(documents, self.embeddings)
            else:
                self.document_store.add_documents(documents)

            results[file_path] = f"Success ({len(documents)} chunks extracted)"
        except Exception as e:
            # logging.exception keeps the traceback for diagnosis; the
            # user-facing status remains the plain exception text.
            logging.exception("Error processing document %s: %s", file_path, e)
            results[file_path] = str(e)

    return results
177
+
178
def process_document(self, content, doc_type):
    """Write raw bytes to a temporary file, load it and return its chunks.

    Args:
        content: Raw file bytes.
        doc_type: File extension including the dot (e.g. ``".pdf"``); used
            as the temp-file suffix so ``load_document`` can pick a loader.

    Returns:
        Whatever ``self.split_documents`` returns for the loaded documents.

    Raises:
        Any exception raised while writing, loading or splitting. The
        temporary file is removed in every case.
    """
    temp_file_path = None
    try:
        # delete=False so the file can be reopened by path after closing.
        # Creation and write sit inside the try so that a failed write
        # cannot leave the file behind.
        with tempfile.NamedTemporaryFile(delete=False, suffix=doc_type) as temp_file:
            temp_file_path = temp_file.name
            temp_file.write(content)

        documents = self.load_document(temp_file_path)
        return self.split_documents(documents)
    finally:
        if temp_file_path is not None and os.path.exists(temp_file_path):
            os.unlink(temp_file_path)
190
+
191
def load_document(self, file_path):
    """Load a document with the loader matching its file extension.

    PDFs are read with ``PyPDFLoader`` first. If that raises, or yields no
    non-blank page text, the method falls back to OCR when the optional OCR
    dependencies were importable.

    Args:
        file_path: Path to the document (string or ``Path``).

    Returns:
        The list of documents produced by the selected loader.

    Raises:
        ValueError: If the extension is not supported, or if PDF extraction
            failed and OCR is not available (chained to the original
            failure).
    """
    file_path = Path(file_path)
    suffix = file_path.suffix.lower()
    path_str = str(file_path)

    if suffix == '.pdf':
        # Record why the standard extraction was rejected instead of raising
        # an exception only to catch it again a few lines later.
        failure = None
        try:
            documents = PyPDFLoader(path_str).load()
            if any(doc.page_content.strip() for doc in documents):
                return documents
            failure = ValueError("No text content found")
        except Exception as e:
            failure = e

        logging.warning("Standard PDF extraction failed: %s", failure)
        if not OCR_AVAILABLE:
            # Chain the cause so the underlying failure stays visible.
            raise ValueError("PDF extraction failed and OCR is not available") from failure
        logging.info("Attempting PDF extraction with OCR")
        return self._process_pdf_with_ocr(file_path)

    if suffix == '.docx':
        return Docx2txtLoader(path_str).load()
    if suffix == '.pptx':
        return UnstructuredPowerPointLoader(path_str).load()
    if suffix in ('.xlsx', '.xls'):
        return UnstructuredExcelLoader(path_str).load()
    if suffix == '.txt':
        return TextLoader(path_str).load()

    raise ValueError(f"Unsupported file type: {suffix}")
226
+
227
def _process_pdf_with_ocr(self, file_path):
    """Rasterise a PDF and OCR each page with Tesseract.

    Pages whose OCR text is blank are skipped. Each remaining page becomes
    one ``Document`` whose metadata holds the source path and a 1-based
    page number.

    Raises:
        ImportError: If the optional OCR dependencies were not importable.
    """
    if not OCR_AVAILABLE:
        raise ImportError("pdf2image and pytesseract required for OCR processing")

    source = str(file_path)
    ocr_pages = []

    for page_number, page_image in enumerate(convert_from_path(source), start=1):
        page_text = pytesseract.image_to_string(page_image)
        if not page_text.strip():
            continue
        ocr_pages.append(
            Document(
                page_content=page_text,
                metadata={"source": source, "page": page_number},
            )
        )

    return ocr_pages
244
+
245
def split_documents(self, documents):
    """Return the chunks produced by this agent's text splitter."""
    splitter = self.text_splitter
    chunks = splitter.split_documents(documents)
    return chunks
248
 
249
  def query_documents(self, query):
250
  """Query the processed documents."""
 
328
  # Status indicator for initialization and operations
329
  status_message = gr.Textbox(label="Status", value="Ready")
330
 
 
 
 
 
 
 
 
 
 
 
331
  # Use tabs for model selection instead of dropdown
332
  with gr.Tabs() as model_tabs:
333
  model_tab_dict = {}
 
336
  model_tab_dict[model_id] = tab
337
 
338
  with gr.Tabs() as feature_tabs:
339
+ # Chat interface with history
340
+ with gr.Tab("💬 Conversation"):
341
+ chat_history = gr.Chatbot(height=400)
342
  chat_input = gr.Textbox(
343
+ lines=3,
344
  label="Ask your audit question",
345
  placeholder="Enter your question here..."
346
  )
347
+ chat_clear = gr.Button("Clear Chat")
348
  chat_button = gr.Button("Send")
 
349
 
350
  with gr.Tab("🔢 Numerical Problem"):
351
  problem_input = gr.Textbox(
 
356
  solve_button = gr.Button("Solve")
357
  solution_output = gr.Markdown(label="Solution")
358
 
359
+ # Document processing tab
360
+ with gr.Tab("📑 Document Processing"):
361
+ with gr.Row():
362
+ file_upload = gr.File(
363
+ file_count="multiple",
364
+ label="Upload Audit Documents (PDF, DOCX, PPTX, TXT, XLSX)",
365
+ # Let's not restrict file types in the UI to avoid validation errors
366
+ type="filepath"
367
+ )
368
+ upload_button = gr.Button("Process Documents")
369
+ upload_output = gr.Textbox(label="Processing Status", lines=10)
370
+
371
+ # Document query tab
372
+ with gr.Tab("🔍 Document Query"):
373
  query_input = gr.Textbox(
374
  lines=3,
375
  label="Query Documents",
 
412
  error_message = f"Error initializing {model_name}: {str(e)}"
413
  logging.error(error_message)
414
  return None, error_message
415
+
416
+ # Handle chat with history
417
def respond_to_chat(message, history, model_name):
    """Answer a chat message and append the exchange to the history.

    The handler is wired to three outputs (input box, chat history, status
    message), so every path returns exactly three values.

    Args:
        message: Text entered by the user; may be ``None`` or blank.
        history: List of ``(user, assistant)`` tuples; ``None`` is treated
            as an empty history.
        model_name: Name of the model whose agent should answer.

    Returns:
        Tuple ``(cleared_input, history, status)``.
    """
    if history is None:
        history = []

    # Blank input: leave the history untouched but still return all three
    # outputs, matching the status wording of the sibling handlers.
    if not message or not message.strip():
        return "", history, "No message entered"

    # Get or initialize agent
    agent, init_status = get_or_initialize_agent(model_name)

    # If initialization failed
    if agent is None:
        history.append((message, f"Could not initialize {model_name}. {init_status}"))
        return "", history, f"Error: {init_status}"

    # Process the query
    try:
        result = agent.process_query(message)
    except Exception as e:
        error_msg = f"Error: {str(e)}"
        history.append((message, error_msg))
        return "", history, error_msg

    history.append((message, result))
    return "", history, f"Response from {model_name}"
438
+
439
+ # Clear chat history
440
def clear_chat_history():
    """Return an empty chat history together with a confirmation status."""
    empty_history = []
    status = "Chat history cleared"
    return empty_history, status
442
 
443
  # Handle numerical problem
444
  def handle_problem(problem, model_name):
445
+ if not problem.strip():
446
+ return "Please provide a problem description", "No problem entered"
447
+
448
  status = f"Solving problem with {model_name}..."
449
 
450
  # Get or initialize agent
 
462
  error_msg = f"Error solving problem: {str(e)}"
463
  return error_msg, error_msg
464
 
465
+ # Improved file upload handler for multiple files
466
def handle_file_upload(file_paths, model_name):
    """Process the uploaded files with the selected model's agent.

    Args:
        file_paths: List of uploaded file paths. A single path given as a
            string is treated as a one-element list; ``None`` or an empty
            list produces a prompt to upload files.
        model_name: Name of the model whose agent should index the files.

    Returns:
        A multi-line status report (one line per file), or an error or
        initialization message.
    """
    if not file_paths:
        return "No files uploaded. Please upload files."
    # A bare string would otherwise be counted and iterated per character.
    if isinstance(file_paths, str):
        file_paths = [file_paths]

    # Get or initialize agent
    agent, init_status = get_or_initialize_agent(model_name)

    # If initialization failed
    if agent is None:
        return init_status

    logging.info(f"Processing {len(file_paths)} files")

    # Process all documents
    try:
        results = agent.process_documents(file_paths)

        output_lines = ["## Document Processing Results"]
        any_success = False
        for file_path, status in results.items():
            file_name = os.path.basename(file_path)
            # Match on the prefix: a substring test would misreport an
            # error message that merely contains the word "Success".
            if status.startswith("Success"):
                any_success = True
                output_lines.append(f"✓ {file_name}: {status}")
            else:
                output_lines.append(f"❌ {file_name}: {status}")

        if any_success:
            output_lines.append("\n✅ Documents are ready for querying!")

        return "\n".join(output_lines)
    except Exception as e:
        logging.error(f"File upload error: {str(e)}")
        return f"Error processing files: {str(e)}"
499
 
500
  # Handle document query
501
  def handle_query(query, model_name):
502
+ if not query.strip():
503
+ return "Please provide a query", "No query entered"
504
+
505
  status = f"Querying documents with {model_name}..."
506
 
507
  # Get or initialize agent
 
521
 
522
  # Set up event handlers
523
  chat_button.click(
524
+ respond_to_chat,
525
+ inputs=[chat_input, chat_history, selected_model],
526
+ outputs=[chat_input, chat_history, status_message]
527
+ )
528
+
529
+ chat_clear.click(
530
+ clear_chat_history,
531
+ outputs=[chat_history, status_message]
532
  )
533
 
534
  solve_button.click(
 
537
  outputs=[solution_output, status_message]
538
  )
539
 
540
+ upload_button.click(
541
  handle_file_upload,
542
  inputs=[file_upload, selected_model],
543
+ outputs=[upload_output]
544
  )
545
 
546
  query_button.click(