prernajeet01 committed on
Commit
dff05f0
·
verified ·
1 Parent(s): 783e7ab

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -25
app.py CHANGED
@@ -138,40 +138,59 @@ class AuditAgent:
138
  supported_exts = ['.pdf', '.docx', '.pptx', '.xlsx', '.xls']
139
 
140
  if file_ext not in supported_exts:
141
- os.remove(temp_path)
142
- os.rmdir(temp_dir)
143
- return f"Unsupported file type. Please upload one of: {', '.join(supported_exts)}"
 
 
 
144
 
145
  # Select appropriate loader
146
- if file_ext == '.pdf':
147
- loader = PyPDFLoader(temp_path)
148
- elif file_ext == '.docx':
149
- loader = Docx2txtLoader(temp_path)
150
- elif file_ext == '.pptx':
151
- loader = UnstructuredPowerPointLoader(temp_path)
152
- elif file_ext in ['.xlsx', '.xls']:
153
- loader = UnstructuredExcelLoader(temp_path)
154
-
155
- # Load and process document
156
- documents.extend(loader.load())
157
-
158
- # Cleanup
159
- os.remove(temp_path)
160
- os.rmdir(temp_dir)
 
 
 
 
 
 
 
 
 
 
161
 
162
  # Split documents
 
 
 
163
  text_splitter = RecursiveCharacterTextSplitter(
164
  chunk_size=1000,
165
  chunk_overlap=200
166
  )
167
  splits = text_splitter.split_documents(documents)
168
 
 
 
 
169
  # Create vector store
170
  api_keys = get_api_keys()
171
  embeddings = OpenAIEmbeddings(openai_api_key=api_keys["openai_key"])
172
  self.document_store = FAISS.from_documents(splits, embeddings)
173
 
174
- return "Document processed successfully"
175
  except Exception as e:
176
  return f"Error processing document: {str(e)}"
177
 
@@ -180,6 +199,9 @@ class AuditAgent:
180
  if not self.document_store:
181
  return "Please upload and process documents first"
182
 
 
 
 
183
  try:
184
  qa_chain = RetrievalQA.from_chain_type(
185
  llm=self.llm,
@@ -194,9 +216,9 @@ class AuditAgent:
194
  source_docs = response.get('source_documents', [])
195
 
196
  if source_docs:
197
- result += "\n\nSources:\n"
198
  for i, doc in enumerate(source_docs, 1):
199
- result += f"{i}. {doc.metadata.get('source', 'Unknown source')}\n"
200
 
201
  return result
202
  except Exception as e:
@@ -256,10 +278,13 @@ def create_interface():
256
 
257
  with gr.Row():
258
  with gr.Column(scale=1):
 
259
  file_upload = gr.File(
260
  label="Upload Audit Documents",
261
- file_types=["pdf", "docx", "pptx", "xlsx", "xls"]
 
262
  )
 
263
 
264
  # Use tabs for model selection instead of dropdown
265
  with gr.Tabs() as model_tabs:
@@ -308,7 +333,7 @@ def create_interface():
308
 
309
  model_tabs.select(update_selected_model, outputs=[selected_model])
310
 
311
- # COMPLETELY REVISED: Initialize an agent and return both agent and status message
312
  def get_or_initialize_agent(model_name):
313
  """Initialize an agent if not already initialized and return a status message"""
314
  init_message = f"Initializing {model_name}..."
@@ -370,11 +395,18 @@ def create_interface():
370
  error_msg = f"Error solving problem: {str(e)}"
371
  return error_msg, error_msg
372
 
373
- # Handle file upload
374
  def handle_file_upload(file, model_name):
375
  if file is None:
376
  return "No file uploaded. Please upload a file."
377
 
 
 
 
 
 
 
 
378
  status = f"Processing document with {model_name}..."
379
 
380
  # Get or initialize agent
@@ -410,7 +442,7 @@ def create_interface():
410
  error_msg = f"Error querying documents: {str(e)}"
411
  return error_msg, error_msg
412
 
413
- # Set up event handlers - UPDATED to include status_message updates
414
  chat_button.click(
415
  handle_chat,
416
  inputs=[chat_input, selected_model],
 
138
  supported_exts = ['.pdf', '.docx', '.pptx', '.xlsx', '.xls']
139
 
140
  if file_ext not in supported_exts:
141
+ # Clean up temp files before returning
142
+ if os.path.exists(temp_path):
143
+ os.remove(temp_path)
144
+ if os.path.exists(temp_dir):
145
+ os.rmdir(temp_dir)
146
+ return f"Unsupported file type: {file_ext}. Please upload one of: {', '.join(supported_exts)}"
147
 
148
  # Select appropriate loader
149
+ try:
150
+ if file_ext == '.pdf':
151
+ loader = PyPDFLoader(temp_path)
152
+ elif file_ext == '.docx':
153
+ loader = Docx2txtLoader(temp_path)
154
+ elif file_ext == '.pptx':
155
+ loader = UnstructuredPowerPointLoader(temp_path)
156
+ elif file_ext in ['.xlsx', '.xls']:
157
+ loader = UnstructuredExcelLoader(temp_path)
158
+
159
+ # Load and process document
160
+ documents.extend(loader.load())
161
+ except Exception as e:
162
+ # Clean up temp files
163
+ if os.path.exists(temp_path):
164
+ os.remove(temp_path)
165
+ if os.path.exists(temp_dir):
166
+ os.rmdir(temp_dir)
167
+ return f"Error loading document content: {str(e)}"
168
+
169
+ # Cleanup temp files
170
+ if os.path.exists(temp_path):
171
+ os.remove(temp_path)
172
+ if os.path.exists(temp_dir):
173
+ os.rmdir(temp_dir)
174
 
175
  # Split documents
176
+ if not documents:
177
+ return "No content could be extracted from the document."
178
+
179
  text_splitter = RecursiveCharacterTextSplitter(
180
  chunk_size=1000,
181
  chunk_overlap=200
182
  )
183
  splits = text_splitter.split_documents(documents)
184
 
185
+ if not splits:
186
+ return "Document was processed but no text content was found."
187
+
188
  # Create vector store
189
  api_keys = get_api_keys()
190
  embeddings = OpenAIEmbeddings(openai_api_key=api_keys["openai_key"])
191
  self.document_store = FAISS.from_documents(splits, embeddings)
192
 
193
+ return f"Document '{file.name}' processed successfully with {len(splits)} text chunks."
194
  except Exception as e:
195
  return f"Error processing document: {str(e)}"
196
 
 
199
  if not self.document_store:
200
  return "Please upload and process documents first"
201
 
202
+ if not query.strip():
203
+ return "Please provide a non-empty query."
204
+
205
  try:
206
  qa_chain = RetrievalQA.from_chain_type(
207
  llm=self.llm,
 
216
  source_docs = response.get('source_documents', [])
217
 
218
  if source_docs:
219
+ result += "\n\n**Sources:**\n"
220
  for i, doc in enumerate(source_docs, 1):
221
+ result += f"{i}. {doc.metadata.get('source', 'Unknown source')}, page {doc.metadata.get('page', 'N/A')}\n"
222
 
223
  return result
224
  except Exception as e:
 
278
 
279
  with gr.Row():
280
  with gr.Column(scale=1):
281
+ # Updated file component with clearer instructions
282
  file_upload = gr.File(
283
  label="Upload Audit Documents",
284
+ file_types=["pdf", "docx", "pptx", "xlsx", "xls"],
285
+ type="binary"
286
  )
287
+ gr.Markdown("Supported formats: PDF, DOCX, PPTX, XLSX, XLS")
288
 
289
  # Use tabs for model selection instead of dropdown
290
  with gr.Tabs() as model_tabs:
 
333
 
334
  model_tabs.select(update_selected_model, outputs=[selected_model])
335
 
336
+ # Get or initialize agent and return both agent and status message
337
  def get_or_initialize_agent(model_name):
338
  """Initialize an agent if not already initialized and return a status message"""
339
  init_message = f"Initializing {model_name}..."
 
395
  error_msg = f"Error solving problem: {str(e)}"
396
  return error_msg, error_msg
397
 
398
+ # Handle file upload with improved validation
399
  def handle_file_upload(file, model_name):
400
  if file is None:
401
  return "No file uploaded. Please upload a file."
402
 
403
+ # Check file extension
404
+ file_ext = os.path.splitext(file.name.lower())[1] if file.name else ""
405
+ supported_exts = ['.pdf', '.docx', '.pptx', '.xlsx', '.xls']
406
+
407
+ if file_ext not in supported_exts:
408
+ return f"Invalid file type: {file_ext}. Please upload a file with one of these extensions: {', '.join(supported_exts)}"
409
+
410
  status = f"Processing document with {model_name}..."
411
 
412
  # Get or initialize agent
 
442
  error_msg = f"Error querying documents: {str(e)}"
443
  return error_msg, error_msg
444
 
445
+ # Set up event handlers
446
  chat_button.click(
447
  handle_chat,
448
  inputs=[chat_input, selected_model],