prernajeet01 committed on
Commit
56f99a9
·
verified ·
1 Parent(s): 567ec8e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +423 -0
app.py ADDED
@@ -0,0 +1,423 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sys
import tempfile
import warnings
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any, Set, Union, Optional

import gradio as gr
import numpy as np
import pytesseract
from pdf2image import convert_from_path

from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
22
+
23
# Suppress FutureWarning messages process-wide so they do not clutter the
# application's console output.
warnings.filterwarnings("ignore", category=FutureWarning)
25
+
26
class RiskLevel:
    """Human-readable severity labels for rated risks.

    The attributes are plain strings, listed from least to most severe.
    """

    LOW = "Low"
    MEDIUM = "Medium"
    HIGH = "High"
    CRITICAL = "Critical"
31
+
32
class DocumentProcessor:
    """Enhanced document processing with OCR support.

    Loads ``.pdf``, ``.docx`` and ``.txt`` files into LangChain ``Document``
    objects and splits them into chunks. PDFs that cannot be read, or that
    yield no text, are re-processed page by page with Tesseract OCR.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Create the processor and its text splitter.

        Args:
            chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
            chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    def process_document(self, content: bytes, doc_type: str) -> List[Document]:
        """Process document content based on type.

        Args:
            content: Raw bytes of the document.
            doc_type: File suffix including the dot (e.g. ``".pdf"``); it is
                used as the temp file's suffix and therefore selects the
                loader in :meth:`load_document`.

        Returns:
            The document split into chunks.

        Raises:
            ValueError: If ``doc_type`` is not a supported suffix.
        """
        # The loaders take a path, so the bytes are written to a temp file.
        # delete=False keeps the file on disk after the ``with`` block closes
        # it; the ``finally`` below removes it once loading is done.
        with tempfile.NamedTemporaryFile(delete=False, suffix=doc_type) as temp_file:
            temp_file.write(content)
            temp_file_path = temp_file.name

        try:
            documents = self.load_document(temp_file_path)
            return self.split_documents(documents)
        finally:
            os.unlink(temp_file_path)

    def load_document(self, file_path: Union[str, Path]) -> List[Document]:
        """Load document using appropriate loader with OCR support.

        Args:
            file_path: Path of the file to load; the lower-cased suffix picks
                the loader.

        Returns:
            The documents produced by the loader, or by OCR for PDFs without
            extractable text.

        Raises:
            ValueError: If the suffix is not ``.pdf``, ``.docx`` or ``.txt``.
        """
        file_path = Path(file_path)
        suffix = file_path.suffix.lower()

        if suffix == '.pdf':
            # Try normal PDF loading first. Any loader failure is treated as
            # "needs OCR"; ``Exception`` (not a bare ``except``) so that
            # KeyboardInterrupt/SystemExit still propagate.
            try:
                documents = PyPDFLoader(str(file_path)).load()
            except Exception:
                return self._process_pdf_with_ocr(file_path)

            if any(doc.page_content.strip() for doc in documents):
                return documents
            # The PDF parsed but every page is blank: fall back to OCR.
            return self._process_pdf_with_ocr(file_path)

        if suffix == '.docx':
            return Docx2txtLoader(str(file_path)).load()

        if suffix == '.txt':
            return TextLoader(str(file_path)).load()

        raise ValueError(f"Unsupported file type: {suffix}")

    def _process_pdf_with_ocr(self, file_path: Path) -> List[Document]:
        """Process PDF with OCR using Tesseract.

        Each page is rendered to an image and passed to
        ``pytesseract.image_to_string``. Pages whose OCR text is blank are
        skipped; the ``page`` metadata is 1-based.
        """
        documents = []
        page_images = convert_from_path(str(file_path))

        for page_number, image in enumerate(page_images, start=1):
            text = pytesseract.image_to_string(image)
            if text.strip():
                documents.append(Document(
                    page_content=text,
                    metadata={"source": str(file_path), "page": page_number},
                ))

        return documents

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """Split documents into chunks."""
        return self.text_splitter.split_documents(documents)
98
+
99
class AuditCopilot:
    """Integrated Audit Copilot with multi-functionality.

    Keeps a FAISS vector store of processed documents and exposes three
    features on top of it: a conversational retrieval chat
    (:meth:`get_response`), a compliance-focused question/answer prompt
    (:meth:`get_compliance_response`) and a one-shot risk assessment of a
    single file (:meth:`generate_risk_assessment`).
    """

    # Loaded into the knowledge base at start-up when it exists in the
    # current working directory.
    _DEFAULT_DOCUMENT = "IAASB-Drafting-Principles-Guidelines.pdf"

    # Number of chunks fetched from the vector store per query.
    _RETRIEVER_K = 4

    # Upper bound on the chunks sent to the LLM for a risk assessment, to
    # keep the prompt size bounded for large documents.
    _MAX_ASSESSMENT_CHUNKS = 15

    _COMPLIANCE_TEMPLATE = (
        "You are Amy, an audit copilot and compliance expert. Answer the following question based on the provided context:\n"
        "\n"
        "Context: {context}\n"
        "Question: {question}\n"
        "\n"
        "Provide a detailed answer that:\n"
        "1. Addresses compliance requirements and regulations\n"
        "2. Identifies potential risks and their severity\n"
        "3. Suggests mitigation strategies where applicable\n"
        "4. Cites specific sources and regulations\n"
        "\n"
        "Response:"
    )

    _RISK_TEMPLATE = (
        "You are Amy, an audit copilot specializing in risk assessment. Analyze the following audit document content and provide a comprehensive structured risk assessment:\n"
        "\n"
        "Content: {content}\n"
        "\n"
        "Provide a structured risk assessment with the following components:\n"
        "1. Executive Summary: Brief overview of the document and key findings (2-3 sentences)\n"
        "2. Key Risk Factors: Identify 3-5 specific risks with clear severity ratings (Low/Medium/High/Critical)\n"
        "3. Compliance Issues: List any specific compliance concerns with relevant regulatory references\n"
        "4. Recommended Actions: Provide actionable mitigation strategies with clear prioritization\n"
        "5. Implementation Timeline: Suggest realistic timeframes for addressing each risk area\n"
        "\n"
        "Format your assessment with clear headers and bullet points where appropriate. Be specific, concise, and actionable.\n"
        "\n"
        "Assessment:"
    )

    def __init__(self, openai_api_key: Optional[str] = None):
        """Set up embeddings, the LLM and (optionally) the default document.

        Args:
            openai_api_key: OpenAI key; when omitted or empty the
                ``OPENAI_API_KEY`` environment variable is used.

        Raises:
            ValueError: If no API key is available from either place.
        """
        self.openai_api_key = openai_api_key or os.getenv('OPENAI_API_KEY')
        if not self.openai_api_key:
            raise ValueError("OPENAI_API_KEY environment variable is not set")

        self.embeddings = OpenAIEmbeddings(openai_api_key=self.openai_api_key)
        self.vector_store = None
        self.chain = None
        self.chat_history = []
        self.doc_processor = DocumentProcessor()

        # Initialize LLM model - using GPT-3.5-turbo for all functionalities
        self.llm = ChatOpenAI(
            model_name="gpt-3.5-turbo",
            temperature=0,
            openai_api_key=self.openai_api_key,
        )

        # Try to initialize with the default document if available. A failure
        # here must never abort construction.
        default_pdf = self._DEFAULT_DOCUMENT
        if os.path.exists(default_pdf):
            try:
                status = self.process_documents([default_pdf]).get(default_pdf, "")
            except Exception as e:
                status = str(e)
            # process_documents reports failures through its result mapping
            # instead of raising, so the status has to be checked explicitly.
            if status == "Success":
                print(f"Successfully initialized with {default_pdf}")
            else:
                print(f"Note: Could not initialize with default document: {status}")

    def process_documents(self, file_paths: List[str]) -> Dict[str, str]:
        """Process documents and add to knowledge base.

        Args:
            file_paths: Paths of the files to ingest.

        Returns:
            A mapping of each path to ``"Success"`` or ``"Error: <message>"``.
            Per-file failures are captured in the mapping, not raised.
        """
        results = {}

        for file_path in file_paths:
            try:
                with open(file_path, 'rb') as f:
                    content = f.read()

                doc_type = Path(file_path).suffix
                texts = self.doc_processor.process_document(content, doc_type)
                if not texts:
                    # Nothing to embed; fail with a readable message instead
                    # of handing an empty list to the vector store.
                    raise ValueError("No text content could be extracted")

                # The processor loads from a throw-away temp copy, so the
                # loader-assigned "source" is a temp path that no longer
                # exists. Point it back at the caller's file so cited
                # sources are meaningful.
                for doc in texts:
                    doc.metadata["source"] = str(file_path)

                if self.vector_store is None:
                    self.vector_store = FAISS.from_documents(texts, self.embeddings)
                else:
                    self.vector_store.add_documents(texts)

                # Initialize conversation chain whenever vector store is updated
                self._initialize_conversation_chain()

                results[file_path] = "Success"
            except Exception as e:
                results[file_path] = f"Error: {str(e)}"

        return results

    def _initialize_conversation_chain(self):
        """Initialize or reinitialize the conversation chain.

        Does nothing while no vector store exists. Each call installs a new
        ``ConversationBufferMemory`` on the new chain.
        """
        if self.vector_store is None:
            return

        memory = ConversationBufferMemory(
            memory_key="chat_history",
            return_messages=True,
        )

        self.chain = ConversationalRetrievalChain.from_llm(
            llm=self.llm,
            retriever=self.vector_store.as_retriever(
                search_kwargs={"k": self._RETRIEVER_K}
            ),
            memory=memory,
            verbose=True,
        )

    def get_response(self, question: str) -> str:
        """Get conversational response from the chain.

        Args:
            question: The user's question.

        Returns:
            The chain's answer, or a user-facing message when no documents
            are loaded, the question is blank/not a string, or the chain
            fails. This method does not raise for chain errors.
        """
        if not self.chain:
            return "I don't have any documents to work with yet. Please upload audit documents first."

        if not isinstance(question, str) or not question.strip():
            return "Please provide a valid question."

        try:
            response = self.chain.invoke({"question": question})

            if not response or 'answer' not in response:
                return "I'm unable to generate a response. Please try again."

            self.chat_history.append((question, response['answer']))
            return response['answer']

        except Exception as e:
            error_msg = f"Error generating response: {str(e)}"
            print(error_msg)  # For logging
            return error_msg

    def get_compliance_response(self, query: str) -> Dict[str, Any]:
        """Generate compliance-focused response to query.

        Args:
            query: The compliance or regulatory question.

        Returns:
            ``{"answer": <str>, "sources": <set of source file names>}``.

        Raises:
            ValueError: If ``query`` is not a non-blank string.
            RuntimeError: If no documents have been processed yet.
        """
        if not isinstance(query, str) or not query.strip():
            raise ValueError("Query cannot be empty")

        if self.vector_store is None:
            raise RuntimeError("No compliance documents have been processed yet")

        retriever = self.vector_store.as_retriever(
            search_kwargs={"k": self._RETRIEVER_K}
        )

        # Retrieve once and reuse the result for both the prompt context and
        # the reported sources, so the two always agree.
        source_docs = retriever.invoke(query)
        context = "\n\n".join(doc.page_content for doc in source_docs)

        prompt = ChatPromptTemplate.from_template(self._COMPLIANCE_TEMPLATE)
        chain = prompt | self.llm | StrOutputParser()
        answer = chain.invoke({"context": context, "question": query})

        return {
            "answer": answer,
            "sources": self._format_sources(source_docs),
        }

    def generate_risk_assessment(self, file_path: str) -> Dict[str, Any]:
        """Generate risk assessment for a specific document using GPT-3.5-turbo.

        Args:
            file_path: Path of the document to assess.

        Returns:
            ``{"assessment": <str>, "document": <file name>,
            "timestamp": <ISO-8601 local time>}``.

        Raises:
            RuntimeError: On any failure (unreadable file, unsupported type,
                no extractable text, LLM error); the original exception is
                attached as ``__cause__``.
        """
        try:
            with open(file_path, 'rb') as f:
                content = f.read()

            texts = self.doc_processor.process_document(content, Path(file_path).suffix)
            if not texts:
                # Without text the model would be asked to assess nothing.
                raise ValueError("No text content could be extracted")

            prompt = ChatPromptTemplate.from_template(self._RISK_TEMPLATE)

            # Only the leading chunks are sent, to bound the prompt size.
            full_content = "\n".join(
                doc.page_content for doc in texts[:self._MAX_ASSESSMENT_CHUNKS]
            )

            chain = prompt | self.llm | StrOutputParser()
            assessment = chain.invoke({"content": full_content})

            return {
                "assessment": assessment,
                "document": Path(file_path).name,
                "timestamp": datetime.now().isoformat(),
            }
        except Exception as e:
            raise RuntimeError(f"Risk assessment failed: {str(e)}") from e

    def _format_sources(self, source_documents: List["Document"]) -> Set[str]:
        """Format source references.

        Returns the set of file names taken from each document's ``source``
        metadata; documents without a ``source`` entry are skipped.
        """
        return {
            Path(doc.metadata['source']).name
            for doc in source_documents
            if doc.metadata.get('source')
        }
291
+
292
def create_gradio_interface():
    """Create Gradio interface for the integrated audit copilot.

    Builds an ``AuditCopilot`` from the ``OPENAI_API_KEY`` environment
    variable and wires it to four tabs: document processing, conversation,
    compliance query and risk assessment.

    Returns:
        The assembled ``gr.Blocks`` app (not yet launched).

    Raises:
        Exception: Any error raised while creating the copilot or the UI is
            printed and then re-raised unchanged.
    """
    try:
        # Get OpenAI API key
        api_key = os.getenv("OPENAI_API_KEY")

        # Initialize copilot
        copilot = AuditCopilot(api_key)

        def upload_path(uploaded):
            """Return the filesystem path of a Gradio upload.

            Accepts either an object exposing ``.name`` or a plain path
            string, so the handlers work with both shapes.
            """
            return getattr(uploaded, "name", uploaded)

        # NOTE(review): the nesting of buttons/outputs relative to gr.Row was
        # reconstructed from an unindented listing - confirm the layout.
        with gr.Blocks(title="Amy - Your Audit Copilot") as demo:
            gr.Markdown("# Amy - Your Audit Copilot")
            gr.Markdown("I can help you with audit document analysis, compliance questions, and risk assessment.")

            with gr.Tab("Document Processing"):
                with gr.Row():
                    file_input = gr.File(
                        file_count="multiple",
                        label="Upload Audit Documents (PDF, DOCX, TXT)"
                    )
                upload_button = gr.Button("Process Documents")
                upload_output = gr.Textbox(label="Processing Status")

            with gr.Tab("Conversation"):
                # Chat section
                chatbot = gr.Chatbot(label="Conversation with Amy")
                msg = gr.Textbox(
                    label="Ask me anything about your audit documents",
                    placeholder="Type your question here..."
                )
                clear = gr.Button("Clear Chat")

            with gr.Tab("Compliance Query"):
                with gr.Row():
                    query_input = gr.Textbox(
                        lines=3,
                        label="Enter your compliance or regulatory query"
                    )
                query_button = gr.Button("Submit Query")
                query_output = gr.Textbox(
                    lines=10,
                    label="Amy's Response"
                )

            with gr.Tab("Risk Assessment"):
                with gr.Row():
                    assessment_file = gr.File(
                        label="Select Document for Risk Assessment"
                    )
                assess_button = gr.Button("Generate Risk Assessment")
                assessment_output = gr.Textbox(
                    lines=15,
                    label="Risk Assessment Report"
                )

            # Set up event handlers
            def handle_file_upload(files):
                """Ingest the uploaded files and report one status line each."""
                try:
                    if not files:
                        return "No files uploaded."

                    results = copilot.process_documents(
                        [upload_path(f) for f in files]
                    )

                    output_lines = []
                    for file_path, status in results.items():
                        file_name = Path(file_path).name
                        if status == "Success":
                            output_lines.append(f"✓ Successfully processed {file_name}")
                        else:
                            output_lines.append(f"❌ {file_name}: {status}")

                    return "\n".join(output_lines)
                except Exception as e:
                    return f"Error: {str(e)}"

            def respond(message, chat_history):
                """Append the copilot's reply to the chat and clear the input."""
                # Guard against a missing history or message.
                chat_history = chat_history or []
                if not message or not message.strip():
                    return "", chat_history
                bot_message = copilot.get_response(message)
                chat_history.append((message, bot_message))
                return "", chat_history

            def clear_chat():
                """Reset the chat widget and the copilot's conversation state.

                Clearing only the widget would leave earlier turns in the
                chain's memory, where they would still influence later
                answers.
                """
                copilot.chat_history.clear()
                memory = getattr(copilot.chain, "memory", None)
                if memory is not None:
                    memory.clear()
                return None

            def handle_compliance_query(query):
                """Answer a compliance query and append its sources."""
                try:
                    result = copilot.get_compliance_response(query)
                    response = result["answer"]
                    if result["sources"]:
                        response += f"\n\nSources: {', '.join(result['sources'])}"
                    return response
                except Exception as e:
                    return f"Error: {str(e)}"

            def handle_risk_assessment(file):
                """Run a risk assessment on the selected file."""
                try:
                    if not file:
                        return "No file selected for risk assessment."

                    result = copilot.generate_risk_assessment(upload_path(file))
                    return f"Risk Assessment for {result['document']}\n\n{result['assessment']}"
                except Exception as e:
                    return f"Error: {str(e)}"

            # Connect event handlers
            upload_button.click(
                fn=handle_file_upload,
                inputs=[file_input],
                outputs=[upload_output]
            )

            msg.submit(respond, [msg, chatbot], [msg, chatbot])
            clear.click(clear_chat, None, chatbot, queue=False)

            query_button.click(
                fn=handle_compliance_query,
                inputs=[query_input],
                outputs=[query_output]
            )

            assess_button.click(
                fn=handle_risk_assessment,
                inputs=[assessment_file],
                outputs=[assessment_output]
            )

        return demo

    except Exception as e:
        print(f"Error creating interface: {str(e)}")
        raise
417
+
418
# Script entry point: build the UI and serve it with a public share link.
if __name__ == "__main__":
    try:
        demo = create_gradio_interface()
        demo.launch(share=True)
    except Exception as e:
        print(f"Error launching application: {str(e)}")
        # Exit non-zero so a supervisor or hosting platform can tell that the
        # app failed to start instead of seeing a clean exit.
        sys.exit(1)