prernajeet01 commited on
Commit
95fb8ff
·
verified ·
1 Parent(s): 251a3a3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +346 -0
app.py CHANGED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ os.environ["OPENAI_API_KEY"] = "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"
3
+
4
+ from fastapi import FastAPI
5
+ from fastapi.middleware.cors import CORSMiddleware
6
+ import gradio as gr
7
+ import os
8
+ import tempfile
9
+ import warnings
10
+ from pathlib import Path
11
+ from typing import List, Dict, Any, Set, Union
12
+ from datetime import datetime
13
+ import pytesseract
14
+ from pdf2image import convert_from_path
15
+ import numpy as np
16
+
17
+ from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
18
+ from langchain_openai import ChatOpenAI, OpenAIEmbeddings
19
+ from langchain_core.output_parsers import StrOutputParser
20
+ from langchain_core.prompts import ChatPromptTemplate
21
+ from langchain_core.runnables import RunnablePassthrough
22
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
23
+ from langchain_community.vectorstores import FAISS
24
+ from langchain_core.documents import Document
25
+
26
+ # Suppress warnings
27
+ warnings.filterwarnings("ignore", category=FutureWarning)
28
+
29
class RiskLevel:
    """Severity labels used to grade compliance findings."""

    LOW, MEDIUM, HIGH, CRITICAL = "Low", "Medium", "High", "Critical"
34
+
35
class DocumentProcessor:
    """Enhanced document processing with OCR support.

    Loads PDF/DOCX/TXT files — falling back to Tesseract OCR for PDFs that
    yield no extractable text — and splits the result into overlapping
    chunks suitable for embedding.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )

    def process_document(self, content: bytes, doc_type: str) -> List[Document]:
        """Write raw bytes to a temp file, load it, and return chunked documents.

        Args:
            content: Raw file bytes.
            doc_type: File suffix (e.g. ".pdf") used to select the loader.
        """
        # delete=False so the loader can reopen the file by path (required on
        # Windows); the finally block below removes it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=doc_type) as temp_file:
            temp_file.write(content)
            temp_file_path = temp_file.name

        try:
            documents = self.load_document(temp_file_path)
            return self.split_documents(documents)
        finally:
            os.unlink(temp_file_path)

    def load_document(self, file_path: Union[str, Path]) -> List[Document]:
        """Load a document using the appropriate loader, with OCR fallback for PDFs.

        Raises:
            ValueError: If the file suffix is not .pdf, .docx, or .txt.
        """
        file_path = Path(file_path)
        suffix = file_path.suffix.lower()

        if suffix == '.pdf':
            # Try normal text extraction first; scanned/image-only PDFs load
            # "successfully" but produce empty pages, so treat that as failure.
            try:
                loader = PyPDFLoader(str(file_path))
                documents = loader.load()
                if not any(doc.page_content.strip() for doc in documents):
                    raise ValueError("No text content found")
                return documents
            # Fix: was a bare `except:` — narrowed so KeyboardInterrupt and
            # SystemExit propagate instead of triggering the OCR fallback.
            except Exception:
                return self._process_pdf_with_ocr(file_path)
        elif suffix == '.docx':
            loader = Docx2txtLoader(str(file_path))
            return loader.load()
        elif suffix == '.txt':
            loader = TextLoader(str(file_path))
            return loader.load()
        else:
            raise ValueError(f"Unsupported file type: {suffix}")

    def _process_pdf_with_ocr(self, file_path: Path) -> List[Document]:
        """OCR each rendered PDF page with Tesseract, skipping blank pages."""
        documents = []
        images = convert_from_path(str(file_path))

        for i, image in enumerate(images):
            text = pytesseract.image_to_string(image)
            if text.strip():
                documents.append(Document(
                    page_content=text,
                    metadata={"source": str(file_path), "page": i + 1}
                ))

        return documents

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """Split documents into overlapping chunks via the configured splitter."""
        return self.text_splitter.split_documents(documents)
101
+
102
class ComplianceAssistant:
    """Compliance and Audit Assistant with risk assessment capabilities.

    Maintains a FAISS vector store built from uploaded documents and answers
    compliance queries with a GPT-4 retrieval chain.
    """

    def __init__(self, openai_api_key: str):
        self.openai_api_key = openai_api_key
        self.embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
        # Created lazily by the first successful process_documents() call.
        self.vector_store = None
        self.doc_processor = DocumentProcessor()
        # temperature=0 for deterministic, citation-oriented answers.
        self.llm = ChatOpenAI(
            temperature=0,
            model_name="gpt-4",
            openai_api_key=openai_api_key
        )

    def process_documents(self, file_paths: List[str]) -> Dict[str, str]:
        """Process documents and add to knowledge base.

        Returns a mapping of file path -> "Success" or "Error: <detail>".
        A failure on one file does not abort processing of the rest.
        """
        results = {}

        for file_path in file_paths:
            try:
                with open(file_path, 'rb') as f:
                    content = f.read()

                doc_type = Path(file_path).suffix
                texts = self.doc_processor.process_document(content, doc_type)

                # First batch creates the index; later batches are appended.
                if self.vector_store is None:
                    self.vector_store = FAISS.from_documents(texts, self.embeddings)
                else:
                    self.vector_store.add_documents(texts)

                results[file_path] = "Success"
            except Exception as e:
                results[file_path] = f"Error: {str(e)}"

        return results

    def get_compliance_response(self, query: str) -> Dict[str, Any]:
        """Generate compliance-focused response to query.

        Returns:
            {"answer": <LLM response str>, "sources": <set of file names>}.

        Raises:
            ValueError: If the query is empty or whitespace-only.
            RuntimeError: If no documents have been indexed yet.
        """
        if not query.strip():
            raise ValueError("Query cannot be empty")

        if self.vector_store is None:
            raise RuntimeError("No compliance documents have been processed yet")

        # Create the retrieval chain (top-4 similarity hits).
        retriever = self.vector_store.as_retriever(search_kwargs={"k": 4})

        # Create the compliance-focused prompt template
        template = """You are a compliance and audit expert. Answer the following question based on the provided context:

Context: {context}
Question: {question}

Provide a detailed answer that:
1. Addresses compliance requirements and regulations
2. Identifies potential risks and their severity
3. Suggests mitigation strategies where applicable
4. Cites specific sources and regulations

Response:"""

        prompt = ChatPromptTemplate.from_template(template)

        # Create the chain: the dict maps the raw query string into the two
        # prompt variables (retriever fills {context}, passthrough {question}).
        chain = (
            {
                "context": retriever,
                "question": RunnablePassthrough()
            }
            | prompt
            | self.llm
            | StrOutputParser()
        )

        # Get response
        answer = chain.invoke(query)

        # Get source documents using the new invoke method.
        # NOTE(review): this retrieves a second time (the chain already ran the
        # retriever), costing an extra embedding call per query — confirm this
        # duplication is intended before restructuring.
        source_docs = retriever.invoke(query)

        return {
            "answer": answer,
            "sources": self._format_sources(source_docs)
        }

    def generate_risk_assessment(self, document_path: str) -> Dict[str, Any]:
        """Generate risk assessment for a specific document.

        Reads the file from disk, feeds its full text to the LLM with a
        structured-assessment prompt, and returns the report plus metadata.

        Raises:
            RuntimeError: Wrapping any failure (I/O, parsing, LLM call).
        """
        try:
            with open(document_path, 'rb') as f:
                content = f.read()

            texts = self.doc_processor.process_document(content, Path(document_path).suffix)

            # Create risk assessment prompt
            template = """Analyze the following audit document content and provide a structured risk assessment:

Content: {content}

Provide:
1. Executive Summary
2. Key Risk Factors (with severity ratings)
3. Compliance Issues
4. Recommended Actions
5. Timeline for Remediation

Assessment:"""

            prompt = ChatPromptTemplate.from_template(template)

            # Combine all text content.
            # NOTE(review): no truncation — a large document may exceed the
            # model's context window; verify upstream size limits.
            full_content = "\n".join([doc.page_content for doc in texts])

            # Generate assessment
            chain = prompt | self.llm | StrOutputParser()
            assessment = chain.invoke({"content": full_content})

            return {
                "assessment": assessment,
                "document": Path(document_path).name,
                "timestamp": datetime.now().isoformat()
            }
        except Exception as e:
            raise RuntimeError(f"Risk assessment failed: {str(e)}")

    def _format_sources(self, source_documents: List[Document]) -> Set[str]:
        """Format source references as a de-duplicated set of file names.

        Assumes every document carries a 'source' metadata key (the loaders
        and the OCR path both set it); raises KeyError otherwise.
        """
        return {Path(doc.metadata['source']).name for doc in source_documents}
230
+
231
def create_gradio_interface(assistant: ComplianceAssistant) -> gr.Blocks:
    """Create the Gradio interface for the compliance assistant.

    Builds three tabs (document ingestion, compliance Q&A, risk assessment)
    whose handlers delegate to the given ComplianceAssistant. All handler
    errors are caught and returned as user-visible text.
    """

    # Fix: the handlers previously annotated their inputs with the private
    # tempfile._TemporaryFileWrapper type; Gradio only guarantees objects
    # exposing a `.name` path, so use Any-based hints instead.
    def handle_file_upload(files: List[Any]) -> str:
        """Index the uploaded files and report per-file status."""
        try:
            if not files:
                return "No files uploaded."

            results = assistant.process_documents([f.name for f in files])

            output_lines = []
            for file_path, status in results.items():
                file_name = Path(file_path).name
                if status == "Success":
                    output_lines.append(f"✓ Successfully processed {file_name}")
                else:
                    output_lines.append(f"❌ {file_name}: {status}")

            return "\n".join(output_lines)
        except Exception as e:
            return f"Error: {str(e)}"

    def handle_compliance_query(query: str) -> str:
        """Answer a compliance query, appending source citations if any."""
        try:
            result = assistant.get_compliance_response(query)
            response = result["answer"]
            if result["sources"]:
                response += f"\n\nSources: {', '.join(result['sources'])}"
            return response
        except Exception as e:
            return f"Error: {str(e)}"

    def handle_risk_assessment(file: Any) -> str:
        """Run a risk assessment on a single selected file."""
        try:
            if not file:
                return "No file selected for risk assessment."

            result = assistant.generate_risk_assessment(file.name)
            return f"Risk Assessment for {result['document']}\n\n{result['assessment']}"
        except Exception as e:
            return f"Error: {str(e)}"

    # Create interface
    with gr.Blocks(title="Compliance and Audit Assistant") as interface:
        gr.Markdown("# Compliance and Audit Assistant")

        with gr.Tab("Document Processing"):
            with gr.Row():
                file_input = gr.File(
                    file_count="multiple",
                    label="Upload Compliance Documents (PDF, DOCX, TXT)"
                )
            upload_button = gr.Button("Process Documents")
            upload_output = gr.Textbox(label="Processing Status")

        with gr.Tab("Compliance Query"):
            with gr.Row():
                query_input = gr.Textbox(
                    lines=3,
                    label="Enter your compliance or regulatory query"
                )
            query_button = gr.Button("Submit Query")
            query_output = gr.Textbox(
                lines=10,
                label="Assistant Response"
            )

        with gr.Tab("Risk Assessment"):
            with gr.Row():
                assessment_file = gr.File(
                    label="Select Document for Risk Assessment"
                )
            assess_button = gr.Button("Generate Risk Assessment")
            assessment_output = gr.Textbox(
                lines=15,
                label="Risk Assessment Report"
            )

        # Set up event handlers
        upload_button.click(
            fn=handle_file_upload,
            inputs=[file_input],
            outputs=[upload_output]
        )

        query_button.click(
            fn=handle_compliance_query,
            inputs=[query_input],
            outputs=[query_output]
        )

        assess_button.click(
            fn=handle_risk_assessment,
            inputs=[assessment_file],
            outputs=[assessment_output]
        )

    return interface
329
+
330
def main():
    """Main function to run the compliance assistant.

    Resolves the OpenAI API key (environment first, interactive prompt as a
    fallback), builds the assistant, and launches the Gradio UI.
    """
    # Prefer the environment; secrets must never be hard-coded in source.
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        # Fix: input() echoed the secret key to the terminal; getpass reads
        # it without echo. Stored in the environment for downstream clients.
        from getpass import getpass
        api_key = getpass("Please enter your OpenAI API key: ")
        os.environ["OPENAI_API_KEY"] = api_key

    # Initialize assistant
    assistant = ComplianceAssistant(api_key)

    # Launch interface. NOTE: share=True publishes a public Gradio link and
    # debug=True enables verbose logging — both intended for demo use.
    interface = create_gradio_interface(assistant)
    interface.launch(share=True, debug=True)

if __name__ == "__main__":
    main()