nasreshsuguru committed on
Commit
bed2e99
·
verified ·
1 Parent(s): 16d2840

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +252 -102
app.py CHANGED
@@ -1,105 +1,255 @@
1
- from flask import Flask, request, render_template_string
2
- from transformers import AutoTokenizer, AutoModelForTokenClassification
3
- from transformers import pipeline
4
  import re
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
# Flask application object; routes are registered on it with @app.route.
app = Flask(__name__)

# Load NER model and tokenizer (allow online download or use cached if available).
# This happens once at import time, so every request reuses the same objects.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER", local_files_only=False)
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER", local_files_only=False)
# Token-classification pipeline built from the model and tokenizer loaded above.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)
12
-
13
- # HTML template with table output
14
- html_template = """
15
- <!DOCTYPE html>
16
- <html>
17
- <head>
18
- <title>Vendor Document Scorer</title>
19
- <style>
20
- body { font-family: Arial, sans-serif; margin: 20px; background-color: #f9f9f9; }
21
- h2 { color: #333; }
22
- .upload-form { margin-bottom: 20px; }
23
- table {
24
- width: 100%;
25
- border-collapse: collapse;
26
- margin-top: 20px;
27
- background-color: white;
28
- box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
29
- }
30
- th, td {
31
- padding: 12px;
32
- text-align: left;
33
- border-bottom: 1px solid #ddd;
34
- }
35
- th {
36
- background-color: #4a90e2;
37
- color: white;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  }
39
- tr:nth-child(even) { background-color: #f2f2f2; }
40
- tr:hover { background-color: #ddd; }
41
- </style>
42
- </head>
43
- <body>
44
- <h2>Upload Vendor Document</h2>
45
- <div class="upload-form">
46
- <form method="post" enctype="multipart/form-data">
47
- <input type="file" name="file" accept=".txt,.pdf">
48
- <input type="submit" value="Upload and Score">
49
- </form>
50
- </div>
51
- {% if results %}
52
- <table>
53
- <tr>
54
- <th>Entity Type</th>
55
- <th>Text</th>
56
- <th>Confidence (%)</th>
57
- </tr>
58
- {% for entity in results %}
59
- <tr>
60
- <td>{{ entity['entity'] }}</td>
61
- <td>{{ entity['word'] }}</td>
62
- <td>{{ '%.2f'|format(entity['score']*100) }}</td>
63
- </tr>
64
- {% endfor %}
65
- </table>
66
- {% endif %}
67
- </body>
68
- </html>
69
- """
70
-
71
@app.route('/', methods=['GET', 'POST'])
def upload_file():
    """Render the upload form and, on POST, a table of entities found in the file.

    The uploaded file is decoded as UTF-8 text and passed to ``ner_pipeline``.
    Person, organisation and location entities are listed together with any
    ``d/m/yyyy``-style dates found by a regular expression.

    Returns:
        The rendered ``html_template``; ``results`` is empty for GET requests,
        for a missing/unnamed upload, and for uploads that are not valid UTF-8.
    """
    results = []
    if request.method != 'POST':
        return render_template_string(html_template, results=results)

    file = request.files.get('file')
    if file is None or file.filename == '':
        return render_template_string(html_template, results=results)

    try:
        content = file.read().decode('utf-8')
    except UnicodeDecodeError:
        # The form also accepts binary uploads such as .pdf; those cannot be
        # decoded as text, so show the empty form instead of failing with a 500.
        return render_template_string(html_template, results=results)

    # The model tags persons as "PER" (B-PER / I-PER), not "PERSON"; map the
    # tag to the display name so person entities are no longer filtered out.
    display_names = {'PER': 'PERSON', 'PERSON': 'PERSON', 'ORG': 'ORG', 'LOC': 'LOC'}
    for result in ner_pipeline(content):
        entity_type = result['entity'].replace('B-', '').replace('I-', '')
        if entity_type in display_names:
            results.append({
                'entity': display_names[entity_type],
                'word': result['word'],
                'score': result['score']
            })

    # Add basic date extraction as an example
    for date in re.findall(r'\d{1,2}/\d{1,2}/\d{4}', content):
        results.append({
            'entity': 'DATE',
            'word': date,
            'score': 1.0  # Assign high confidence for regex match
        })
    return render_template_string(html_template, results=results)
103
-
104
- if __name__ == '__main__':
105
- app.run(debug=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
 
 
2
  import re
3
+ import json
4
+ import tempfile
5
+ import gradio as gr
6
+ from paddleocr import PaddleOCR
7
+ import fitz # PyMuPDF
8
+ from simple_salesforce import Salesforce
9
+ from dotenv import load_dotenv
10
+ import logging
11
+ from fastapi import FastAPI, UploadFile, File
12
+ from fastapi.responses import JSONResponse
13
+ import time
14
+ import base64
15
+ from reportlab.lib.pagesizes import letter
16
+ from reportlab.pdfgen import canvas
17
+ from io import BytesIO
18
 
19
# Module-wide logging: INFO and above, one logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Pull variables from a .env file into the environment before reading them below.
load_dotenv()

# Salesforce credentials; each is None when the variable is not set.
SF_USERNAME = os.getenv('SF_USERNAME')
SF_PASSWORD = os.getenv('SF_PASSWORD')
SF_SECURITY_TOKEN = os.getenv('SF_SECURITY_TOKEN')

# Initialize PaddleOCR with better parameters
# (created once at import time and shared by every request).
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False, det_limit_side_len=2000)

# Labels whose presence (case-insensitive substring) analyze_document checks for.
required_values = [
    "Vendor Name",
    "Tax Identification Number (TIN)",
    "Address",
    "Certification Details",
    "Contract Terms",
    "Payment Terms",
    "Signature"
]

# The flag values process_pdf assigns; not referenced elsewhere in the visible code.
VALID_FLAGS = ['Valid', 'Incomplete', 'Missing', 'Invalid']

# FastAPI application that exposes the /process_pdf/ endpoint.
app = FastAPI()
43
+
44
def generate_pdf_from_text(text, vendor_name):
    """Render plain text into an in-memory, letter-sized PDF.

    Lines are written top to bottom; when the next line would fall below the
    bottom margin a new page is started, so long texts are no longer cut off
    at the end of the first page. Lines wider than the page are not wrapped.

    Args:
        text: The text to render; split on ``"\\n"``.
        vendor_name: Not used by this function; kept so existing callers
            keep working.

    Returns:
        A ``BytesIO`` positioned at offset 0 holding the PDF, or ``None`` if
        rendering failed (the error is logged).
    """
    margin = 40
    try:
        pdf_buffer = BytesIO()
        c = canvas.Canvas(pdf_buffer, pagesize=letter)
        _, height = letter
        text_object = c.beginText(margin, height - margin)
        for line in text.split('\n'):
            if text_object.getY() < margin:
                # Page is full: flush it and continue at the top of a new one.
                c.drawText(text_object)
                c.showPage()
                text_object = c.beginText(margin, height - margin)
            text_object.textLine(line)
        c.drawText(text_object)
        c.showPage()
        c.save()
        pdf_buffer.seek(0)
        return pdf_buffer
    except Exception as e:
        logger.error(f"Error generating PDF: {e}")
        return None
60
+
61
def upload_pdf_to_salesforce(sf, pdf_buffer, vendor_name):
    """Store a PDF as a Salesforce ContentVersion and return its download URL.

    Args:
        sf: Salesforce client; its ``ContentVersion.create`` and
            ``sf_instance`` attributes are used.
        pdf_buffer: Buffer whose ``getvalue()`` returns the PDF bytes.
        vendor_name: Used as the prefix of the generated file name.

    Returns:
        The download URL built from the instance host and the new record id,
        or ``None`` if anything fails (the error is logged).
    """
    try:
        payload = base64.b64encode(pdf_buffer.getvalue()).decode('utf-8')
        # The Unix timestamp keeps file names distinct between uploads.
        file_name = f"{vendor_name}_ExtractedText_{int(time.time())}.pdf"
        created = sf.ContentVersion.create({
            "Title": file_name,
            "PathOnClient": file_name,
            "VersionData": payload
        })
        return f"https://{sf.sf_instance}/sfc/servlet.shepherd/version/download/{created['id']}"
    except Exception as e:
        logger.error(f"Error uploading PDF to Salesforce: {e}")
        return None
77
+
78
# Updated Vendor Name Extraction Logic with better handling
def extract_vendor_name(text):
    """Extract the vendor name from OCR text.

    Strategy, in order:
      1. A case-insensitive ``vendor name`` label, optionally followed by
         ``:`` or ``-``; the rest of that text up to the next line break is
         the name. Because ``\\s*`` also matches line breaks, a value on the
         line after the label is picked up as well.
      2. The first line that contains "vendor", has at most five words and is
         longer than three characters.

    Args:
        text: The OCR output; may be empty or ``None``.

    Returns:
        The extracted name, or ``"Unknown Vendor"`` when nothing matches.
    """
    # Full-text dump goes through the logger at DEBUG level rather than
    # print(), so it follows the logging configuration and document contents
    # are not written to stdout unconditionally.
    logger.debug("=== OCR Extracted Text Start ===\n%s\n=== OCR Extracted Text End ===", text)

    if not text or text.isspace():
        logger.warning("Extracted text is empty or whitespace.")
        return "Unknown Vendor"

    # Try regex for "Vendor Name: ..." or similar patterns
    match = re.search(r"(?i)vendor\s*name\s*[:\-]?\s*(.+?)(?:\n|$)", text)
    if match:
        vendor_name = match.group(1).strip()
        if vendor_name:
            return vendor_name

    # Fallback: Look for any line that might contain a vendor name
    for line in text.splitlines():
        line = line.strip()
        if "vendor" in line.lower() and len(line.split()) <= 5 and len(line) > 3:
            return line

    logger.warning("Could not extract a valid vendor name from the text.")
    return "Unknown Vendor"
103
+
104
def analyze_document(document_text):
    """Return the required labels that do not occur in the document.

    Each entry of ``required_values`` is searched for as a case-insensitive
    substring of ``document_text``; the entries not found are returned in
    their original order.
    """
    haystack = document_text.lower()
    return [label for label in required_values if label.lower() not in haystack]
110
+
111
def insert_into_salesforce(vendor_name, extracted_text, category, score, comments, flags):
    """Create a ``Vendor_Scorecard__c`` record for a processed document.

    The vendor is looked up in ``Vendor__c`` by name (the outcome is only
    logged), the extracted text is rendered to a PDF and uploaded, and a
    scorecard record is created that links to the uploaded PDF.

    Args:
        vendor_name: Name extracted from the document.
        extracted_text: Full OCR text; stored as an uploaded PDF.
        category: Value for ``Category_Match__c``.
        score: Value for ``Score__c``.
        comments: Value for ``Comments__c``.
        flags: Value for ``Flags__c``.

    Returns:
        The result of the record creation on success, otherwise a string
        starting with ``"Error:"``. Exceptions are logged, not raised.
    """
    try:
        vendor_name_clean = vendor_name.strip()

        # Validate before logging in so an unusable name costs no API call.
        if not vendor_name_clean or vendor_name_clean.lower() == "unknown vendor":
            logger.warning("Vendor name is invalid or empty. Skipping Salesforce query.")
            return "Error: Invalid vendor name"

        sf = Salesforce(username=SF_USERNAME, password=SF_PASSWORD, security_token=SF_SECURITY_TOKEN)

        # Escape for use inside a SOQL quoted string. Backslashes must be
        # escaped first; otherwise an input ending in "\" would neutralise the
        # escape added for the following quote. The escaped form is used ONLY
        # in the query, so stored names and file names keep the original text.
        soql_name = vendor_name_clean.replace("\\", "\\\\").replace("'", "\\'")
        vendor_record = sf.query(f"SELECT Id FROM Vendor__c WHERE Name = '{soql_name}' LIMIT 1")

        if vendor_record['totalSize'] == 0:
            logger.warning(f"Vendor '{vendor_name_clean}' not found in Vendor__c object!")
        else:
            logger.info(f"Vendor found with ID: {vendor_record['records'][0]['Id']}")

        pdf_buffer = generate_pdf_from_text(extracted_text, vendor_name_clean)
        pdf_url = upload_pdf_to_salesforce(sf, pdf_buffer, vendor_name_clean) if pdf_buffer else None

        result = sf.Vendor_Scorecard__c.create({
            'Vendor_Name__c': vendor_name_clean,
            'Extracted_Text_URL__c': pdf_url or "",
            'Score__c': score,
            'Category_Match__c': category,
            'Comments__c': comments,
            'Flags__c': flags
        })

        logger.info(f"Record inserted successfully with ID: {result.get('id')}")
        return result
    except Exception as e:
        logger.error(f"Error inserting into Salesforce: {e}")
        return f"Error: {e}"
149
+
150
def _score_missing(missing_count):
    """Map the number of missing required values to (category, score, comments, flags)."""
    if missing_count == 0:
        return 'Compliant', 100, 'All values present.', 'Valid'
    if missing_count == 1:
        return 'Partially Compliant', 85, 'One value missing.', 'Incomplete'
    if missing_count == 2:
        return 'Non-Compliant', 60, 'Two values missing.', 'Missing'
    return 'Not Applicable', 40, 'Three or more values missing.', 'Invalid'


def _ocr_page(page):
    """Render one PDF page to a temporary PNG, OCR it and return the text (None if no text)."""
    # Reserve a unique file name up front; the image is always removed again.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as image_file:
        page_path = image_file.name
    try:
        # Increase resolution for better OCR accuracy (scale factor 300/72).
        pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72), alpha=False)
        pix.save(page_path)
        result = ocr.ocr(page_path)
    finally:
        if os.path.exists(page_path):
            os.remove(page_path)

    if result and result[0]:
        # Each detected line is indexed as line[1][0] to obtain its text.
        return "\n".join([line[1][0] for line in result[0]])
    logger.warning(f"No text extracted from page {page.number}.")
    return None


def process_pdf(pdf_file):
    """OCR a PDF, score it against the required values and record the result.

    Args:
        pdf_file: Either an object with a ``name`` attribute holding the file
            path (e.g. a temporary-file wrapper) or the path itself as a
            string. Falsy values are treated as "no file".

    Returns:
        A 5-tuple ``(extracted_text, category, score, comments, flags)``.
        On failure the first element is an error message and the remaining
        ones are ``"Error", 0, "Error", "Error"``. Exceptions are logged and
        never propagate to the caller.
    """
    start_time = time.time()
    temp_file_path = None
    try:
        if not pdf_file:
            return "No file uploaded", "Error", 0, "Error", "Error"

        # Accept both file-like wrappers (``.name``) and plain path strings.
        source_path = getattr(pdf_file, "name", pdf_file)

        # Work on a private copy with a ".pdf" suffix.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file_path = temp_file.name
            with open(source_path, 'rb') as source:
                temp_file.write(source.read())

        # Open PDF with PyMuPDF
        page_texts = []
        pdf_doc = fitz.open(temp_file_path)
        try:
            for page in pdf_doc:
                try:
                    page_text = _ocr_page(page)
                except Exception as e:
                    # A bad page must not abort the whole document.
                    logger.error(f"Error processing page {page.number}: {e}")
                    continue
                if page_text is not None:
                    page_texts.append(page_text + "\n")
        finally:
            # Close the document so the temporary copy can be removed below.
            pdf_doc.close()
        extracted_text = "".join(page_texts)

        if not extracted_text.strip():
            logger.error("No text extracted from the PDF.")
            return "Error: No text extracted", "Error", 0, "Error", "Error"

        vendor_name = extract_vendor_name(extracted_text)
        missing = analyze_document(extracted_text)
        category, score, comments, flags = _score_missing(len(missing))

        # Failures are logged inside insert_into_salesforce; the scoring
        # result is returned to the caller either way.
        insert_into_salesforce(vendor_name, extracted_text, category, score, comments, flags)
        duration = time.time() - start_time
        logger.info(f"Processing time: {duration:.2f} seconds")
        return extracted_text, category, score, comments, flags

    except Exception as e:
        logger.error(f"Error processing PDF: {e}")
        return f"Error: {e}", "Error", 0, "Error", "Error"
    finally:
        # Clean up temporary PDF file on every path, including errors.
        if temp_file_path and os.path.exists(temp_file_path):
            try:
                os.remove(temp_file_path)
            except OSError as e:
                logger.warning(f"Could not remove temporary file {temp_file_path}: {e}")
214
+
215
@app.post("/process_pdf/")
async def process_pdf_api(file: UploadFile = File(...)):
    """HTTP endpoint: run ``process_pdf`` on an uploaded PDF and return JSON.

    The upload is written to a temporary ``.pdf`` file, processed, and the
    temporary file is removed afterwards.

    Returns:
        A JSON object with ``extracted_text``, ``category``, ``score``,
        ``comments`` and ``flags``; or ``{"error": ...}`` with status 500 if
        an exception is raised here.
    """
    temp_path = None
    try:
        contents = await file.read()
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_path = temp_file.name
            temp_file.write(contents)
        # The file is closed (and therefore flushed) at this point, so
        # process_pdf, which re-opens it by name, sees the complete upload.
        # The closed wrapper still exposes ``.name``.
        extracted_text, category, score, comments, flags = process_pdf(temp_file)
        return JSONResponse(content={
            "extracted_text": extracted_text,
            "category": category,
            "score": score,
            "comments": comments,
            "flags": flags
        })
    except Exception as e:
        logger.error(f"Error processing the file via API: {e}")
        return JSONResponse(content={"error": str(e)}, status_code=500)
    finally:
        # delete=False means nothing else removes the upload copy.
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)
232
+
233
def gradio_interface(pdf_file):
    """Gradio callback: forward the uploaded file to ``process_pdf`` and return its result unchanged."""
    outputs = process_pdf(pdf_file)
    return outputs
235
+
236
# Gradio UI: one file input wired to gradio_interface. The five output
# components are listed in the same order as the five values that
# process_pdf returns (text, category, score, comments, flags).
gr_interface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.File(label="Upload PDF Document"),
    outputs=[
        gr.Textbox(label="Extracted Text"),
        gr.Textbox(label="Category Match"),
        gr.Number(label="Score"),
        gr.Textbox(label="Comments"),
        gr.Textbox(label="Flags")
    ],
    live=True
)
248
+
249
if __name__ == "__main__":
    # Run both front ends from one process: the Gradio UI in a background
    # thread and the FastAPI app (uvicorn) in the main thread.
    import threading
    import uvicorn

    def run_gradio():
        gr_interface.launch()

    # Daemon thread: launch() is expected to block its thread, and a
    # non-daemon thread would keep the process alive after uvicorn stops.
    threading.Thread(target=run_gradio, daemon=True).start()
    uvicorn.run(app, host="0.0.0.0", port=8000)