ngupta2026 committed on
Commit
3551d9b
·
verified ·
1 Parent(s): ac6dc07

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +127 -101
app.py CHANGED
@@ -1,7 +1,7 @@
1
  # =====================================================
2
- # AI INSURANCE CLAIM GENERATOR (FINAL HIGH-ACCURACY VERSION)
3
- # Better TOTAL extraction + Better COMPANY extraction
4
- # Hugging Face Space Ready
5
  # =====================================================
6
 
7
  import gradio as gr
@@ -19,7 +19,7 @@ from transformers import LayoutLMTokenizerFast, LayoutLMForTokenClassification
19
  # =====================================================
20
  RESEND_API_KEY = os.getenv("RESEND_API_KEY")
21
 
22
- # VERIFIED DOMAIN EMAIL
23
  FROM_EMAIL = "AI Claims <claims@yudham.com>"
24
 
25
  MODEL_NAME = "ngupta2026/sroie-layoutlm"
@@ -45,7 +45,7 @@ model.to(device)
45
  model.eval()
46
 
47
  # =====================================================
48
- # NORMALIZE BOX
49
  # =====================================================
50
  def normalize(box, width, height):
51
  return [
@@ -55,82 +55,109 @@ def normalize(box, width, height):
55
  int(1000 * box[3] / height),
56
  ]
57
 
58
- # =====================================================
59
- # AVG CONF
60
- # =====================================================
61
def avg_conf(lst):
    """Return the arithmetic mean of the confidence scores in ``lst``.

    Args:
        lst: A sized collection of numeric scores.

    Returns:
        The mean as a float. An empty collection yields ``0.0`` so the
        return type is the same on every path (the earlier ``0`` was an int).
    """
    if len(lst) == 0:
        return 0.0
    return sum(lst) / len(lst)
65
 
66
  # =====================================================
67
- # CLEAN MONEY
68
  # =====================================================
69
def clean_amount(txt):
    """Parse a money token into a float rounded to two decimals.

    Thousands separators (``,``) and the currency markers ``RM`` and the
    rupee sign are removed before parsing.

    Args:
        txt: Raw token text, e.g. ``"RM1,234.50"``.

    Returns:
        The parsed amount rounded to 2 decimals, or ``None`` when the token
        is empty, not numeric, or not a finite number.
    """
    import math  # local import: the file's import header is not fully visible

    if not txt:
        return None

    # "\u20b9" is the rupee sign; the previous literal was a mis-decoded
    # byte sequence that could never match a real rupee character.
    for marker in (",", "RM", "\u20b9"):
        txt = txt.replace(marker, "")

    try:
        val = float(txt.strip())
    except ValueError:
        # Only parse failures are expected here; a bare except would also
        # swallow KeyboardInterrupt/SystemExit.
        return None

    # float() accepts "nan" and "inf", neither of which is a valid amount.
    if not math.isfinite(val):
        return None

    return round(val, 2)
 
 
 
 
 
 
 
78
 
79
  # =====================================================
80
- # FIND BEST TOTAL (VERY IMPORTANT FIX)
81
  # =====================================================
82
def extract_best_total(words):
    """Pick the most plausible receipt total from OCR tokens.

    A token is a candidate when it looks like an amount with two decimals,
    e.g. ``102.40``, ``1,234.50`` or ``RM102.40``. Candidates are parsed with
    ``clean_amount`` and kept only when they fall in ``1..100000``.

    Args:
        words: Iterable of OCR token strings.

    Returns:
        The largest candidate formatted with two decimals, or
        ``"Not Found"`` when no token qualifies.
    """
    # One pattern is enough: the former second branch (r"\d+\.\d{2}") is a
    # subset of this one and could never be reached.
    amount_pattern = re.compile(r"(RM)?\d+[.,]?\d*\.\d{2}")

    candidates = []
    for token in words:
        if not amount_pattern.fullmatch(token):
            continue
        amount = clean_amount(token)
        # Choose only sensible values; this also drops unparsable tokens.
        if amount is not None and 1 <= amount <= 100000:
            candidates.append(amount)

    if not candidates:
        return "Not Found"

    return f"{max(candidates):.2f}"
 
109
 
110
- # =====================================================
111
- # COMPANY CLEANER
112
- # =====================================================
113
def clean_company(txt):
    """Sanitise a company name assembled from OCR tokens.

    Args:
        txt: Raw company text.

    Returns:
        The text with unsupported symbols removed and whitespace collapsed,
        or ``"Not Found"`` when fewer than 3 characters remain.
    """
    if not txt:
        return "Not Found"

    # Turn tabs/newlines into single spaces first; otherwise the symbol
    # filter below deletes them and glues neighbouring words together.
    txt = " ".join(txt.split())

    # Remove garbage symbols.
    txt = re.sub(r"[^A-Za-z0-9&().,\- ]", "", txt)

    # Removing symbols can leave padding or doubled spaces behind; collapse
    # them so they do not count toward the length check.
    txt = " ".join(txt.split())

    # Reject results that are too short to be a name.
    if len(txt) < 3:
        return "Not Found"

    return txt
125
 
126
  # =====================================================
127
- # OCR + EXTRACTION
128
  # =====================================================
129
  def extract_receipt(image):
130
 
131
  try:
132
  image = image.convert("RGB")
133
- image.thumbnail((1400, 1400))
134
 
135
  data = pytesseract.image_to_data(
136
  image,
@@ -142,16 +169,16 @@ def extract_receipt(image):
142
 
143
  for i in range(len(data["text"])):
144
 
145
- text = data["text"][i].strip()
146
 
147
- if text != "" and len(text) > 1:
148
 
149
  x = data["left"][i]
150
  y = data["top"][i]
151
  w = data["width"][i]
152
  h = data["height"][i]
153
 
154
- words.append(text)
155
  boxes.append([x, y, x + w, y + h])
156
 
157
  if len(words) == 0:
@@ -167,10 +194,10 @@ def extract_receipt(image):
167
  words,
168
  boxes=boxes,
169
  return_tensors="pt",
170
- padding="max_length",
171
  truncation=True,
172
- is_split_into_words=True,
173
- max_length=512
 
174
  )
175
 
176
  encoding = {k: v.to(device) for k, v in encoding.items()}
@@ -186,66 +213,61 @@ def extract_receipt(image):
186
  preds = torch.argmax(probs, dim=2)[0][:len(words)]
187
  confs = torch.max(probs, dim=2)[0][0][:len(words)]
188
 
189
- company_words = []
190
- company_conf = []
191
-
192
  # -------------------------------------------------
193
- # ENTITY EXTRACTION
194
  # -------------------------------------------------
 
 
 
195
  for word, pred, conf in zip(words, preds, confs):
196
 
197
  label = id2label[pred.item()]
198
- c = conf.item()
199
 
200
  if label == "COMPANY":
201
- company_words.append(word)
202
- company_conf.append(c)
203
-
204
- # -------------------------------------------------
205
- # COMPANY
206
- # -------------------------------------------------
207
- company = " ".join(company_words[:6]) if company_words else words[0]
208
- company = clean_company(company)
209
 
210
- # -------------------------------------------------
211
- # DATE
212
- # -------------------------------------------------
213
- date = "Not Found"
 
214
 
215
- for w in words:
216
- if re.search(r"\d{2}[/-]\d{2}[/-]\d{2,4}", w):
217
- date = w
218
- break
219
 
220
  # -------------------------------------------------
221
- # TOTAL (NEW LOGIC)
222
  # -------------------------------------------------
223
- total = extract_best_total(words)
 
224
 
225
  # -------------------------------------------------
226
  # CONFIDENCE
227
  # -------------------------------------------------
228
- conf = avg_conf(company_conf)
 
 
 
 
 
229
 
230
  if total != "Not Found":
231
- conf += 0.10
232
 
233
- conf = min(conf, 0.99)
234
 
235
- result = {
236
  "company": company,
237
  "date": date,
238
  "total": total,
239
- "confidence": round(conf, 3)
240
  }
241
 
242
- return result
243
-
244
  except Exception as e:
245
  return {"error": str(e)}
246
 
247
  # =====================================================
248
- # DECISION ENGINE
249
  # =====================================================
250
  def decision_layer(conf):
251
 
@@ -259,7 +281,7 @@ def decision_layer(conf):
259
  return "REJECT"
260
 
261
  # =====================================================
262
- # EMAIL SEND
263
  # =====================================================
264
  def send_claim_email(to_email, extracted):
265
 
@@ -268,18 +290,23 @@ def send_claim_email(to_email, extracted):
268
 
269
  subject = "Insurance Claim Request"
270
 
271
- html_body = f"""
272
  <h2>Insurance Claim Request</h2>
273
 
 
 
 
 
274
  <p><b>Provider:</b> {extracted['company']}</p>
275
- <p><b>Date:</b> {extracted['date']}</p>
276
- <p><b>Amount:</b> β‚Ή{extracted['total']}</p>
277
 
278
- <p>Regards,<br>AI Claims Bot</p>
 
279
  """
280
 
281
  try:
282
- response = requests.post(
283
  "https://api.resend.com/emails",
284
  headers={
285
  "Authorization": f"Bearer {RESEND_API_KEY}",
@@ -289,15 +316,15 @@ def send_claim_email(to_email, extracted):
289
  "from": FROM_EMAIL,
290
  "to": [to_email],
291
  "subject": subject,
292
- "html": html_body
293
  },
294
  timeout=20
295
  )
296
 
297
- if response.status_code in [200, 201]:
298
- return f"βœ… Email sent to {to_email}"
299
 
300
- return f"❌ Email failed: {response.text}"
301
 
302
  except Exception as e:
303
  return f"❌ Email error: {str(e)}"
@@ -313,7 +340,6 @@ def process_and_send(image, email_id):
313
  return extracted, extracted["error"]
314
 
315
  conf = extracted["confidence"]
316
-
317
  decision = decision_layer(conf)
318
 
319
  extracted["decision"] = decision
@@ -322,10 +348,10 @@ def process_and_send(image, email_id):
322
  email_status = send_claim_email(email_id, extracted)
323
 
324
  elif decision == "REVIEW":
325
- email_status = f"⚠️ Manual review required ({conf})"
326
 
327
  else:
328
- email_status = f"❌ Rejected ({conf})"
329
 
330
  return extracted, email_status
331
 
@@ -346,7 +372,7 @@ demo = gr.Interface(
346
  ],
347
 
348
  title="πŸ“„ AI Insurance Claim Generator",
349
- description="Upload receipt β†’ Better AI extraction β†’ Confidence β†’ Auto Email"
350
  )
351
 
352
  demo.launch()
 
1
  # =====================================================
2
+ # AI INSURANCE CLAIM GENERATOR (PRODUCTION FINAL VERSION)
3
+ # Accurate Company + Accurate Total + Email + Confidence
4
+ # HuggingFace Spaces Ready
5
  # =====================================================
6
 
7
  import gradio as gr
 
19
  # =====================================================
20
  RESEND_API_KEY = os.getenv("RESEND_API_KEY")
21
 
22
+ # Use your verified Resend sender email
23
  FROM_EMAIL = "AI Claims <claims@yudham.com>"
24
 
25
  MODEL_NAME = "ngupta2026/sroie-layoutlm"
 
45
  model.eval()
46
 
47
  # =====================================================
48
+ # HELPERS
49
  # =====================================================
50
  def normalize(box, width, height):
51
  return [
 
55
  int(1000 * box[3] / height),
56
  ]
57
 
58
def avg(lst):
    """Return the mean of ``lst``, or ``0.0`` when ``lst`` is empty."""
    count = len(lst)
    return sum(lst) / count if count else 0.0
62
 
63
  # =====================================================
64
+ # COMPANY CLEANER
65
  # =====================================================
66
def clean_company(txt):
    """Sanitise and upper-case a company name assembled from OCR tokens.

    Args:
        txt: Raw company text.

    Returns:
        The cleaned name in upper case, or ``"Not Found"`` when the input is
        empty, shorter than 2 characters after cleaning, or has no letters.
    """
    if not txt:
        return "Not Found"

    # Normalise whitespace BEFORE filtering: the filter only allows a plain
    # space, so a tab or newline would otherwise be deleted and the words on
    # either side would merge.
    txt = re.sub(r"\s+", " ", txt)
    txt = re.sub(r"[^A-Za-z0-9&().,\- /]", "", txt)
    # Dropping symbols can leave doubled or edge spaces behind.
    txt = re.sub(r" +", " ", txt).strip()

    if len(txt) < 2:
        return "Not Found"

    # A name made only of digits/punctuation is not a company name.
    if not any(c.isalpha() for c in txt):
        return "Not Found"

    return txt.upper()
82
 
83
  # =====================================================
84
+ # DATE EXTRACTION
85
  # =====================================================
86
def extract_date(words):
    """Return the first OCR token that is a numeric date.

    Accepted shape: 1-2 digits, ``/`` or ``-``, 1-2 digits, ``/`` or ``-``,
    then 2-4 digits (e.g. ``12/05/2023``, ``1-2-23``).

    Args:
        words: Iterable of OCR token strings (``None`` is treated as empty).

    Returns:
        The matching date text, or ``"Not Found"``.
    """
    # A single pattern suffices: the stricter two-digit form is already
    # covered by the {1,2} form.
    date_pattern = re.compile(r"\d{1,2}[/-]\d{1,2}[/-]\d{2,4}")

    for token in words or ():
        # OCR often attaches punctuation to a date ("12/05/2023,"); trim it
        # at the edges only, so longer identifiers that merely embed a date
        # are still rejected by fullmatch.
        candidate = token.strip(".,;:()[]")
        if date_pattern.fullmatch(candidate):
            return candidate

    return "Not Found"
99
+
100
+ # =====================================================
101
+ # TOTAL EXTRACTION (BEST PRACTICAL METHOD)
102
+ # =====================================================
103
def clean_amount_token(txt):
    """Strip currency markers and thousands separators from a token.

    The token is upper-cased, then every occurrence of ``RM``, ``MYR``,
    ``RS``, the rupee sign and ``,`` is removed, and the result is trimmed.

    Args:
        txt: Raw OCR token (``None`` or empty yields ``""``).

    Returns:
        The cleaned, upper-cased token text.
    """
    if not txt:
        return ""

    txt = txt.upper()

    # "\u20b9" is the rupee sign; the previous literal was a mis-decoded
    # byte sequence that could never match a real rupee character.
    # Order is kept from the original sequential replacements.
    for marker in ("RM", "MYR", "RS", "\u20b9", ","):
        txt = txt.replace(marker, "")

    return txt.strip()
114
+
115
def is_money(txt):
    """Tell whether ``txt`` is digits, a dot, then exactly two digits."""
    matched = re.fullmatch(r"\d+\.\d{2}", txt)
    return bool(matched)
117
+
118
def extract_total(words):
    """Pick the most plausible receipt total from OCR tokens.

    Each token is cleaned with ``clean_amount_token``. Two-decimal amounts
    in ``0.50..100000`` are preferred; whole numbers in ``1..100000`` are
    used only when no decimal amount was found.

    Args:
        words: Iterable of OCR token strings.

    Returns:
        The largest qualifying value formatted with two decimals, or
        ``"Not Found"``.
    """
    decimal_amounts = []
    whole_amounts = []

    # Single pass: each token is cleaned once. A cleaned token cannot be
    # both a two-decimal amount and an all-digit string, so one if/elif
    # gives the same buckets as two separate passes.
    for token in words:
        cleaned = clean_amount_token(token)

        if is_money(cleaned):
            value = float(cleaned)
            if 0.50 <= value <= 100000:
                decimal_amounts.append(value)
        elif re.fullmatch(r"\d+", cleaned):
            value = float(cleaned)
            if 1 <= value <= 100000:
                whole_amounts.append(value)

    # Choose the realistic largest decimal value, then fall back to integers.
    for amounts in (decimal_amounts, whole_amounts):
        if amounts:
            return f"{max(amounts):.2f}"

    return "Not Found"
152
 
153
  # =====================================================
154
+ # OCR + MODEL EXTRACTION
155
  # =====================================================
156
  def extract_receipt(image):
157
 
158
  try:
159
  image = image.convert("RGB")
160
+ image.thumbnail((1500, 1500))
161
 
162
  data = pytesseract.image_to_data(
163
  image,
 
169
 
170
  for i in range(len(data["text"])):
171
 
172
+ txt = data["text"][i].strip()
173
 
174
+ if txt != "" and len(txt) > 1:
175
 
176
  x = data["left"][i]
177
  y = data["top"][i]
178
  w = data["width"][i]
179
  h = data["height"][i]
180
 
181
+ words.append(txt)
182
  boxes.append([x, y, x + w, y + h])
183
 
184
  if len(words) == 0:
 
194
  words,
195
  boxes=boxes,
196
  return_tensors="pt",
 
197
  truncation=True,
198
+ padding="max_length",
199
+ max_length=512,
200
+ is_split_into_words=True
201
  )
202
 
203
  encoding = {k: v.to(device) for k, v in encoding.items()}
 
213
  preds = torch.argmax(probs, dim=2)[0][:len(words)]
214
  confs = torch.max(probs, dim=2)[0][0][:len(words)]
215
 
 
 
 
216
  # -------------------------------------------------
217
+ # COMPANY FROM MODEL
218
  # -------------------------------------------------
219
+ company_tokens = []
220
+ company_scores = []
221
+
222
  for word, pred, conf in zip(words, preds, confs):
223
 
224
  label = id2label[pred.item()]
 
225
 
226
  if label == "COMPANY":
227
+ company_tokens.append(word)
228
+ company_scores.append(conf.item())
 
 
 
 
 
 
229
 
230
+ # fallback if model misses
231
+ if company_tokens:
232
+ company = " ".join(company_tokens[:8])
233
+ else:
234
+ company = " ".join(words[:5])
235
 
236
+ company = clean_company(company)
 
 
 
237
 
238
  # -------------------------------------------------
239
+ # DATE + TOTAL
240
  # -------------------------------------------------
241
+ date = extract_date(words)
242
+ total = extract_total(words)
243
 
244
  # -------------------------------------------------
245
  # CONFIDENCE
246
  # -------------------------------------------------
247
+ company_conf = avg(company_scores)
248
+
249
+ score = company_conf
250
+
251
+ if date != "Not Found":
252
+ score += 0.12
253
 
254
  if total != "Not Found":
255
+ score += 0.18
256
 
257
+ score = min(score, 0.99)
258
 
259
+ return {
260
  "company": company,
261
  "date": date,
262
  "total": total,
263
+ "confidence": round(score, 3)
264
  }
265
 
 
 
266
  except Exception as e:
267
  return {"error": str(e)}
268
 
269
  # =====================================================
270
+ # DECISION LAYER
271
  # =====================================================
272
  def decision_layer(conf):
273
 
 
281
  return "REJECT"
282
 
283
  # =====================================================
284
+ # EMAIL
285
  # =====================================================
286
  def send_claim_email(to_email, extracted):
287
 
 
290
 
291
  subject = "Insurance Claim Request"
292
 
293
+ html = f"""
294
  <h2>Insurance Claim Request</h2>
295
 
296
+ <p>Dear Team,</p>
297
+
298
+ <p>Please process the reimbursement claim.</p>
299
+
300
  <p><b>Provider:</b> {extracted['company']}</p>
301
+ <p><b>Bill Date:</b> {extracted['date']}</p>
302
+ <p><b>Claim Amount:</b> β‚Ή{extracted['total']}</p>
303
 
304
+ <br>
305
+ <p>Regards,<br>AI Claims System</p>
306
  """
307
 
308
  try:
309
+ r = requests.post(
310
  "https://api.resend.com/emails",
311
  headers={
312
  "Authorization": f"Bearer {RESEND_API_KEY}",
 
316
  "from": FROM_EMAIL,
317
  "to": [to_email],
318
  "subject": subject,
319
+ "html": html
320
  },
321
  timeout=20
322
  )
323
 
324
+ if r.status_code in [200, 201]:
325
+ return f"βœ… Email sent successfully to {to_email}"
326
 
327
+ return f"❌ Email failed: {r.text}"
328
 
329
  except Exception as e:
330
  return f"❌ Email error: {str(e)}"
 
340
  return extracted, extracted["error"]
341
 
342
  conf = extracted["confidence"]
 
343
  decision = decision_layer(conf)
344
 
345
  extracted["decision"] = decision
 
348
  email_status = send_claim_email(email_id, extracted)
349
 
350
  elif decision == "REVIEW":
351
+ email_status = f"⚠️ Human review required (confidence={conf})"
352
 
353
  else:
354
+ email_status = f"❌ Rejected (confidence={conf})"
355
 
356
  return extracted, email_status
357
 
 
372
  ],
373
 
374
  title="πŸ“„ AI Insurance Claim Generator",
375
+ description="Upload receipt β†’ Extract fields β†’ Confidence β†’ Auto Email"
376
  )
377
 
378
  demo.launch()