Spaces:

ngupta2026
/

Gen_AI_Project

Sleeping

App Files Files Community

ngupta2026 committed on Apr 29

Commit

3551d9b

verified ·

1 Parent(s): ac6dc07

Update app.py

Browse files

Files changed (1) hide show

app.py +127 -101

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 # =====================================================
-# AI INSURANCE CLAIM GENERATOR (FINAL HIGH-ACCURACY VERSION)
-# Better TOTAL extraction + Better COMPANY extraction
-# Hugging Face Space Ready
 # =====================================================
 import gradio as gr
@@ -19,7 +19,7 @@ from transformers import LayoutLMTokenizerFast, LayoutLMForTokenClassification
 # =====================================================
 RESEND_API_KEY = os.getenv("RESEND_API_KEY")
-# VERIFIED DOMAIN EMAIL
 FROM_EMAIL = "AI Claims <claims@yudham.com>"
 MODEL_NAME = "ngupta2026/sroie-layoutlm"
@@ -45,7 +45,7 @@ model.to(device)
 model.eval()
 # =====================================================
-# NORMALIZE BOX
 # =====================================================
 def normalize(box, width, height):
     return [
@@ -55,82 +55,109 @@ def normalize(box, width, height):
         int(1000 * box[3] / height),
     ]
-# =====================================================
-# AVG CONF
-# =====================================================
-def avg_conf(lst):
     if len(lst) == 0:
-        return 0
     return sum(lst) / len(lst)
 # =====================================================
-# CLEAN MONEY
 # =====================================================
-def clean_amount(txt):
-    txt = txt.replace(",", "").replace("RM", "").replace("₹", "")
     txt = txt.strip()
-    try:
-        val = float(txt)
-        return round(val, 2)
-    except:
-        return None
 # =====================================================
-# FIND BEST TOTAL (VERY IMPORTANT FIX)
 # =====================================================
-def extract_best_total(words):
     candidates = []
     for i, w in enumerate(words):
-        # match amounts like:
-        # 102.40
-        # 1,234.50
-        # RM102.40
-        if re.fullmatch(r"(RM)?\d+[.,]?\d*\.\d{2}", w):
-            amt = clean_amount(w)
-            if amt:
-                candidates.append(amt)
-        elif re.fullmatch(r"\d+\.\d{2}", w):
-            amt = clean_amount(w)
-            if amt:
-                candidates.append(amt)
-    # choose sensible max under 100000
-    candidates = [x for x in candidates if 1 <= x <= 100000]
-    if len(candidates) == 0:
-        return "Not Found"
-    return f"{max(candidates):.2f}"
-# =====================================================
-# COMPANY CLEANER
-# =====================================================
-def clean_company(txt):
-    txt = txt.strip()
-    # remove garbage symbols
-    txt = re.sub(r"[^A-Za-z0-9&().,\- ]", "", txt)
-    # remove too short
-    if len(txt) < 3:
-        return "Not Found"
-    return txt
 # =====================================================
-# OCR + EXTRACTION
 # =====================================================
 def extract_receipt(image):
     try:
         image = image.convert("RGB")
-        image.thumbnail((1400, 1400))
         data = pytesseract.image_to_data(
             image,
@@ -142,16 +169,16 @@ def extract_receipt(image):
         for i in range(len(data["text"])):
-            text = data["text"][i].strip()
-            if text != "" and len(text) > 1:
                 x = data["left"][i]
                 y = data["top"][i]
                 w = data["width"][i]
                 h = data["height"][i]
-                words.append(text)
                 boxes.append([x, y, x + w, y + h])
         if len(words) == 0:
@@ -167,10 +194,10 @@ def extract_receipt(image):
             words,
             boxes=boxes,
             return_tensors="pt",
-            padding="max_length",
             truncation=True,
-            is_split_into_words=True,
-            max_length=512
         )
         encoding = {k: v.to(device) for k, v in encoding.items()}
@@ -186,66 +213,61 @@ def extract_receipt(image):
         preds = torch.argmax(probs, dim=2)[0][:len(words)]
         confs = torch.max(probs, dim=2)[0][0][:len(words)]
-        company_words = []
-        company_conf = []
         # -------------------------------------------------
-        # ENTITY EXTRACTION
         # -------------------------------------------------
         for word, pred, conf in zip(words, preds, confs):
             label = id2label[pred.item()]
-            c = conf.item()
             if label == "COMPANY":
-                company_words.append(word)
-                company_conf.append(c)
-        # -------------------------------------------------
-        # COMPANY
-        # -------------------------------------------------
-        company = " ".join(company_words[:6]) if company_words else words[0]
-        company = clean_company(company)
-        # -------------------------------------------------
-        # DATE
-        # -------------------------------------------------
-        date = "Not Found"
-        for w in words:
-            if re.search(r"\d{2}[/-]\d{2}[/-]\d{2,4}", w):
-                date = w
-                break
         # -------------------------------------------------
-        # TOTAL (NEW LOGIC)
         # -------------------------------------------------
-        total = extract_best_total(words)
         # -------------------------------------------------
         # CONFIDENCE
         # -------------------------------------------------
-        conf = avg_conf(company_conf)
         if total != "Not Found":
-            conf += 0.10
-        conf = min(conf, 0.99)
-        result = {
             "company": company,
             "date": date,
             "total": total,
-            "confidence": round(conf, 3)
         }
-        return result
     except Exception as e:
         return {"error": str(e)}
 # =====================================================
-# DECISION ENGINE
 # =====================================================
 def decision_layer(conf):
@@ -259,7 +281,7 @@ def decision_layer(conf):
         return "REJECT"
 # =====================================================
-# EMAIL SEND
 # =====================================================
 def send_claim_email(to_email, extracted):
@@ -268,18 +290,23 @@ def send_claim_email(to_email, extracted):
     subject = "Insurance Claim Request"
-    html_body = f"""
     <h2>Insurance Claim Request</h2>
     <p><b>Provider:</b> {extracted['company']}</p>
-    <p><b>Date:</b> {extracted['date']}</p>
-    <p><b>Amount:</b> ₹{extracted['total']}</p>
-    <p>Regards,<br>AI Claims Bot</p>
     """
     try:
-        response = requests.post(
             "https://api.resend.com/emails",
             headers={
                 "Authorization": f"Bearer {RESEND_API_KEY}",
@@ -289,15 +316,15 @@ def send_claim_email(to_email, extracted):
                 "from": FROM_EMAIL,
                 "to": [to_email],
                 "subject": subject,
-                "html": html_body
             },
             timeout=20
         )
-        if response.status_code in [200, 201]:
-            return f"✅ Email sent to {to_email}"
-        return f"❌ Email failed: {response.text}"
     except Exception as e:
         return f"❌ Email error: {str(e)}"
@@ -313,7 +340,6 @@ def process_and_send(image, email_id):
         return extracted, extracted["error"]
     conf = extracted["confidence"]
     decision = decision_layer(conf)
     extracted["decision"] = decision
@@ -322,10 +348,10 @@ def process_and_send(image, email_id):
         email_status = send_claim_email(email_id, extracted)
     elif decision == "REVIEW":
-        email_status = f"⚠️ Manual review required ({conf})"
     else:
-        email_status = f"❌ Rejected ({conf})"
     return extracted, email_status
@@ -346,7 +372,7 @@ demo = gr.Interface(
     ],
     title="📄 AI Insurance Claim Generator",
-    description="Upload receipt → Better AI extraction → Confidence → Auto Email"
 )
 demo.launch()

 # =====================================================
+# AI INSURANCE CLAIM GENERATOR (PRODUCTION FINAL VERSION)
+# Accurate Company + Accurate Total + Email + Confidence
+# HuggingFace Spaces Ready
 # =====================================================
 import gradio as gr
 # =====================================================
 RESEND_API_KEY = os.getenv("RESEND_API_KEY")
+# Use your verified Resend sender email
 FROM_EMAIL = "AI Claims <claims@yudham.com>"
 MODEL_NAME = "ngupta2026/sroie-layoutlm"
 model.eval()
 # =====================================================
+# HELPERS
 # =====================================================
 def normalize(box, width, height):
     return [
         int(1000 * box[3] / height),
     ]
def avg(lst):
    """Return the arithmetic mean of ``lst``.

    Args:
        lst: A sized collection of numbers.

    Returns:
        The mean as a float, or ``0.0`` when ``lst`` is empty, so the
        division below can never raise ``ZeroDivisionError``.
    """
    if not lst:
        return 0.0
    return sum(lst) / len(lst)
 # =====================================================
+# COMPANY CLEANER
 # =====================================================
def clean_company(txt):
    """Normalise a raw company-name string.

    Characters outside letters, digits, space and ``& ( ) . , - /`` are
    dropped, runs of whitespace are collapsed to a single space, and the
    result is upper-cased.

    Args:
        txt: Raw company text; ``None`` and ``""`` are accepted.

    Returns:
        The cleaned, upper-cased name, or the string ``"Not Found"`` when
        the input is empty, shorter than two characters after cleaning, or
        contains no alphabetic character at all.
    """
    # Guard first: None has no .strip(), and an empty string can never pass.
    if not txt:
        return "Not Found"

    txt = re.sub(r"[^A-Za-z0-9&().,\- /]", "", txt.strip())
    txt = re.sub(r"\s+", " ", txt).strip()

    # A name made only of digits/punctuation is treated as noise.
    if len(txt) < 2 or not any(c.isalpha() for c in txt):
        return "Not Found"

    return txt.upper()
 # =====================================================
+# DATE EXTRACTION
 # =====================================================
# One pattern is enough: d{1,2} already covers the two-digit day/month case.
_DATE_TOKEN_RE = re.compile(r"\d{1,2}[/-]\d{1,2}[/-]\d{2,4}")


def extract_date(words):
    """Return the first token in ``words`` that looks like a numeric date.

    A token qualifies when, after stripping surrounding punctuation, it
    consists entirely of ``D/M/Y`` or ``D-M-Y`` with a 1-2 digit day and
    month and a 2-4 digit year. The whole token must match, so longer
    identifiers that merely contain such a sequence are not picked up.

    Args:
        words: Iterable of string tokens.

    Returns:
        The matching token without surrounding punctuation, or the string
        ``"Not Found"`` when no token qualifies.
    """
    for w in words:
        # Tokens may carry adjacent punctuation such as "12/03/2019," or
        # "(12/03/2019)"; strip it so the date is still recognised.
        token = w.strip(".,;:()[]")
        if _DATE_TOKEN_RE.fullmatch(token):
            return token
    return "Not Found"
+# =====================================================
+# TOTAL EXTRACTION (BEST PRACTICAL METHOD)
+# =====================================================
def clean_amount_token(txt):
    """Strip currency markers and thousands separators from a token.

    The token is upper-cased, then ``RM``, ``MYR``, ``RS``, ``₹`` and
    commas are removed wherever they occur, and surrounding whitespace is
    trimmed. No numeric validation is done here.

    Args:
        txt: A single string token.

    Returns:
        The cleaned, upper-cased token (possibly empty).
    """
    txt = txt.upper().strip()

    # "Rs.250.00" would otherwise become ".250.00" once "RS" is removed,
    # and the amount would never be recognised; drop the abbreviation's
    # own dot together with the leading prefix.
    if txt.startswith("RS."):
        txt = txt[3:]

    for marker in ("RM", "MYR", "RS", "₹", ","):
        txt = txt.replace(marker, "")

    return txt.strip()
def is_money(txt):
    """Tell whether ``txt`` is digits, a dot, then exactly two digits."""
    return bool(re.fullmatch(r"\d+\.\d{2}", txt))
def extract_total(words):
    """Pick the most plausible total amount from a list of tokens.

    Each token is cleaned with ``clean_amount_token``. Two-decimal amounts
    (per ``is_money``) between 0.50 and 100000 are preferred; only when
    none exist are bare integers between 1 and 100000 considered. The
    largest value of the chosen group is returned.

    Args:
        words: Iterable of string tokens.

    Returns:
        The selected amount formatted with two decimals (e.g. ``"102.40"``),
        or the string ``"Not Found"`` when no token qualifies.
    """
    decimal_amounts = []
    integer_amounts = []

    # Single pass: classify every cleaned token once instead of cleaning
    # the whole list a second time for the integer fallback.
    for w in words:
        x = clean_amount_token(w)
        if is_money(x):
            val = float(x)
            if 0.50 <= val <= 100000:
                decimal_amounts.append(val)
        elif re.fullmatch(r"\d+", x):
            val = float(x)
            if 1 <= val <= 100000:
                integer_amounts.append(val)

    # NOTE(review): the integer fallback accepts any in-range integer
    # token, so non-amount numbers in the text are eligible too.
    chosen = decimal_amounts or integer_amounts
    if chosen:
        return f"{max(chosen):.2f}"
    return "Not Found"
 # =====================================================
+# OCR + MODEL EXTRACTION
 # =====================================================
 def extract_receipt(image):
     try:
         image = image.convert("RGB")
+        image.thumbnail((1500, 1500))
         data = pytesseract.image_to_data(
             image,
         for i in range(len(data["text"])):
+            txt = data["text"][i].strip()
+            if txt != "" and len(txt) > 1:
                 x = data["left"][i]
                 y = data["top"][i]
                 w = data["width"][i]
                 h = data["height"][i]
+                words.append(txt)
                 boxes.append([x, y, x + w, y + h])
         if len(words) == 0:
             words,
             boxes=boxes,
             return_tensors="pt",
             truncation=True,
+            padding="max_length",
+            max_length=512,
+            is_split_into_words=True
         )
         encoding = {k: v.to(device) for k, v in encoding.items()}
         preds = torch.argmax(probs, dim=2)[0][:len(words)]
         confs = torch.max(probs, dim=2)[0][0][:len(words)]
         # -------------------------------------------------
+        # COMPANY FROM MODEL
         # -------------------------------------------------
+        company_tokens = []
+        company_scores = []
         for word, pred, conf in zip(words, preds, confs):
             label = id2label[pred.item()]
             if label == "COMPANY":
+                company_tokens.append(word)
+                company_scores.append(conf.item())
+        # fallback if model misses
+        if company_tokens:
+            company = " ".join(company_tokens[:8])
+        else:
+            company = " ".join(words[:5])
+        company = clean_company(company)
         # -------------------------------------------------
+        # DATE + TOTAL
         # -------------------------------------------------
+        date = extract_date(words)
+        total = extract_total(words)
         # -------------------------------------------------
         # CONFIDENCE
         # -------------------------------------------------
+        company_conf = avg(company_scores)
+        score = company_conf
+        if date != "Not Found":
+            score += 0.12
         if total != "Not Found":
+            score += 0.18
+        score = min(score, 0.99)
+        return {
             "company": company,
             "date": date,
             "total": total,
+            "confidence": round(score, 3)
         }
     except Exception as e:
         return {"error": str(e)}
 # =====================================================
+# DECISION LAYER
 # =====================================================
 def decision_layer(conf):
         return "REJECT"
 # =====================================================
+# EMAIL
 # =====================================================
 def send_claim_email(to_email, extracted):
     subject = "Insurance Claim Request"
+    html = f"""
     <h2>Insurance Claim Request</h2>
+    <p>Dear Team,</p>
+    <p>Please process the reimbursement claim.</p>
     <p><b>Provider:</b> {extracted['company']}</p>
+    <p><b>Bill Date:</b> {extracted['date']}</p>
+    <p><b>Claim Amount:</b> ₹{extracted['total']}</p>
+    <br>
+    <p>Regards,<br>AI Claims System</p>
     """
     try:
+        r = requests.post(
             "https://api.resend.com/emails",
             headers={
                 "Authorization": f"Bearer {RESEND_API_KEY}",
                 "from": FROM_EMAIL,
                 "to": [to_email],
                 "subject": subject,
+                "html": html
             },
             timeout=20
         )
+        if r.status_code in [200, 201]:
+            return f"✅ Email sent successfully to {to_email}"
+        return f"❌ Email failed: {r.text}"
     except Exception as e:
         return f"❌ Email error: {str(e)}"
         return extracted, extracted["error"]
     conf = extracted["confidence"]
     decision = decision_layer(conf)
     extracted["decision"] = decision
         email_status = send_claim_email(email_id, extracted)
     elif decision == "REVIEW":
+        email_status = f"⚠️ Human review required (confidence={conf})"
     else:
+        email_status = f"❌ Rejected (confidence={conf})"
     return extracted, email_status
     ],
     title="📄 AI Insurance Claim Generator",
+    description="Upload receipt → Extract fields → Confidence → Auto Email"
 )
 demo.launch()