# =====================================================
# AI INSURANCE CLAIM GENERATOR
# FINAL VERSION
# Accurate Extraction + Professional Claim PDF + Email
# Hugging Face Space Ready
# =====================================================
"""Gradio app: OCR a receipt, extract claim fields, build a claim PDF, email it.

Pipeline: pytesseract OCR -> LayoutLM token classification (company name)
plus regex heuristics (date, total) -> confidence-based decision ->
ReportLab PDF -> Resend email API.
"""

import base64
import html
import io
import os
import re
from datetime import datetime

import gradio as gr
import pytesseract
import requests
import torch
from PIL import Image
from transformers import LayoutLMForTokenClassification
from transformers import LayoutLMTokenizerFast

# PDF LIBRARY
from reportlab.lib.colors import HexColor
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas

# =====================================================
# CONFIG
# =====================================================

RESEND_API_KEY = os.getenv("RESEND_API_KEY")

# NOTE(review): the default sender below looks truncated -- Resend documents
# the "Name <address@domain>" form for `from`. Set the FROM_EMAIL environment
# variable to a verified sender; the historical literal is kept as fallback.
FROM_EMAIL = os.getenv("FROM_EMAIL") or "AI Claims "

MODEL_NAME = "ngupta2026/sroie-layoutlm"

label2id = {
    "O": 0,
    "COMPANY": 1,
    "DATE": 2,
    "TOTAL": 3,
}
id2label = {v: k for k, v in label2id.items()}

# Sentinel returned by every extractor when a field cannot be located.
NOT_FOUND = "Not Found"

# A whole OCR token that looks like d/m/y or d-m-y.
_DATE_RE = re.compile(r"\d{1,2}[/-]\d{1,2}[/-]\d{2,4}")
# A whole OCR token that looks like a price with exactly two decimals.
_AMOUNT_RE = re.compile(r"\d+\.\d{2}")

# =====================================================
# LOAD MODEL
# =====================================================

device = "cuda" if torch.cuda.is_available() else "cpu"

model = LayoutLMForTokenClassification.from_pretrained(MODEL_NAME)
tokenizer = LayoutLMTokenizerFast.from_pretrained(MODEL_NAME)

model.to(device)
model.eval()

# =====================================================
# HELPERS
# =====================================================


def normalize(box, width, height):
    """Scale a pixel box [x0, y0, x1, y1] to the 0-1000 grid used for layout input."""
    return [
        int(1000 * box[0] / width),
        int(1000 * box[1] / height),
        int(1000 * box[2] / width),
        int(1000 * box[3] / height),
    ]


def avg(lst):
    """Return the arithmetic mean of ``lst``, or 0 when it is empty."""
    return sum(lst) / len(lst) if lst else 0


# =====================================================
# CLEAN COMPANY
# =====================================================


def clean_company(txt):
    """Strip OCR noise from a company name and upper-case it.

    Returns "Not Found" when fewer than two characters survive cleaning.
    """
    txt = txt.strip()
    # Keep only letters, digits and a small set of business-name punctuation.
    txt = re.sub(r"[^A-Za-z0-9&().,\- /]", "", txt)
    txt = re.sub(r"\s+", " ", txt).strip()

    if len(txt) < 2:
        return NOT_FOUND

    return txt.upper()


# =====================================================
# DATE EXTRACTION
# =====================================================


def extract_date(words):
    """Return the first OCR word that is entirely a d/m/y-style date, else "Not Found"."""
    for w in words:
        if _DATE_RE.fullmatch(w):
            return w
    return NOT_FOUND


# =====================================================
# TOTAL EXTRACTION
# =====================================================


def clean_amount_token(txt):
    """Upper-case ``txt`` and remove currency markers and thousands separators."""
    txt = txt.upper()
    for marker in ("RM", "MYR", "RS", "₹", ","):
        txt = txt.replace(marker, "")
    return txt.strip()


def extract_total(words):
    """Return the largest plausible amount among ``words`` formatted to 2 decimals.

    Only tokens that are exactly ``digits.dd`` after cleaning and fall within
    0.5..100000 are considered. Returns "Not Found" when none qualify.
    """
    vals = []
    for w in words:
        x = clean_amount_token(w)
        # The regex guarantees float() succeeds, so no exception guard is needed.
        if _AMOUNT_RE.fullmatch(x):
            v = float(x)
            if 0.5 <= v <= 100000:
                vals.append(v)

    if vals:
        return f"{max(vals):.2f}"
    return NOT_FOUND


# =====================================================
# PROFESSIONAL CLAIM PDF
# =====================================================


def create_pdf_base64(extracted):
    """Render the claim summary as a one-page A4 PDF and return it base64-encoded.

    ``extracted`` must provide the keys "company", "date" and "total".
    """
    now = datetime.now()

    with io.BytesIO() as buffer:
        p = canvas.Canvas(buffer, pagesize=A4)
        width, height = A4

        # Colors
        blue = HexColor("#0B5ED7")
        dark = HexColor("#222222")
        gray = HexColor("#666666")
        light = HexColor("#F5F7FA")

        # Header Bar
        p.setFillColor(blue)
        p.rect(0, height - 80, width, 80, fill=1, stroke=0)

        p.setFillColorRGB(1, 1, 1)
        p.setFont("Helvetica-Bold", 22)
        p.drawString(40, height - 50, "INSURANCE CLAIM FORM")

        p.setFont("Helvetica", 10)
        p.drawRightString(width - 40, height - 52, "AI Generated Report")

        # Body Box
        p.setFillColor(light)
        p.roundRect(35, height - 430, width - 70, 290, 10, fill=1, stroke=0)

        y = height - 130
        p.setFillColor(dark)
        p.setFont("Helvetica-Bold", 14)
        p.drawString(50, y, "Claim Summary")
        y -= 35

        rows = [
            ("Claim ID", f"CLM-{now.strftime('%Y%m%d%H%M%S')}"),
            ("Provider Name", extracted["company"]),
            ("Bill Date", extracted["date"]),
            # The built-in Helvetica font has no glyph for the rupee sign
            # (U+20B9), so it would print as a missing-glyph box; spell the
            # currency code out instead.
            ("Claim Amount", f"INR {extracted['total']}"),
            ("Status", "Submitted"),
            ("Generated On", now.strftime("%d-%m-%Y %H:%M")),
        ]

        for label, value in rows:
            p.setFillColor(gray)
            p.setFont("Helvetica-Bold", 11)
            p.drawString(55, y, label)

            p.setFillColor(dark)
            p.setFont("Helvetica", 11)
            p.drawString(220, y, str(value))
            y -= 32

        # Footer Notes
        p.setFillColor(gray)
        p.setFont("Helvetica", 9)
        p.drawString(
            40,
            60,
            "This document was generated automatically by AI Insurance Claim Generator.",
        )
        p.drawRightString(width - 40, 60, "Confidential")

        p.showPage()
        p.save()

        return base64.b64encode(buffer.getvalue()).decode()


# =====================================================
# MAIN EXTRACTION
# =====================================================


def _ocr_words(image):
    """Run Tesseract on ``image`` and return (words, pixel_boxes).

    Tokens of one character or less are dropped as OCR noise.
    """
    data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

    words = []
    boxes = []
    for i, raw in enumerate(data["text"]):
        txt = raw.strip()
        if len(txt) > 1:
            x = data["left"][i]
            y = data["top"][i]
            w = data["width"][i]
            h = data["height"][i]
            words.append(txt)
            boxes.append([x, y, x + w, y + h])

    return words, boxes


def _predict_company(words, boxes):
    """Return (company_words, confidences) for words the model labels COMPANY.

    ``boxes`` must already be normalized to the 0-1000 grid.
    """
    # NOTE(review): confirm this tokenizer class consumes `boxes` and emits a
    # `bbox` tensor; if it does not, the model runs without layout information.
    batch = tokenizer(
        words,
        boxes=boxes,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=512,
        is_split_into_words=True,
    )

    # The model predicts per sub-token (position 0 is a special token and a
    # word may span several pieces), so predictions must be mapped back to
    # words rather than sliced by len(words).
    word_ids = batch.word_ids(batch_index=0)

    inputs = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**inputs)

    probs = torch.softmax(outputs.logits, dim=2)[0]
    confs, preds = probs.max(dim=1)

    company_tokens = []
    company_scores = []
    seen_words = set()
    for tok_idx, word_idx in enumerate(word_ids):
        # Skip special/padding tokens and continuation pieces: the first
        # sub-token of each word carries that word's prediction.
        if word_idx is None or word_idx in seen_words:
            continue
        seen_words.add(word_idx)

        if id2label[preds[tok_idx].item()] == "COMPANY":
            company_tokens.append(words[word_idx])
            company_scores.append(confs[tok_idx].item())

    return company_tokens, company_scores


def extract_receipt(image):
    """Extract company, date, total and a confidence score from a receipt image.

    Returns a dict with keys "company", "date", "total" and "confidence", or
    ``{"error": message}`` if anything fails (this is the UI boundary, so all
    exceptions are converted to an error payload).
    """
    try:
        image = image.convert("RGB")
        # Bound the image size in place (convert() returned a fresh copy).
        image.thumbnail((1500, 1500))

        words, boxes = _ocr_words(image)
        if not words:
            return {"error": "No text detected"}

        width, height = image.size
        boxes = [normalize(b, width, height) for b in boxes]

        company_tokens, company_scores = _predict_company(words, boxes)

        if company_tokens:
            company = " ".join(company_tokens[:8])
        else:
            # Fallback: receipts usually lead with the merchant name.
            company = " ".join(words[:5])
        company = clean_company(company)

        date = extract_date(words)
        total = extract_total(words)

        # Model confidence for the company, boosted when the heuristics
        # also located a date and a total; capped below 1.0.
        score = avg(company_scores)
        if date != NOT_FOUND:
            score += 0.12
        if total != NOT_FOUND:
            score += 0.18
        score = min(score, 0.99)

        return {
            "company": company,
            "date": date,
            "total": total,
            "confidence": round(score, 3),
        }

    except Exception as e:
        return {"error": str(e)}


# =====================================================
# DECISION
# =====================================================


def decision_layer(conf):
    """Map a confidence score to "AUTO_SEND" (>=0.80), "REVIEW" (>=0.60) or "REJECT"."""
    if conf >= 0.80:
        return "AUTO_SEND"
    if conf >= 0.60:
        return "REVIEW"
    return "REJECT"


# =====================================================
# EMAIL SEND
# =====================================================


def send_claim_email(to_email, extracted):
    """Email the claim summary with the PDF attached via the Resend API.

    Returns a human-readable status string; never raises for HTTP or network
    failures.
    """
    if not RESEND_API_KEY:
        return "❌ Missing RESEND_API_KEY"

    recipient = (to_email or "").strip()
    if not recipient:
        return "❌ Missing destination email"

    pdf_b64 = create_pdf_base64(extracted)

    subject = "Insurance Claim Request"

    # The body is sent as HTML, where bare newlines collapse into one line,
    # so each field gets its own element. Values come from OCR and are
    # escaped before interpolation.
    company = html.escape(str(extracted["company"]))
    date = html.escape(str(extracted["date"]))
    total = html.escape(str(extracted["total"]))
    html_body = (
        "<h2>Insurance Claim Request</h2>"
        f"<p><b>Provider:</b> {company}</p>"
        f"<p><b>Date:</b> {date}</p>"
        f"<p><b>Amount:</b> ₹{total}</p>"
        "<p>Attached: Professional Claim PDF</p>"
    )

    try:
        r = requests.post(
            "https://api.resend.com/emails",
            headers={
                "Authorization": f"Bearer {RESEND_API_KEY}",
                "Content-Type": "application/json",
            },
            json={
                "from": FROM_EMAIL,
                "to": [recipient],
                "subject": subject,
                "html": html_body,
                "attachments": [
                    {
                        "filename": "claim_report.pdf",
                        "content": pdf_b64,
                    }
                ],
            },
            timeout=20,
        )
    except requests.RequestException as e:
        return f"❌ Email error: {str(e)}"

    if r.status_code in (200, 201):
        return f"✅ Email + PDF sent to {recipient}"

    return f"❌ Email failed: {r.text}"


# =====================================================
# MAIN PIPELINE
# =====================================================


def process_and_send(image, email_id):
    """Gradio handler: extract fields, decide, and email the claim when confident.

    Returns ``(extraction_dict, status_message)``.
    """
    if image is None:
        message = "No image provided"
        return {"error": message}, message

    extracted = extract_receipt(image)

    if "error" in extracted:
        return extracted, extracted["error"]

    conf = extracted["confidence"]
    decision = decision_layer(conf)
    extracted["decision"] = decision

    if decision == "AUTO_SEND":
        status = send_claim_email(email_id, extracted)
    elif decision == "REVIEW":
        status = f"⚠️ Human review required ({conf})"
    else:
        status = f"❌ Rejected ({conf})"

    return extracted, status


# =====================================================
# UI
# =====================================================

demo = gr.Interface(
    fn=process_and_send,
    inputs=[
        gr.Image(type="pil", label="Upload Receipt"),
        gr.Textbox(label="Destination Email"),
    ],
    outputs=[
        gr.JSON(label="AI Extraction"),
        gr.Textbox(label="Email Status"),
    ],
    title="📄 AI Insurance Claim Generator",
    description="Upload receipt → Extract fields → Generate Claim PDF → Auto Email",
)

if __name__ == "__main__":
    demo.launch()