Spaces:

ngupta2026
/

Gen_AI_Project

Sleeping

App Files Files Community

Gen_AI_Project / app.py

ngupta2026

Update app.py

d14dfaa verified about 1 month ago

raw

history blame contribute delete

10.5 kB

	# =====================================================
	# AI INSURANCE CLAIM GENERATOR
	# FINAL VERSION
	# Accurate Extraction + Professional Claim PDF + Email
	# Hugging Face Space Ready
	# =====================================================

	import gradio as gr
	import pytesseract
	from PIL import Image
	import torch
	import re
	import requests
	import os
	import io
	import base64
	from datetime import datetime

	from transformers import LayoutLMTokenizerFast, LayoutLMForTokenClassification

	# PDF LIBRARY
	from reportlab.lib.pagesizes import A4
	from reportlab.pdfgen import canvas
	from reportlab.lib.colors import HexColor

	# =====================================================
	# CONFIG
	# =====================================================
	# API key for the Resend e-mail service; None when the variable is unset.
	RESEND_API_KEY = os.environ.get("RESEND_API_KEY")

	# Sender shown on outgoing mail, and the model checkpoint identifier.
	FROM_EMAIL = "AI Claims <claims@yudham.com>"
	MODEL_NAME = "ngupta2026/sroie-layoutlm"

	# Tag name -> class index, in index order.
	label2id = {
	    tag: index
	    for index, tag in enumerate(("O", "COMPANY", "DATE", "TOTAL"))
	}

	# Inverse lookup: class index -> tag name.
	id2label = dict(zip(label2id.values(), label2id.keys()))

	# =====================================================
	# LOAD MODEL
	# =====================================================
	# Use the GPU when torch can see one, otherwise fall back to the CPU.
	device = "cpu"
	if torch.cuda.is_available():
	    device = "cuda"

	tokenizer = LayoutLMTokenizerFast.from_pretrained(MODEL_NAME)

	# Module.to() and Module.eval() both return the module itself, so the
	# load / move-to-device / inference-mode steps can be chained.
	model = LayoutLMForTokenClassification.from_pretrained(MODEL_NAME).to(device).eval()

	# =====================================================
	# HELPERS
	# =====================================================
	def normalize(box, width, height):
	    """Scale a pixel-space box onto the 0-1000 integer grid.

	    Args:
	        box: Sequence whose first four items are ``x0, y0, x1, y1`` in pixels.
	        width: Image width in pixels (must be non-zero).
	        height: Image height in pixels (must be non-zero).

	    Returns:
	        ``[x0, y0, x1, y1]`` as ints: x values scaled by ``1000 / width``,
	        y values by ``1000 / height``, each clamped to ``[0, 1000]``.

	    Raises:
	        ZeroDivisionError: If ``width`` or ``height`` is 0.
	    """

	    def _scale(value, extent):
	        # Clamp so a box that pokes outside the image can never produce a
	        # coordinate outside the 0-1000 range the layout model is fed.
	        return max(0, min(1000, int(1000 * value / extent)))

	    return [
	        _scale(box[0], width),
	        _scale(box[1], height),
	        _scale(box[2], width),
	        _scale(box[3], height),
	    ]

	def avg(lst):
	    """Return the arithmetic mean of ``lst``, or 0 when it is empty."""
	    count = len(lst)
	    if count == 0:
	        return 0
	    return sum(lst) / count

	# =====================================================
	# CLEAN COMPANY
	# =====================================================
	def clean_company(txt):
	    """Sanitise a company-name candidate.

	    Drops every character outside letters, digits, ``& ( ) . , - /`` and
	    space, collapses whitespace runs to a single space, and upper-cases
	    the result.

	    Returns:
	        The cleaned, upper-cased name, or ``"Not Found"`` when fewer than
	        two characters survive the clean-up.
	    """
	    allowed_only = re.sub(r"[^A-Za-z0-9&().,\- /]", "", txt.strip())
	    collapsed = re.sub(r"\s+", " ", allowed_only).strip()

	    return collapsed.upper() if len(collapsed) >= 2 else "Not Found"

	# =====================================================
	# DATE EXTRACTION
	# =====================================================
	def extract_date(words):
	    """Return the first token that is entirely a numeric date.

	    A date is 1-2 digits, ``/`` or ``-``, 1-2 digits, ``/`` or ``-``, then
	    2-4 digits. Tokens with any extra character do not match.

	    Returns:
	        The matching token unchanged, or ``"Not Found"`` if none matches.
	    """
	    date_like = (
	        token
	        for token in words
	        if re.fullmatch(r"\d{1,2}[/-]\d{1,2}[/-]\d{2,4}", token)
	    )
	    return next(date_like, "Not Found")

	# =====================================================
	# TOTAL EXTRACTION
	# =====================================================
	def clean_amount_token(txt):
	    """Strip currency markers and thousands separators from a token.

	    The token is upper-cased, then ``RM``, ``MYR``, ``RS``, the rupee sign
	    and commas are removed (in that order), and the result is trimmed.
	    """
	    cleaned = txt.upper()

	    # Removal order is kept as-is: each marker is stripped from the
	    # result of the previous removal.
	    for marker in ("RM", "MYR", "RS", "₹", ","):
	        cleaned = cleaned.replace(marker, "")

	    return cleaned.strip()

	def extract_total(words):
	    """Pick the bill total from a list of OCR tokens.

	    Each token is passed through ``clean_amount_token``; tokens that then
	    look like ``<digits>.<two digits>`` and fall within 0.5-100000 are
	    treated as amounts, and the largest one is taken as the total.

	    Args:
	        words: Iterable of token strings.

	    Returns:
	        The largest amount formatted with two decimals, or ``"Not Found"``
	        when no token qualifies.
	    """
	    amounts = []

	    for word in words:
	        token = clean_amount_token(word)

	        if not re.fullmatch(r"\d+\.\d{2}", token):
	            continue

	        # Only a failed numeric conversion is tolerated here; anything
	        # else is a real bug and must not be silently swallowed.
	        try:
	            value = float(token)
	        except ValueError:
	            continue

	        if 0.5 <= value <= 100000:
	            amounts.append(value)

	    if amounts:
	        return f"{max(amounts):.2f}"

	    return "Not Found"

	# =====================================================
	# PROFESSIONAL CLAIM PDF
	# =====================================================
	def create_pdf_base64(extracted):
	    """Render a one-page A4 claim form and return it base64-encoded.

	    Args:
	        extracted: Mapping with ``"company"``, ``"date"`` and ``"total"``
	            entries; the values are rendered with ``str()``.

	    Returns:
	        The PDF bytes, base64-encoded and decoded to ``str``.

	    Raises:
	        KeyError: If one of the three required keys is missing.
	    """
	    # One timestamp, so the Claim ID and "Generated On" always agree.
	    now = datetime.now()

	    # The context manager releases the buffer even if drawing fails.
	    with io.BytesIO() as buffer:

	        p = canvas.Canvas(buffer, pagesize=A4)
	        width, height = A4

	        # Colors
	        blue = HexColor("#0B5ED7")
	        dark = HexColor("#222222")
	        gray = HexColor("#666666")
	        light = HexColor("#F5F7FA")

	        # Header Bar
	        p.setFillColor(blue)
	        p.rect(0, height - 80, width, 80, fill=1, stroke=0)

	        p.setFillColorRGB(1, 1, 1)
	        p.setFont("Helvetica-Bold", 22)
	        p.drawString(40, height - 50, "INSURANCE CLAIM FORM")

	        p.setFont("Helvetica", 10)
	        p.drawRightString(width - 40, height - 52, "AI Generated Report")

	        # Body Box
	        p.setFillColor(light)
	        p.roundRect(35, height - 430, width - 70, 290, 10, fill=1, stroke=0)

	        y = height - 130

	        p.setFillColor(dark)
	        p.setFont("Helvetica-Bold", 14)
	        p.drawString(50, y, "Claim Summary")

	        y -= 35

	        rows = [
	            ("Claim ID", f"CLM-{now.strftime('%Y%m%d%H%M%S')}"),
	            ("Provider Name", extracted["company"]),
	            ("Bill Date", extracted["date"]),
	            # The built-in Helvetica face has no glyph for the rupee sign
	            # (U+20B9), so it would be drawn as a placeholder box; the
	            # ASCII currency code renders correctly.
	            ("Claim Amount", f"INR {extracted['total']}"),
	            ("Status", "Submitted"),
	            ("Generated On", now.strftime("%d-%m-%Y %H:%M")),
	        ]

	        # One line per row: gray bold label on the left, dark value at x=220.
	        for label, value in rows:

	            p.setFillColor(gray)
	            p.setFont("Helvetica-Bold", 11)
	            p.drawString(55, y, label)

	            p.setFillColor(dark)
	            p.setFont("Helvetica", 11)
	            p.drawString(220, y, str(value))

	            y -= 32

	        # Footer Notes
	        p.setFillColor(gray)
	        p.setFont("Helvetica", 9)
	        p.drawString(
	            40,
	            60,
	            "This document was generated automatically by AI Insurance Claim Generator."
	        )

	        p.drawRightString(
	            width - 40,
	            60,
	            "Confidential"
	        )

	        p.showPage()
	        p.save()

	        pdf_bytes = buffer.getvalue()

	    return base64.b64encode(pdf_bytes).decode()

	# =====================================================
	# MAIN EXTRACTION
	# =====================================================
	def extract_receipt(image):
	    """OCR a receipt image and extract company, date and total.

	    Steps: Tesseract word boxes -> token classification with the loaded
	    model (used for the company name only) -> ``extract_date`` and
	    ``extract_total`` over the OCR words.

	    Args:
	        image: A PIL image, or ``None`` when nothing was uploaded.

	    Returns:
	        ``{"company", "date", "total", "confidence"}`` on success, or
	        ``{"error": message}`` on any failure. Exceptions are converted
	        to the error form rather than raised.
	    """
	    if image is None:
	        return {"error": "No image provided"}

	    try:
	        image = image.convert("RGB")
	        # Cap the image at 1500x1500 before OCR.
	        image.thumbnail((1500, 1500))

	        data = pytesseract.image_to_data(
	            image,
	            output_type=pytesseract.Output.DICT,
	        )

	        words = []
	        boxes = []

	        columns = zip(
	            data["text"],
	            data["left"],
	            data["top"],
	            data["width"],
	            data["height"],
	        )
	        for raw, left, top, box_w, box_h in columns:
	            txt = raw.strip()

	            # Empty and single-character tokens are skipped
	            # (presumably OCR noise).
	            if len(txt) > 1:
	                words.append(txt)
	                boxes.append([left, top, left + box_w, top + box_h])

	        if not words:
	            return {"error": "No text detected"}

	        width, height = image.size
	        boxes = [normalize(b, width, height) for b in boxes]

	        encoding = tokenizer(
	            words,
	            boxes=boxes,
	            return_tensors="pt",
	            truncation=True,
	            padding="max_length",
	            max_length=512,
	            is_split_into_words=True,
	        )

	        # Token -> word map. It has to be read here, before the encoding
	        # is replaced by a plain dict of tensors.
	        word_ids = encoding.word_ids(batch_index=0)

	        inputs = {k: v.to(device) for k, v in encoding.items()}

	        with torch.no_grad():
	            outputs = model(**inputs)

	        # Per-token class probabilities for the single sequence in the batch.
	        probs = torch.softmax(outputs.logits, dim=2)[0]
	        token_confs, token_preds = probs.max(dim=1)
	        token_confs = token_confs.tolist()
	        token_preds = token_preds.tolist()

	        company_tokens = []
	        company_scores = []

	        # Predictions are per sub-token, and position 0 is a special
	        # token, so token positions do not line up with word positions.
	        # Each word takes the label of its first sub-token; special and
	        # padding positions (word id None) are skipped.
	        seen_words = set()
	        for position, word_index in enumerate(word_ids):
	            if word_index is None or word_index in seen_words:
	                continue
	            seen_words.add(word_index)

	            if id2label[token_preds[position]] == "COMPANY":
	                company_tokens.append(words[word_index])
	                company_scores.append(token_confs[position])

	        if company_tokens:
	            company = " ".join(company_tokens[:8])
	        else:
	            # No word tagged COMPANY: fall back to the first OCR words.
	            company = " ".join(words[:5])

	        company = clean_company(company)

	        date = extract_date(words)
	        total = extract_total(words)

	        # Heuristic score: mean COMPANY confidence plus fixed bonuses for
	        # a found date and total, capped at 0.99.
	        score = avg(company_scores)

	        if date != "Not Found":
	            score += 0.12

	        if total != "Not Found":
	            score += 0.18

	        score = min(score, 0.99)

	        return {
	            "company": company,
	            "date": date,
	            "total": total,
	            "confidence": round(score, 3),
	        }

	    except Exception as e:
	        # UI boundary: report the failure as data instead of raising.
	        return {"error": str(e)}

	# =====================================================
	# DECISION
	# =====================================================
	def decision_layer(conf):
	    """Map a confidence score to a routing decision.

	    Returns:
	        ``"AUTO_SEND"`` for ``conf >= 0.80``, ``"REVIEW"`` for
	        ``conf >= 0.60``, otherwise ``"REJECT"``.
	    """
	    # Thresholds are checked from highest to lowest; first hit wins.
	    for threshold, verdict in ((0.80, "AUTO_SEND"), (0.60, "REVIEW")):
	        if conf >= threshold:
	            return verdict

	    return "REJECT"

	# =====================================================
	# EMAIL SEND
	# =====================================================
	def send_claim_email(to_email, extracted):
	    """E-mail the claim summary with the generated PDF attached.

	    Posts to the Resend ``/emails`` endpoint using ``RESEND_API_KEY``.

	    Args:
	        to_email: Recipient address; surrounding whitespace is ignored.
	        extracted: Mapping with ``"company"``, ``"date"`` and ``"total"``.

	    Returns:
	        A human-readable status string. Failures (missing key or
	        recipient, PDF generation, network, non-2xx response) are
	        reported through the string rather than raised.
	    """
	    # Function-scope import keeps this block self-contained.
	    from html import escape

	    if not RESEND_API_KEY:
	        return "❌ Missing RESEND_API_KEY"

	    recipient = (to_email or "").strip()
	    if not recipient:
	        return "❌ Missing destination email"

	    subject = "Insurance Claim Request"

	    try:
	        pdf_b64 = create_pdf_base64(extracted)

	        # The values originate from OCR of an uploaded image, so they are
	        # escaped before being placed into markup.
	        html_body = (
	            "<h2>Insurance Claim Request</h2>\n"
	            f"<p><b>Provider:</b> {escape(str(extracted['company']))}</p>\n"
	            f"<p><b>Date:</b> {escape(str(extracted['date']))}</p>\n"
	            f"<p><b>Amount:</b> ₹{escape(str(extracted['total']))}</p>\n"
	            "<p>Attached: Professional Claim PDF</p>\n"
	        )

	        r = requests.post(
	            "https://api.resend.com/emails",
	            headers={
	                "Authorization": f"Bearer {RESEND_API_KEY}",
	                "Content-Type": "application/json",
	            },
	            json={
	                "from": FROM_EMAIL,
	                "to": [recipient],
	                "subject": subject,
	                "html": html_body,
	                "attachments": [
	                    {
	                        "filename": "claim_report.pdf",
	                        "content": pdf_b64,
	                    }
	                ],
	            },
	            timeout=20,
	        )

	    except Exception as e:
	        # UI boundary: any failure becomes a status message.
	        return f"❌ Email error: {str(e)}"

	    if r.status_code in (200, 201):
	        return f"✅ Email + PDF sent to {recipient}"

	    return f"❌ Email failed: {r.text}"

	# =====================================================
	# MAIN PIPELINE
	# =====================================================
	def process_and_send(image, email_id):
	    """Run extraction, route on confidence, and e-mail when allowed.

	    Args:
	        image: Receipt image handed to ``extract_receipt``.
	        email_id: Destination address, used only for ``AUTO_SEND``.

	    Returns:
	        ``(extracted, status)``. On extraction failure ``status`` is the
	        error message; otherwise ``extracted`` gains a ``"decision"`` key
	        and ``status`` describes what happened.
	    """
	    extracted = extract_receipt(image)
	    if "error" in extracted:
	        return extracted, extracted["error"]

	    conf = extracted["confidence"]
	    extracted["decision"] = decision_layer(conf)

	    # Only the top confidence tier sends mail; the others just report.
	    if extracted["decision"] == "AUTO_SEND":
	        return extracted, send_claim_email(email_id, extracted)

	    if extracted["decision"] == "REVIEW":
	        return extracted, f"⚠️ Human review required ({conf})"

	    return extracted, f"❌ Rejected ({conf})"

	# =====================================================
	# UI
	# =====================================================
	# Input widgets: the receipt image (handed over as a PIL image) and the
	# address the claim is mailed to.
	_inputs = [
	    gr.Image(type="pil", label="Upload Receipt"),
	    gr.Textbox(label="Destination Email"),
	]

	# Output widgets: the extraction result as JSON and the e-mail status.
	_outputs = [
	    gr.JSON(label="AI Extraction"),
	    gr.Textbox(label="Email Status"),
	]

	demo = gr.Interface(
	    fn=process_and_send,
	    inputs=_inputs,
	    outputs=_outputs,
	    title="📄 AI Insurance Claim Generator",
	    description="Upload receipt → Extract fields → Generate Claim PDF → Auto Email",
	)

	demo.launch()