# Source: Hugging Face Space by KarthiEz — commit "Update app.py" (1607f1d, verified)
import os
import io
from typing import List

import gradio as gr

# docTR imports (PyTorch backend)
from doctr.io import DocumentFile
from doctr.models import ocr_predictor

# ---------- One-time model bootstrap (CPU-friendly) ----------
# Ensure torch runs in CPU mode on Spaces; docTR auto-detects backend.
# You can optionally pin threads for stability on small CPU runners:
os.environ.setdefault("OMP_NUM_THREADS", "4")
os.environ.setdefault("MKL_NUM_THREADS", "4")

# Loaded once at import time and reused by every request handler below.
MODEL = ocr_predictor(pretrained=True)  # DBNet + CRNN (default) on PyTorch
def _collect_text_from_export(exported: dict) -> str:
"""Flatten docTR exported structure into newline-separated text per page."""
pages: List[dict] = exported.get("pages", [])
text_pages: List[str] = []
for page in pages:
page_lines = []
for block in page.get("blocks", []):
for line in block.get("lines", []):
# Join word values in the line; fallback robustly
words = [w.get("value", "") for w in line.get("words", []) if isinstance(w, dict)]
line_text = " ".join([w for w in words if w])
if line_text.strip():
page_lines.append(line_text)
text_pages.append("\n".join(page_lines).strip())
# Join pages with a page delimiter
return ("\n\n" + ("─" * 32) + " PAGE BREAK " + ("─" * 32) + "\n\n").join(
[tp for tp in text_pages if tp]
).strip()
def run_ocr(file: gr.File) -> str:
    """OCR an uploaded image/PDF and return the mapped invoice JSON as a string.

    NOTE(review): superseded by run_pipeline below, which wires the same OCR
    path into the UI; this function is no longer referenced by the Gradio app.
    It calls invoice_text_to_json and json, which are defined/imported later
    in the module — fine at call time, once the module has fully loaded.
    """
    if file is None:
        return "No file received."
    name = (file.name or "").lower()
    # Load as DocumentFile (handles PNG/JPG/PDF)
    if name.endswith(".pdf"):
        # Render PDF pages via pdfium backend under the hood (CPU OK)
        doc = DocumentFile.from_pdf(file=file.name)
    else:
        # Single image fallback; also works for TIFF/PNG/JPG
        doc = DocumentFile.from_images([file.name])
    # Inference
    result = MODEL(doc)
    exported = result.export()
    text = _collect_text_from_export(exported)
    print("Extracted Text:\n", text)
    if not text:
        return "No text detected."
    result_json = invoice_text_to_json(text)
    print(json.dumps(result_json, indent=2))
    string_json = json.dumps(result_json, indent=2)
    return string_json
import re
import json
from typing import List, Dict, Any
import copy
import numpy as np
import torch
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
# ----------------------------- Schema -----------------------------
# Target output shape: a flat "invoice_header" of string-or-None fields plus a
# list of "line_items" rows. deep_copy_schema() hands out fresh copies of this
# template; the module-level dict itself is never mutated.
SCHEMA_JSON: Dict[str, Any] = {
    "invoice_header": {
        "car_number": None,
        "shipment_number": None,
        "shipping_point": None,
        "currency": None,
        "invoice_number": None,
        "invoice_date": None,
        "order_number": None,
        "customer_order_number": None,
        "our_order_number": None,
        "sales_order_number": None,
        "purchase_order_number": None,
        "order_date": None,
        "supplier_name": None,
        "supplier_address": None,
        "supplier_phone": None,
        "supplier_email": None,
        "supplier_tax_id": None,
        "customer_name": None,
        "customer_address": None,
        "customer_phone": None,
        "customer_email": None,
        "customer_tax_id": None,
        "ship_to_name": None,
        "ship_to_address": None,
        "bill_to_name": None,
        "bill_to_address": None,
        "remit_to_name": None,
        "remit_to_address": None,
        "tax_id": None,
        "tax_registration_number": None,
        "vat_number": None,
        "payment_terms": None,
        "payment_method": None,
        "payment_reference": None,
        "bank_account_number": None,
        "iban": None,
        "swift_code": None,
        "total_before_tax": None,
        "tax_amount": None,
        "tax_rate": None,
        "shipping_charges": None,
        "discount": None,
        "total_due": None,
        "amount_paid": None,
        "balance_due": None,
        "due_date": None,
        "invoice_status": None,
        "reference_number": None,
        "project_code": None,
        "department": None,
        "contact_person": None,
        "notes": None,
        "additional_info": None
    },
    "line_items": [
        {
            "quantity": None,
            "units": None,
            "description": None,
            "footage": None,
            "price": None,
            "amount": None,
            "notes": None
        }
    ]
}
# Header keys in schema order; used as the semantic-matching target list.
STATIC_HEADERS: List[str] = list(SCHEMA_JSON["invoice_header"].keys())

# Synonym map: normalized candidate-key substring -> schema header key.
# Matching is substring-based (see semantic_map_candidates), so short entries
# like "bank" also catch longer labels such as "bank name".
SYN2KEY: Dict[str, str] = {
    "invoice no": "invoice_number",
    "invoice number": "invoice_number",
    "invoice#": "invoice_number",
    "inv no": "invoice_number",
    "inv#": "invoice_number",
    "invoice date": "invoice_date",
    "date of invoice": "invoice_date",
    "po no": "purchase_order_number",
    "po number": "purchase_order_number",
    "purchase order": "purchase_order_number",
    "order no": "order_number",
    "order number": "order_number",
    "sales order": "sales_order_number",
    "customer order": "customer_order_number",
    "our order": "our_order_number",
    "due date": "due_date",
    "date of supply": "order_date",
    "gstin": "supplier_tax_id",
    "gstin no": "supplier_tax_id",
    "tax id": "tax_id",
    "vat number": "vat_number",
    "tax registration number": "tax_registration_number",
    "place of supply": "shipping_point",
    "state code": "additional_info",
    "taxable value": "total_before_tax",
    "total value": "total_due",
    "total amount": "total_due",
    "amount due": "total_due",
    "bank": "bank_account_number",
    "account no": "bank_account_number",
    "account number": "bank_account_number",
    "ifs code": "swift_code",
    # NOTE(review): "ifsc" maps to payment_reference while "ifs code" maps to
    # swift_code — looks inconsistent; confirm intended target before changing.
    "ifsc": "payment_reference",
    "swift code": "swift_code",
    "iban": "iban",
    "e-way bill no": "reference_number",
    "eway bill": "reference_number",
    "dispatched via": "additional_info",
    "documents dispatched through": "additional_info",
    "kind attn": "contact_person",
    "billed to": "bill_to_name",
    "receiver": "bill_to_name",
    "shipped to": "ship_to_name",
    "consignee": "ship_to_name",
}
def norm(s: str) -> str:
    """Collapse runs of whitespace to single spaces and strip the ends."""
    return " ".join(s.split())
def deep_copy_schema() -> Dict[str, Any]:
    """Return an independent deep copy of SCHEMA_JSON.

    Uses copy.deepcopy (already imported at module level) instead of the
    original json.loads(json.dumps(...)) round-trip: identical result for
    this JSON-safe template, without serializing to a string first.
    """
    return copy.deepcopy(SCHEMA_JSON)
def extract_candidates(text: str) -> Dict[str, str]:
    """Collect key:value candidate pairs from colon-separated lines.

    Pipe-delimited rows are split into cells first, each cell contributing
    its own key:value pair. A second pass picks up monetary totals written
    as "<label> <number>" anywhere in the text.
    """
    found: Dict[str, str] = {}
    for raw in text.splitlines():
        line = raw.strip().strip("|").strip()
        if not line or ":" not in line:
            continue
        if "|" in raw:
            # Table row: every non-empty cell may carry a key:value pair.
            for cell in (p.strip() for p in raw.split("|")):
                if cell and ":" in cell:
                    key, val = cell.split(":", 1)
                    found[norm(key)] = norm(val)
        else:
            key, val = line.split(":", 1)
            found[norm(key)] = norm(val)
    # Second pass: totals that may appear without a colon separator.
    money = re.compile(
        r"\b(Taxable\s+Value|Total\s+Value|Total\s+Amount|Amount\s+Due)\b[:\s]*([0-9][0-9,]*(?:\.[0-9]{2})?)",
        re.I,
    )
    for raw in text.splitlines():
        hit = money.search(raw)
        if hit:
            found[norm(hit.group(1))] = norm(hit.group(2))
    return found
def regex_extract_all(text: str) -> Dict[str, str]:
    """Extract header fields from invoice text via hard-coded regexes.

    Returns a dict keyed by schema header names; only fields actually found
    appear. Monetary values have thousands separators stripped.

    Fix: the two bare `except:` clauses around the float conversions are
    narrowed to ValueError — a bare except also swallows SystemExit and
    KeyboardInterrupt, which must propagate.
    """
    out: Dict[str, str] = {}
    m = re.search(r"\bInvoice\s*(?:No\.?|Number|#)\s*[:\-]?\s*([A-Z0-9\-\/]+)", text, re.I)
    if m: out["invoice_number"] = m.group(1)
    m = re.search(r"\bInvoice\s*Date\s*[:\-]?\s*([0-9]{1,2}[-/][0-9]{1,2}[-/][0-9]{2,4})", text, re.I)
    if m: out["invoice_date"] = m.group(1)
    m = re.search(r"\bPO\s*(?:No\.?|Number)?\s*[:\-]?\s*([A-Z0-9\-\/]+)", text, re.I)
    if m: out["purchase_order_number"] = m.group(1)
    m = re.search(r"\bPO\s*Date\s*[:\-]?\s*([0-9]{1,2}[-/][0-9]{1,2}[-/][0-9]{2,4})", text, re.I)
    if m: out["order_date"] = m.group(1)
    if "order_date" not in out:
        # Fallback label used on Indian GST invoices.
        m = re.search(r"\bDate\s*of\s*Supply\s*[:\-]?\s*([0-9]{1,2}[-/][0-9]{1,2}[-/][0-9]{2,4})", text, re.I)
        if m: out["order_date"] = m.group(1)
    m = re.search(r"\bPlace\s*of\s*Supply\s*[:\-]?\s*([A-Za-z0-9 ,\-\(\)]+)", text, re.I)
    if m: out["shipping_point"] = m.group(1).strip(" |")
    m = re.search(r"\bGSTIN\s*(?:No\.?)?\s*[:\-]?\s*([A-Z0-9]{15})", text, re.I)
    if m: out["supplier_tax_id"] = m.group(1)
    m = re.search(r"\bTaxable\s*Value\s*[:\-]?\s*([0-9][0-9,]*(?:\.[0-9]{2})?)", text, re.I)
    if m: out["total_before_tax"] = m.group(1).replace(",", "")
    # GST totals: tax_amount = CGST + SGST values; tax_rate = CGST% + SGST%.
    cgst = re.search(r"\bCGST\s*Value\s*[:\-]?\s*([0-9][0-9,]*(?:\.[0-9]{2})?)", text, re.I)
    sgst = re.search(r"\bSGST\s*Value\s*[:\-]?\s*([0-9][0-9,]*(?:\.[0-9]{2})?)", text, re.I)
    if cgst and sgst:
        try:
            tax_total = float(cgst.group(1).replace(",", "")) + float(sgst.group(1).replace(",", ""))
            out["tax_amount"] = f"{tax_total:.2f}"
            cgstp = re.search(r"\bCGST\s*%?\s*[:\-]?\s*([0-9]+(?:\.[0-9]+)?)", text, re.I)
            sgstp = re.search(r"\bSGST\s*%?\s*[:\-]?\s*([0-9]+(?:\.[0-9]+)?)", text, re.I)
            if cgstp and sgstp:
                try:
                    rate = float(cgstp.group(1)) + float(sgstp.group(1))
                    out["tax_rate"] = f"{rate:g}"
                except ValueError:
                    pass  # malformed percentage; skip tax_rate only
        except ValueError:
            pass  # malformed CGST/SGST value; skip tax_amount
    m = re.search(r"\bE[-\s]?Way\s*bill\s*no\.?\s*[:\-]?\s*([0-9 ]+)", text, re.I)
    if m: out["reference_number"] = m.group(1).strip()
    return out
def extract_bank_block(text: str) -> Dict[str, str]:
bank: Dict[str, str] = {}
m = re.search(r"\bAccount\s*Name\s*:\s*(.+)", text, re.I)
if m: bank["supplier_name"] = m.group(1).strip()
m = re.search(r"\bAccount\s*(?:No|Number)\s*:\s*([A-Za-z0-9\- ]+)", text, re.I)
if m: bank["bank_account_number"] = m.group(1).strip()
m = re.search(r"\bBank\s*:\s*([A-Za-z0-9 ,\-\(\)&]+)", text, re.I)
if m:
bank["additional_info"] = ("Bank: " + m.group(1).strip())
m = re.search(r"\bIFSC?\s*Code\s*:\s*([A-Za-z0-9]+)", text, re.I)
if m: bank["payment_reference"] = m.group(1).strip()
m = re.search(r"\bSWIFT\s*Code\s*:\s*([A-Za-z0-9]+)", text, re.I)
if m: bank["swift_code"] = m.group(1).strip()
branch = re.search(r"\bBranch\s*:\s*(.+)", text, re.I)
micr = re.search(r"\bMICR\s*Code\s*:\s*([0-9]+)", text, re.I)
extra_bits = []
if branch: extra_bits.append("Branch: " + branch.group(1).strip())
if micr: extra_bits.append("MICR: " + micr.group(1).strip())
if extra_bits:
bank["additional_info"] = ((bank.get("additional_info") + " | ") if bank.get("additional_info") else "") + " | ".join(extra_bits)
return bank
def _has_real_items(items) -> bool:
return (
isinstance(items, list)
and any(
isinstance(row, dict)
and any(val not in (None, "", "null") for val in row.values())
for row in items
)
)
def parse_line_items(text: str) -> List[Dict[str, Any]]:
    """
    Dynamic, header-agnostic line-item extractor.
    - Auto-detects header row (no hardcoded labels)
    - Supports pipe '|' tables, multi-space/tab tables, and stacked/vertical layouts
    - Fuzzy maps arbitrary headers to: description, quantity, units, price, amount
    - Stitches wrapped descriptions; stops at totals/subtotals
    """
    # Local imports keep this function self-contained (heavy deps loaded lazily).
    import re
    from typing import List, Dict, Any
    import torch
    from sentence_transformers import SentenceTransformer, util
    # ---- local helpers (encapsulated; no external edits required) ----
    def _tokenize_row(row: str) -> List[str]:
        # Prefer pipe cells; otherwise split on tabs / runs of 2+ spaces.
        if "|" in row:
            toks = [c.strip(" -") for c in row.split("|")]
        else:
            toks = re.split(r"\t+| {2,}", row)
            toks = [c.strip(" -") for c in toks]
        return [t for t in toks if t]
    def _looks_like_separator(row: str) -> bool:
        # Rows made only of dashes/equals/dashes variants are table rules.
        return bool(re.fullmatch(r"[-=–—\s]+", row))
    def _numlike(s: str) -> bool:
        # Optionally currency-prefixed number with thousands separators.
        return bool(re.fullmatch(r"[₹$€]?\s*\d[\d,]*(?:\.\d+)?", s.strip()))
    def _normalize_num(s: str | None) -> str | None:
        if not s: return None
        return s.replace(",", "").replace("₹", "").replace("$", "").replace("€", "").strip() or None
    # Rows starting with any totals keyword terminate item parsing.
    STOP = re.compile(r"^\s*(subtotal|tax|vat|gst|cgst|sgst|igst|total\b|grand total|amount due|balance due)\b", re.I)
    # Canonical targets + synonyms (broad, non-brittle)
    CANON = ["description", "quantity", "units", "price", "amount"]
    SYN = {
        "description": ["description", "item", "details", "product", "material", "article", "part no", "part", "goods desc"],
        "quantity": ["qty", "quantity", "qnty", "pcs", "pieces", "units qty", "ordered qty"],
        "units": ["uom", "unit", "units", "measure", "type", "pkg", "pack", "u/m"],
        "price": ["rate", "price", "unit price", "cost", "u/price", "list price"],
        "amount": ["amount", "total", "line total", "ext price", "net", "value", "extended"]
    }
    def _find_header_idx(lines: List[str]) -> int:
        """Heuristic header detection for horizontal tables."""
        for i, row in enumerate(lines):
            if _looks_like_separator(row):
                continue
            toks = _tokenize_row(row)
            if len(toks) < 3:
                continue
            # low numeric density
            if sum(_numlike(t) for t in toks) > len(toks) // 2:
                continue
            # at least 2 synonym hits
            hits = 0
            lowt = [t.lower() for t in toks]
            for t in lowt:
                for syns in SYN.values():
                    if any(s in t for s in syns):
                        hits += 1
                        break
            if hits >= 2:
                return i
        return -1
    def _map_headers_dynamic(header_tokens: List[str], model) -> Dict[int, str]:
        """
        Map arbitrary header tokens to canonical keys via:
        1) direct/synonym contains
        2) semantic similarity (best match)
        """
        mapped: Dict[int, str] = {}
        used = set()
        low = [h.lower() for h in header_tokens]
        # 1) substring / synonyms
        for j, h in enumerate(low):
            for key, syns in SYN.items():
                if any(s in h for s in syns):
                    if key not in used:
                        mapped[j] = key
                        used.add(key)
                    break
        # 2) semantic backstop for unmapped
        remaining = [j for j in range(len(header_tokens)) if j not in mapped]
        if remaining:
            label_texts, label_keys = [], []
            for k, syns in SYN.items():
                for s in syns + [k]:
                    label_texts.append(s)
                    label_keys.append(k)
            h_emb = model.encode([header_tokens[i] for i in remaining], normalize_embeddings=True)
            l_emb = model.encode(label_texts, normalize_embeddings=True)
            sim = util.cos_sim(torch.tensor(h_emb), torch.tensor(l_emb)).cpu().numpy()
            for ri, j in enumerate(remaining):
                k_best = int(sim[ri].argmax())
                key = label_keys[k_best]
                if key not in used:
                    mapped[j] = key
                    used.add(key)
        return mapped
    def _parse_horizontal(lines: List[str]) -> List[Dict[str, Any]]:
        """Parse pipe/whitespace horizontal tables with dynamic headers."""
        header_idx = _find_header_idx(lines)
        if header_idx == -1:
            return []
        header_tokens = _tokenize_row(lines[header_idx])
        # lazy singleton on the function for perf (no external changes)
        if not hasattr(parse_line_items, "_sent_model"):
            parse_line_items._sent_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # type: ignore[attr-defined]
        sm = parse_line_items._sent_model  # type: ignore[attr-defined]
        idx2key = _map_headers_dynamic(header_tokens, sm)
        items: List[Dict[str, Any]] = []
        for row in lines[header_idx + 1:]:
            if _looks_like_separator(row):
                continue
            if STOP.search(row):
                break
            toks = _tokenize_row(row)
            # continuation-line heuristic (wrapped description)
            if (len(toks) == 1 or len(toks) < (max(idx2key.keys(), default=-1) + 1)) and items:
                last = items[-1]
                prev = (last.get("description") or "").strip()
                last["description"] = (prev + " " + toks[0]).strip() if toks else prev
                continue
            rowd = {"description": None, "quantity": None, "units": None,
                    "price": None, "amount": None, "footage": None, "notes": None}
            for j, tok in enumerate(toks):
                key = idx2key.get(j)
                if not key:
                    continue
                val = tok.strip()
                if key in ("quantity", "price", "amount"):
                    val = _normalize_num(val)
                rowd[key] = val or rowd.get(key)
            if rowd["quantity"] and rowd["units"]:
                rowd["footage"] = f'{rowd["quantity"]} {rowd["units"]}'
            if any(rowd.get(k) for k in ("description", "amount", "price")):
                items.append(rowd)
        # prune empties
        return [it for it in items if any(v for k, v in it.items() if k != "notes")]
    def _parse_vertical(text: str) -> List[Dict[str, Any]]:
        """
        Deterministic stacked/vertical parser for blocks like:
        Description
        Type
        Quantity
        Rate
        Amount
        <desc1>
        <type1>
        <qty1>
        <rate1>
        <amt1>
        <desc2> ...
        Stops at totals/subtotals.
        """
        lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
        if not lines:
            return []
        # Find the exact 5-label header block (order-agnostic but contiguous)
        LABELS = ["description", "type", "quantity", "rate", "amount"]
        def is_label(s: str) -> str | None:
            t = s.lower()
            # Numeric lines can never be header labels.
            if re.fullmatch(r"[₹$€]?\s*\d[\d,]*(?:\.\d+)?", t):
                return None
            if "desc" in t or "item" in t or "product" in t or "material" in t or "article" in t:
                return "description"
            if "type" in t or "uom" in t or "unit" in t or "units" in t:
                return "type"
            if "qty" in t or "quantity" in t:
                return "quantity"
            if "rate" in t or "price" in t or "unit price" in t:
                return "rate"
            if "amount" in t or "total" in t:
                return "amount"
            return None
        start = -1
        for i in range(len(lines) - 4):
            block = lines[i:i+5]
            mapped = [is_label(x) for x in block]
            if None not in mapped and len(set(mapped)) == 5:
                start = i
                header_keys = mapped  # e.g. ["description","type","quantity","rate","amount"]
                break
        if start == -1:
            return []
        # Build a position→canonical map in this exact order
        pos2key = {idx: key for idx, key in enumerate(header_keys)}
        # Consume values in chunks of 5
        items: List[Dict[str, Any]] = []
        i = start + 5
        STOP = re.compile(r"^\s*(subtotal|tax|vat|gst|cgst|sgst|igst|total\b|grand total|amount due|balance due)\b", re.I)
        def norm_num(s: str | None) -> str | None:
            if not s: return None
            return s.replace(",", "").replace("₹", "").replace("$", "").replace("€", "").strip() or None
        while i + 4 < len(lines):
            if STOP.search(lines[i]):  # hit totals, bail
                break
            chunk = lines[i:i+5]
            row = {"description": None, "units": None, "quantity": None,
                   "price": None, "amount": None, "footage": None, "notes": None}
            # map chunk by discovered order
            for j, val in enumerate(chunk):
                key = pos2key[j]
                if key == "type":
                    row["units"] = val  # map "Type" -> "units"
                elif key == "quantity":
                    row["quantity"] = norm_num(val)
                elif key == "rate":
                    row["price"] = norm_num(val)
                elif key == "amount":
                    row["amount"] = norm_num(val)
                elif key == "description":
                    row["description"] = val
            if row["quantity"] and row["units"]:
                row["footage"] = f'{row["quantity"]} {row["units"]}'
            # minimal acceptance: description or amount or price
            if any(row.get(k) for k in ("description", "amount", "price")):
                items.append(row)
            i += 5
        return items
    # ---- main body ----
    raw_lines = [ln.rstrip() for ln in text.splitlines()]
    lines = [ln for ln in raw_lines if ln.strip()]
    if not lines:
        return []
    # 1) Try horizontal first
    items = _parse_horizontal(lines)
    if items:
        return items
    # 2) Fallback to vertical/stacked
    items = _parse_vertical(text)
    return items
def semantic_map_candidates(candidates: Dict[str, str], static_headers: List[str], thresh: float, sentence_model) -> Dict[str, str]:
    """Map free-form candidate keys to schema header keys.

    First tries the SYN2KEY synonym table (substring match against a
    lowercase, alphanumeric-normalized key); anything left over is matched
    to static_headers by sentence-embedding cosine similarity and kept only
    when the best score reaches `thresh`.

    Fix: removed the unused local `cand_keys` (and the redundant `lk`
    intermediate). sentence_model is only invoked when synonym matching
    leaves leftovers.
    """
    if not candidates:
        return {}
    mapped: Dict[str, str] = {}
    leftovers: Dict[str, str] = {}
    for k, v in candidates.items():
        # Normalize so "Invoice-No." still hits the "invoice no" synonym.
        lk_norm = re.sub(r"[^a-z0-9]+", " ", k.lower()).strip()
        hit = None
        for syn, key in SYN2KEY.items():
            if syn in lk_norm:
                hit = key
                break
        if hit:
            mapped[hit] = v
        else:
            leftovers[k] = v
    if leftovers:
        # Semantic backstop: embed leftover keys and headers, take argmax.
        cand_emb = sentence_model.encode(list(leftovers.keys()), normalize_embeddings=True)
        head_emb = sentence_model.encode(static_headers, normalize_embeddings=True)
        M = util.cos_sim(torch.tensor(cand_emb), torch.tensor(head_emb)).cpu().numpy()
        for i, ck in enumerate(leftovers):
            j = int(np.argmax(M[i]))
            score = float(M[i][j])
            if score >= thresh:
                mapped[static_headers[j]] = leftovers[ck]
    return mapped
def build_prompt(invoice_text: str, mapped_hints: Dict[str, str], items_hints: List[Dict[str, Any]]) -> str:
    """Assemble the text2text prompt for the MD2JSON model.

    Prompt = fixed schema instruction + raw invoice text + optional hint
    sections built from the deterministic extraction results.

    Fix: the bare `except:` around json.dumps is narrowed to
    (TypeError, ValueError) — serialization failures only; hints are
    best-effort, so they are dropped rather than aborting prompt building.
    """
    instruction = (
        'Use this schema:\n'
        '{\n'
        ' "invoice_header": {\n'
        ' "car_number": "string or null",\n'
        ' "shipment_number": "string or null",\n'
        ' "shipping_point": "string or null",\n'
        ' "currency": "string or null",\n'
        ' "invoice_number": "string or null",\n'
        ' "invoice_date": "string or null",\n'
        ' "order_number": "string or null",\n'
        ' "customer_order_number": "string or null",\n'
        ' "our_order_number": "string or null",\n'
        ' "sales_order_number": "string or null",\n'
        ' "purchase_order_number": "string or null",\n'
        ' "order_date": "string or null",\n'
        ' "supplier_name": "string or null",\n'
        ' "supplier_address": "string or null",\n'
        ' "supplier_phone": "string or null",\n'
        ' "supplier_email": "string or null",\n'
        ' "supplier_tax_id": "string or null",\n'
        ' "customer_name": "string or null",\n'
        ' "customer_address": "string or null",\n'
        ' "customer_phone": "string or null",\n'
        ' "customer_email": "string or null",\n'
        ' "customer_tax_id": "string or null",\n'
        ' "ship_to_name": "string or null",\n'
        ' "ship_to_address": "string or null",\n'
        ' "bill_to_name": "string or null",\n'
        ' "bill_to_address": "string or null",\n'
        ' "remit_to_name": "string or null",\n'
        ' "remit_to_address": "string or null",\n'
        ' "tax_id": "string or null",\n'
        ' "tax_registration_number": "string or null",\n'
        ' "vat_number": "string or null",\n'
        ' "payment_terms": "string or null",\n'
        ' "payment_method": "string or null",\n'
        ' "payment_reference": "string or null",\n'
        ' "bank_account_number": "string or null",\n'
        ' "iban": "string or null",\n'
        ' "swift_code": "string or null",\n'
        ' "total_before_tax": "string or null",\n'
        ' "tax_amount": "string or null",\n'
        ' "tax_rate": "string or null",\n'
        ' "shipping_charges": "string or null",\n'
        ' "discount": "string or null",\n'
        ' "total_due": "string or null",\n'
        ' "amount_paid": "string or null",\n'
        ' "balance_due": "string or null",\n'
        ' "due_date": "string or null",\n'
        ' "invoice_status": "string or null",\n'
        ' "reference_number": "string or null",\n'
        ' "project_code": "string or null",\n'
        ' "department": "string or null",\n'
        ' "contact_person": "string or null",\n'
        ' "notes": "string or null",\n'
        ' "additional_info": "string or null"\n'
        ' },\n'
        ' "line_items": [\n'
        ' {\n'
        ' "quantity": "string or null",\n'
        ' "units": "string or null",\n'
        ' "description": "string or null",\n'
        ' "footage": "string or null",\n'
        ' "price": "string or null",\n'
        ' "amount": "string or null",\n'
        ' "notes": "string or null"\n'
        ' }\n'
        ' ]\n'
        '}\n'
        'If a field is missing for a line item or header, use null. '
        'Do not invent fields. Do not add any header or shipment data to any line item. '
        'Return ONLY the JSON object, no explanation.\n'
    )
    hints = ""
    if mapped_hints:
        hints += "\nHints (header):\n" + " ".join([f"#{k}: {v}" for k, v in mapped_hints.items()])
    if items_hints:
        try:
            hints += "\nHints (line_items):\n" + json.dumps(items_hints, ensure_ascii=False)
        except (TypeError, ValueError):
            pass  # unserializable hint content; omit the hint section
    return instruction + "\nInvoice Text:\n" + invoice_text.strip() + hints
def strict_json(text: str) -> Dict[str, Any]:
    """Parse model output as JSON, salvaging the outermost {...} span when
    the raw text carries leading/trailing noise.

    Fix: the two bare `except:` clauses are narrowed to
    json.JSONDecodeError — a bare except also swallows SystemExit and
    KeyboardInterrupt.

    Raises:
        ValueError: when no parseable JSON object can be recovered.
    """
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass
    # Fall back to the widest brace-delimited span.
    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end != -1 and end > start:
        try:
            return json.loads(text[start:end+1])
        except json.JSONDecodeError:
            pass
    raise ValueError("Model did not return valid JSON.")
def merge_schema(rule_json: Dict[str, Any], model_json: Dict[str, Any]) -> Dict[str, Any]:
    """Combine rule-based and model JSON; rule-derived values win.

    Header fields: a model value fills a slot only when the rule value is
    empty. Line items: parsed (rule) items are preferred, model items are
    the fallback, otherwise an empty list.
    """
    merged = copy.deepcopy(rule_json)
    empties = (None, "", "null")
    # --- headers (rules win where present) ---
    header = merged["invoice_header"]
    model_header = model_json.get("invoice_header") or {}
    for field in header:
        if header[field] in empties:
            candidate = model_header.get(field, None)
            if candidate not in empties:
                header[field] = candidate
    # --- line_items (prefer parsed items -> model -> empty) ---
    parsed_items = rule_json.get("line_items") or []
    generated_items = model_json.get("line_items") or []
    if _has_real_items(parsed_items):
        merged["line_items"] = parsed_items
    elif _has_real_items(generated_items):
        merged["line_items"] = generated_items
    else:
        merged["line_items"] = []
    return merged
def _prune_empty_items(payload: Dict[str, Any]) -> Dict[str, Any]:
items = payload.get("line_items")
if isinstance(items, list):
payload["line_items"] = [
it for it in items
if isinstance(it, dict) and any(v not in (None, "", "null") for v in it.values())
]
return payload
# ---------------------- MAIN FUNCTION ----------------------
def invoice_text_to_json(
    invoice_text: str,
    threshold: float = 0.60,
    max_new_tokens: int = 512
) -> Dict[str, Any]:
    """Convert raw invoice text into the schema-shaped JSON dict.

    Pipeline:
      1. Deterministic extraction (key:value candidates, hard regexes,
         bank block, line items).
      2. Semantic mapping of leftover candidate keys to schema headers.
      3. MD2JSON T5 generation with the deterministic results as hints.
      4. Merge, with rule-based values winning over model output.

    Args:
        invoice_text: OCR'd or pasted invoice text.
        threshold: cosine-similarity cutoff for semantic header mapping.
        max_new_tokens: generation budget for the T5 converter.

    Fix: the embedding model and T5 pipeline were reloaded on EVERY call;
    they are now cached as lazy singletons on the function, matching the
    pattern already used by parse_line_items._sent_model.
    """
    if not hasattr(invoice_text_to_json, "_models"):
        invoice_text_to_json._models = (  # type: ignore[attr-defined]
            SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2"),
            pipeline("text2text-generation", model="yahyakhoder/MD2JSON-T5-small-V1"),
        )
    sentence_model, json_converter = invoice_text_to_json._models  # type: ignore[attr-defined]
    txt = invoice_text
    # 1) Deterministic extraction
    candidates = extract_candidates(txt)
    hard = regex_extract_all(txt)
    bank = extract_bank_block(txt)
    items = parse_line_items(txt)
    print("Extracted line items:", items)
    sem_mapped = semantic_map_candidates(candidates, STATIC_HEADERS, threshold, sentence_model)
    # Later updates win: regex hits override semantic guesses, bank last.
    header_found: Dict[str, Any] = {}
    header_found.update(sem_mapped)
    header_found.update(hard)
    header_found.update(bank)
    # 2) Build RULE JSON (schema-shaped, rules filled)
    rule_json = deep_copy_schema()
    if _has_real_items(items):
        rule_json["line_items"] = items
    else:
        rule_json["line_items"] = []
    for k, v in header_found.items():
        if k in rule_json["invoice_header"]:
            rule_json["invoice_header"][k] = v
    # 3) MD2JSON generation with strong hints
    prompt = build_prompt(txt, header_found, items)
    gen = json_converter(prompt, max_new_tokens=max_new_tokens)[0]["generated_text"]
    try:
        model_json = strict_json(gen)
    except Exception:
        model_json = deep_copy_schema()  # model failed; keep empty shape
    # 4) Final merge (rules win)
    final_json = merge_schema(rule_json, model_json)
    final_json = _prune_empty_items(final_json)
    return final_json
from typing import Optional
# ----- replace old run_ocr with unified dispatcher -----
def run_pipeline(file: Optional[gr.File], raw_txt: Optional[str]) -> str:
    """
    Orchestrates two intake lanes:
    1) If raw_txt is provided (non-empty), skip OCR → directly map to schema.
    2) Else, run OCR on the uploaded file and map to schema.

    Returns either pretty-printed JSON or a human-readable error string
    (the Gradio output widget renders whichever comes back).
    """
    raw_txt = (raw_txt or "").strip()
    # Lane A: Raw text → JSON (takes priority over an uploaded file)
    if raw_txt:
        try:
            result_json = invoice_text_to_json(raw_txt)
            return json.dumps(result_json, indent=2, ensure_ascii=False)
        except Exception as e:
            return f"Error while converting pasted text to JSON schema: {e}"
    # Lane B: File → OCR → JSON
    if not file:
        return "No input received. Upload an image/PDF or paste raw text."
    try:
        name = (file.name or "").lower()
        # Load as DocumentFile (handles PNG/JPG/PDF)
        if name.endswith(".pdf"):
            doc = DocumentFile.from_pdf(file=file.name)
        else:
            doc = DocumentFile.from_images([file.name])
        # Inference
        result = MODEL(doc)
        exported = result.export()
        text = _collect_text_from_export(exported)
        if not text:
            return "No text detected by OCR."
        result_json = invoice_text_to_json(text)
        return json.dumps(result_json, indent=2, ensure_ascii=False)
    except Exception as e:
        # Top-level UI boundary: surface the failure as a message, not a crash.
        return f"OCR pipeline error: {e}"
# ---------- Gradio UI ----------
TITLE = "docTR OCR — Text Extractor"
DESC = (
    "Upload an image or PDF OR paste raw text. Uses docTR for OCR or directly maps raw text to the invoice JSON schema."
)
with gr.Blocks(theme="soft", title=TITLE) as demo:
    gr.Markdown(f"# {TITLE}\n{DESC}")
    with gr.Tabs():
        with gr.Tab("Upload File"):
            inp = gr.File(
                label="Upload image/PDF",
                file_types=[".png", ".jpg", ".jpeg", ".tif", ".tiff", ".pdf"]
            )
            # keep symmetrical inputs for single-click wiring
            # NOTE(review): raw_txt_hidden is created but never wired to the
            # click handler — presumably a placeholder; confirm before removing.
            raw_txt_hidden = gr.Textbox(visible=False)
        with gr.Tab("Paste Raw Text"):
            raw_txt = gr.Textbox(
                label="Paste raw invoice text (we’ll map directly to JSON schema)",
                lines=18,
                placeholder="Paste the OCR’d/plain text of the invoice here…"
            )
            # NOTE(review): file_hidden is likewise unused by the handler.
            file_hidden = gr.File(visible=False)
    out = gr.Code(label="Extracted JSON", language="json")
    run_btn = gr.Button("Generate JSON", variant="primary")
    # One button → unified function; we pass both lanes (visible/hidden).
    # run_pipeline prioritizes pasted text over the uploaded file.
    run_btn.click(
        fn=run_pipeline,
        inputs=[inp, raw_txt],
        outputs=out,
    )
    gr.Markdown(
        "ℹ️ **Usage:** Prefer *Paste Raw Text* when you already have text. "
        "If both file and text are provided, we’ll **prioritize the pasted text**."
    )

# Entry point for `python app.py`; share=True also exposes a public Gradio link.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True, show_error=True)