Spaces:

build-small-hackathon
/

statementsetu

Sleeping

App Files Files Community

statementsetu / extraction.py

perceptron01

Upload 3 files

6331876 verified 18 days ago

Raw

History Blame Contribute Delete

12.2 kB

	"""Step A + B: extraction of raw transaction rows from a PDF or image.

	Two paths:
	* Digital PDF with a text layer -> pdfplumber tables (no model, fast, free)
	* Scanned PDF or image -> MiniCPM-V vision model, page-by-page

	The vision path is optional: if torch/transformers/spaces aren't installed
	(e.g. running locally on CPU), it degrades gracefully and reports why.
	"""

	import json
	import os
	import re

	# Vision model checkpoints. The model itself is loaded lazily (see
	# _load_vision): VISION_MODEL_ID can be overridden through the environment,
	# and VISION_FALLBACK_ID is tried when the primary checkpoint fails to load.
	# MiniCPM-V-4.6 is the newest MiniCPM-V and is transformers-native (uses the
	# standard AutoModelForImageTextToText API). The older 2_6 relied on a custom
	# .chat() method that breaks on recent transformers.
	VISION_MODEL_ID = os.environ.get("VISION_MODEL_ID", "openbmb/MiniCPM-V-4.6")
	VISION_FALLBACK_ID = "Qwen/Qwen2.5-VL-7B-Instruct"

	# Instruction sent together with every page image. The row schema uses the
	# same keys that _parse_model_json reads back out of the model's reply.
	VISION_PROMPT = """You are reading one page of an Indian bank statement. Extract EVERY transaction row into JSON.
	Output ONLY a JSON array, no markdown, no commentary. Schema per row:
	{"date": "<as printed>", "narration": "<full narration text>", "ref_no": "<or null>",
	"debit": <number or null>, "credit": <number or null>, "balance": <number or null>}
	Rules:
	- One object per transaction row. Skip headers, footers, page totals, opening balance lines.
	- Amounts: numbers only, no commas, no currency symbols.
	- If a narration wraps across lines, merge it into one string.
	- If a cell is unreadable, use null. NEVER invent values."""


	# --------------------------------------------------------------------------- #
	# Helpers
	# --------------------------------------------------------------------------- #
	def parse_amount(raw):
	    """Convert a printed or numeric amount into a float.

	    Examples: '1,234.50' -> 1234.5, '12500.00 Cr' -> 12500.0,
	    'Rs. 500' -> 500.0, '' -> None.

	    Args:
	        raw: Cell text, a number (e.g. a value decoded from model JSON),
	            or None.

	    Returns:
	        The amount as a float, or None when the value is missing,
	        unparseable, non-finite, or exactly zero (zero is treated as
	        "no amount").
	    """
	    if raw is None:
	        return None
	    if isinstance(raw, (int, float)) and not isinstance(raw, bool):
	        # Numbers skip the text cleanup: str(1e16) is '1e+16', which the
	        # character filter below would mangle into '116'.
	        try:
	            value = float(raw)
	        except OverflowError:
	            return None
	        if value != value or value in (float("inf"), float("-inf")):
	            return None
	        return value if value != 0 else None
	    s = str(raw).strip()
	    if not s:
	        return None
	    # Drop the dot of abbreviations such as 'Rs.' / 'Cr.' first; otherwise
	    # it survives the filter and 'Rs. 500' is read as '.500' (0.5).
	    s = re.sub(r"(?<=[A-Za-z])\.", "", s)
	    s = re.sub(r"[^\d.\-]", "", s.replace(",", ""))
	    if s in ("", "-", ".", "-."):
	        return None
	    try:
	        value = float(s)
	    except ValueError:
	        return None
	    return value if value != 0 else None


	def parse_date(raw):
	    """Parse a common Indian date format into ISO 'YYYY-MM-DD'.

	    Accepted shapes: '2026-04-01', '1 Apr 2026', '01-Apr-26',
	    '1 April 2026', '01/04/2026', '01-04-26', '1.4.2026'. Numeric dates
	    are read day-first. Two-digit years are taken to be 20xx.

	    Args:
	        raw: The date as printed (any object; it is stringified).

	    Returns:
	        The ISO date string, or None if the text does not match a known
	        shape or does not name a real calendar date (e.g. 31 Feb).
	    """
	    from datetime import date

	    if not raw:
	        return None
	    s = str(raw).strip()

	    def _iso(year, month, day):
	        # date() rejects impossible combinations such as month 13 or 31 Feb.
	        if year < 100:
	            year += 2000
	        try:
	            return date(year, month, day).isoformat()
	        except ValueError:
	            return None

	    # already ISO? (still validated, so '2026-13-45' is not passed through)
	    m = re.match(r"^(\d{4})-(\d{2})-(\d{2})$", s)
	    if m:
	        return _iso(int(m.group(1)), int(m.group(2)), int(m.group(3)))

	    months = {
	        "jan": 1, "feb": 2, "mar": 3, "apr": 4, "may": 5, "jun": 6,
	        "jul": 7, "aug": 8, "sep": 9, "oct": 10, "nov": 11, "dec": 12,
	    }
	    # 1 Apr 2026 / 01-Apr-26 / 1 April 2026
	    # The year is 2 or 4 digits; a 3-digit year is almost certainly a misread.
	    m = re.match(r"^(\d{1,2})[\s\-/]+([A-Za-z]{3,})[\s\-/]+(\d{4}|\d{2})$", s)
	    if m:
	        mon = months.get(m.group(2)[:3].lower())
	        if mon:
	            return _iso(int(m.group(3)), mon, int(m.group(1)))

	    # DD/MM/YYYY or DD-MM-YY (assume day-first, Indian convention)
	    m = re.match(r"^(\d{1,2})[/\-.](\d{1,2})[/\-.](\d{4}|\d{2})$", s)
	    if m:
	        return _iso(int(m.group(3)), int(m.group(2)), int(m.group(1)))
	    return None


	def _is_header_or_total(cells):
	joined = " ".join(c for c in cells if c).lower()
	if not joined.strip():
	return True
	skip = ("opening balance", "closing balance", "balance b/f", "carried forward",
	"total", "statement", "narration", "particulars", "date")
	# 'date' alone is the header row; only skip if it looks like a header line
	if joined.startswith("date") and "narration" in joined or "particulars" in joined:
	return True
	return any(k in joined for k in ("opening balance", "closing balance",
	"carried forward", "page total", "grand total"))


	# --------------------------------------------------------------------------- #
	# Digital PDF path (pdfplumber)
	# --------------------------------------------------------------------------- #
	def extract_from_pdf(path, max_pages=5):
	    """Read transactions out of a digital PDF via pdfplumber's table finder.

	    Only the first `max_pages` pages are inspected.

	    Returns:
	        (transactions, meta) where meta['path'] == 'text-layer'.

	    Raises:
	        ValueError: if fewer than 20 characters of text were found (the
	            file looks like a scan), or if text exists but no table row
	            yielded a transaction.
	    """
	    import pdfplumber

	    collected = []
	    char_count = 0
	    page_count = 0
	    with pdfplumber.open(path) as doc:
	        for page_count, pg in enumerate(doc.pages[:max_pages], start=1):
	            page_text = pg.extract_text() or ""
	            char_count += len(page_text)
	            tables = pg.extract_tables() or []
	            for tbl in tables:
	                _rows_from_table(tbl, collected)

	    if char_count < 20:
	        raise ValueError("No text layer detected -- this looks like a scan.")
	    if not collected:
	        raise ValueError("Text layer present but no transaction table found.")

	    return collected, {
	        "path": "text-layer",
	        "pages": page_count,
	        "engine": "pdfplumber",
	        "gpu_used": False,
	    }


	def _rows_from_table(table, out):
	    """Append one transaction dict to `out` per usable row of `table`.

	    Tables with fewer than two rows are ignored. A row is used only if it
	    is not a header/total line and its first cell parses as a date.
	    Columns are read positionally: date, narration, ref, debit, credit,
	    balance; missing trailing columns become None ('' for narration).
	    """
	    if not table or len(table) < 2:
	        return
	    for raw_row in table:
	        # Flatten wrapped cell text onto one line and treat None as empty.
	        cells = [(cell or "").replace("\n", " ").strip() for cell in raw_row]
	        if _is_header_or_total(cells):
	            continue
	        iso_date = parse_date(cells[0]) if cells else None
	        if not iso_date:
	            continue  # first cell is not a date, so not a transaction row
	        width = len(cells)
	        record = {
	            "date": iso_date,
	            "narration": cells[1] if width > 1 else "",
	            "ref_no": (cells[2] or None) if width > 2 else None,
	            "debit": parse_amount(cells[3]) if width > 3 else None,
	            "credit": parse_amount(cells[4]) if width > 4 else None,
	            "balance": parse_amount(cells[5]) if width > 5 else None,
	        }
	        out.append(record)


	# --------------------------------------------------------------------------- #
	# Vision path (MiniCPM-V) -- optional, GPU-backed
	# --------------------------------------------------------------------------- #
	# Module-level cache for the loaded vision model: _load_vision() fills it on
	# first use and _run_vision_impl() reads it. "id" records which checkpoint
	# (primary or fallback) actually loaded.
	_VISION = {"model": None, "processor": None, "id": None}


	def vision_available():
	    """Return True only if torch is actually usable for inference.

	    Being importable is not enough: transformers may import while torch is
	    present-but-disabled (e.g. wrong version). So this checks transformers'
	    own `is_torch_available()` and then runs a tiny tensor op as a smoke
	    test.

	    Returns:
	        True when both checks pass; False on any import or runtime error.
	    """
	    try:
	        import torch
	        from transformers.utils import is_torch_available

	        if not is_torch_available():
	            return False
	        # Smoke test on CPU: catches installs where `import torch` succeeds
	        # but the native library cannot execute anything.
	        return float(torch.ones(2).sum()) == 2.0
	    except Exception:
	        # Best-effort probe: any failure simply means "vision not available".
	        return False


	def _load_vision():
	    """Load the vision model and processor into the `_VISION` cache once.

	    Tries VISION_MODEL_ID first. If that fails for any reason the failure
	    is logged as a warning and VISION_FALLBACK_ID is loaded instead; an
	    error from the fallback propagates to the caller. Does nothing when a
	    model is already cached.
	    """
	    if _VISION["model"] is not None:
	        return
	    import logging

	    from transformers import AutoModelForImageTextToText, AutoProcessor

	    def _load(model_id):
	        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
	        model = AutoModelForImageTextToText.from_pretrained(
	            model_id, trust_remote_code=True, dtype="auto", device_map="auto")
	        return model, processor

	    try:
	        model_id = VISION_MODEL_ID
	        model, processor = _load(model_id)
	    except Exception as primary_err:
	        # Say why the primary failed instead of switching models silently;
	        # otherwise a typo in VISION_MODEL_ID is indistinguishable from a
	        # deliberate fallback.
	        logging.getLogger(__name__).warning(
	            "Could not load vision model %s (%s: %s); falling back to %s",
	            VISION_MODEL_ID, type(primary_err).__name__, primary_err,
	            VISION_FALLBACK_ID)
	        model_id = VISION_FALLBACK_ID
	        model, processor = _load(model_id)
	    _VISION.update(model=model, processor=processor, id=model_id)


	def _pdf_to_images(path, max_pages=5):
	    """Rasterize the first `max_pages` pages of a PDF at 150 dpi.

	    Used for scanned PDFs; returns the rendered pages as a list of images
	    (the `.original` attribute of pdfplumber's page image), in page order.
	    """
	    import pdfplumber

	    with pdfplumber.open(path) as doc:
	        return [
	            pg.to_image(resolution=150).original
	            for pg in doc.pages[:max_pages]
	        ]


	def extract_from_images(images):
	    """Extract transactions from page images with the vision model.

	    All pages are handed to `_run_vision` in a single call, so the GPU is
	    acquired once and the per-page loop runs inside that call.

	    Returns:
	        (transactions, meta) as produced by `_run_vision`.

	    Raises:
	        RuntimeError: if `vision_available()` reports that torch and
	            transformers are not usable on this machine.
	    """
	    if vision_available():
	        return _run_vision(images)
	    raise RuntimeError(
	        "Vision path needs torch + transformers. On this machine the "
	        "digital-PDF (pdfplumber) path is available; vision runs on the "
	        "ZeroGPU Space.")


	# Decorate with @spaces.GPU only when the `spaces` lib is present (HF Space).
	def _run_vision(images):
	    """Entry point for vision inference; delegates to `_run_vision_impl`."""
	    return _run_vision_impl(images)


	try:
	    import spaces

	    # first call also loads the ~8B model
	    _run_vision = spaces.GPU(duration=180)(_run_vision)
	except Exception:
	    # `spaces` is missing or unusable: keep the undecorated function above.
	    pass


	def _run_vision_impl(images):
	    """Run the cached vision model over each page image and collect rows.

	    Uses the standard transformers multimodal API (apply_chat_template +
	    generate), which works for MiniCPM-V-4.6 and the Qwen2.5-VL fallback
	    alike. The model is loaded on first use via `_load_vision`.

	    Args:
	        images: Iterable of page images, one per statement page.

	    Returns:
	        (transactions, meta): the normalized rows from every page, and a
	        dict with 'path' ('vision'), 'pages', 'engine' (the loaded model
	        id) and 'gpu_used'.
	    """
	    import torch
	    _load_vision()
	    model, processor = _VISION["model"], _VISION["processor"]
	    is_minicpm = "minicpm" in (_VISION["id"] or "").lower()

	    # These are the same for every page, so build them once. If the
	    # processor rejects the MiniCPM knobs, they are dropped for all
	    # remaining pages too.
	    tmpl_kwargs = dict(tokenize=True, add_generation_prompt=True,
	                       return_dict=True, return_tensors="pt")
	    gen_kwargs = dict(max_new_tokens=2048, do_sample=False)
	    if is_minicpm:
	        # MiniCPM-V-4.6-specific knobs (ignored by other models).
	        tmpl_kwargs.update(downsample_mode="16x", max_slice_nums=36)
	        gen_kwargs.update(downsample_mode="16x")

	    txns = []
	    for img in images:
	        messages = [{"role": "user", "content": [
	            {"type": "image", "image": img},
	            {"type": "text", "text": VISION_PROMPT},
	        ]}]
	        try:
	            inputs = processor.apply_chat_template(messages, **tmpl_kwargs)
	        except TypeError:
	            # model doesn't accept the MiniCPM knobs -> retry plain
	            for k in ("downsample_mode", "max_slice_nums"):
	                tmpl_kwargs.pop(k, None)
	                gen_kwargs.pop(k, None)
	            inputs = processor.apply_chat_template(messages, **tmpl_kwargs)
	        inputs = inputs.to(model.device)

	        with torch.no_grad():
	            # Both mappings must be unpacked: passed positionally, the batch
	            # would be taken as the raw `inputs` tensor and the kwargs dict
	            # as the generation config.
	            out = model.generate(**inputs, **gen_kwargs)
	        # generate() returns prompt + completion; keep only the new tokens.
	        trimmed = [o[len(i):] for i, o in zip(inputs["input_ids"], out)]
	        text = processor.batch_decode(
	            trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
	        txns.extend(_parse_model_json(text))

	    meta = {"path": "vision", "pages": len(images),
	            "engine": _VISION["id"], "gpu_used": bool(torch.cuda.is_available())}
	    return txns, meta


	def _parse_model_json(text):
	"""Pull a JSON array out of a model response and normalize the rows."""
	if not text:
	return []
	m = re.search(r"\[.*\]", text, re.DOTALL)
	if not m:
	return []
	try:
	rows = json.loads(m.group(0))
	except Exception:
	return []
	out = []
	for r in rows:
	if not isinstance(r, dict):
	continue
	out.append({
	"date": parse_date(r.get("date")),
	"narration": str(r.get("narration") or "").strip(),
	"ref_no": (str(r["ref_no"]).strip() if r.get("ref_no") else None),
	"debit": parse_amount(r.get("debit")),
	"credit": parse_amount(r.get("credit")),
	"balance": parse_amount(r.get("balance")),
	})
	return out


	# --------------------------------------------------------------------------- #
	# Top-level dispatcher
	# --------------------------------------------------------------------------- #
	def extract(path, max_pages=5):
	    """Auto-detect the right path for `path` (PDF or image) and extract.

	    PDFs go through the text-layer path first. If that raises ValueError
	    (no text layer, or no transaction table), the pages are rendered and
	    sent to the vision path. Image files go straight to the vision path.

	    Args:
	        path: File path; the type is decided by its extension.
	        max_pages: Maximum number of PDF pages to process.

	    Returns:
	        (transactions, meta).

	    Raises:
	        ValueError: for an unsupported file extension.
	        RuntimeError: from the vision path when it is unavailable.
	    """
	    ext = os.path.splitext(path)[1].lower()
	    if ext == ".pdf":
	        try:
	            return extract_from_pdf(path, max_pages=max_pages)
	        except ValueError:
	            # scanned PDF: render pages and run vision
	            images = _pdf_to_images(path, max_pages=max_pages)
	            return extract_from_images(images)
	    if ext in (".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tiff", ".tif"):
	        from PIL import Image
	        # Image.open is lazy and keeps the file open; convert() loads the
	        # pixels into a new image, so the handle can be closed right after.
	        with Image.open(path) as src:
	            img = src.convert("RGB")
	        return extract_from_images([img])
	    raise ValueError(f"Unsupported file type: {ext}")