import os
import re
import json
import time
import tempfile
from typing import Any, Dict, List, Optional, Tuple

import gradio as gr
import requests
from pypdf import PdfReader

# Statement dates are written "Mon/D/YYYY" with a three-letter month name.
DATE_RE = re.compile(r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)/\d{1,2}/\d{4}\b", re.I)

# NOTE(review): the original pattern was lost when this file was extracted
# (everything after "(?" was stripped). This is a reconstruction based only on
# how callers use it: group(1) is the numeric text, group(2) is an optional
# "Cr"/"Dr" marker. Confirm against the original source before relying on it.
NUM_RE = re.compile(r"(?<![\w/])(\d[\d,]*\.\d{2})\s*(Cr|Dr)?\b", re.I)

# NOTE(review): reconstructed; normalize_date() indexes this table with a
# lower-cased three-letter month and formats the value as a two-digit integer.
MONTHS = {
    "jan": 1,
    "feb": 2,
    "mar": 3,
    "apr": 4,
    "may": 5,
    "jun": 6,
    "jul": 7,
    "aug": 8,
    "sep": 9,
    "oct": 10,
    "nov": 11,
    "dec": 12,
}

# Header/column labels stripped from extracted text. Order matters: the longer
# bank name must be removed before its shorter prefix. Variants with a stray
# space ("Balan ce") are listed explicitly alongside the clean spelling.
_NOISE_PHRASES = (
    "Vidarbha Merchants Urban Co-op Bank Ltd.",
    "Vidarbha Merchants Urban Co-op Bank",
    "Account Statement",
    "TXN Date",
    "Value Date",
    "Narration",
    "ReferenceNo",
    "ChequeNo",
    "Refere nceNo",
    "Tr Mode",
    "Branch Code",
    "Branc h Code",
    "Debit",
    "Credit",
    "Balance",
    "Balan ce",
    "Balanc e",
)


def normalize_date(s: str) -> str:
    """Convert a "Mon/D/YYYY" date to ISO "YYYY-MM-DD".

    Input that does not start with that shape, or whose month abbreviation is
    not in MONTHS, is returned stripped but otherwise unchanged.
    """
    raw = s.strip()
    m = re.match(r"([A-Za-z]{3})/(\d{1,2})/(\d{4})", raw)
    if not m:
        return raw
    mon, day, year = m.groups()
    month = MONTHS.get(mon.lower())
    if month is None:
        # The regex accepts any three letters; do not crash on e.g. "Xyz/1/2024".
        return raw
    return f"{int(year):04d}-{month:02d}-{int(day):02d}"


def clean_number(s: str) -> str:
    """Format a numeric string with two decimals, dropping commas and whitespace.

    Text that cannot be parsed as a float is returned (comma/space-stripped)
    as-is; falsy input yields "".
    """
    s = str(s or "").replace(",", "")
    s = re.sub(r"\s+", "", s)
    try:
        return f"{float(s):.2f}"
    except ValueError:
        return s


def clean_balance(num: str, sign: str = "") -> str:
    """Return "<amount> <Sign>" where Sign is the title-cased marker (e.g. "Cr")."""
    val = clean_number(num)
    sign = (sign or "").strip().title()
    return f"{val} {sign}".strip()


def remove_noise(text: str) -> str:
    """Collapse whitespace and blank out known header/column labels.

    Matching is plain, case-sensitive substring replacement, so these words are
    also removed when they occur inside a narration.
    """
    text = re.sub(r"\s+", " ", text or " ").strip()
    for phrase in _NOISE_PHRASES:
        text = text.replace(phrase, " ")
    return re.sub(r"\s+", " ", text).strip()


def extract_text_local(pdf_path: str) -> str:
    """Extract text from every page with pypdf and join pages with newlines."""
    reader = PdfReader(pdf_path)
    return "\n".join(page.extract_text() or "" for page in reader.pages)


def _first_value(mapping: Any, *names: str) -> Any:
    """Return the first truthy value stored under any of *names*, else None.

    Tolerates non-dict input (JSON arrays, strings, None) by returning None.
    """
    if not isinstance(mapping, dict):
        return None
    for name in names:
        value = mapping.get(name)
        if value:
            return value
    return None


def try_mineru_agent(pdf_path: str, base_url: str, timeout: int = 180) -> Tuple[Optional[str], str]:
    """Call a MinerU Agent style API and return ``(text, status_message)``.

    The base URL comes from *base_url* or the MINERU_API_BASE environment
    variable. The PDF is uploaded to ``/api/v1/agent/parse/file``; if the
    response already contains text it is returned, otherwise a poll URL or
    task id from the response is polled every 2 seconds for up to *timeout*
    seconds. On any failure ``text`` is None and the message says why, so the
    caller can fall back to local extraction.
    """
    base_url = (base_url or os.getenv("MINERU_API_BASE") or "").strip().rstrip("/")
    if not base_url:
        return None, "MinerU base URL not set. Used local fallback."

    upload_url = base_url
    if not upload_url.endswith("/api/v1/agent/parse/file"):
        upload_url = base_url + "/api/v1/agent/parse/file"

    # Broad catch is deliberate: every upload problem means "use the fallback".
    try:
        with open(pdf_path, "rb") as f:
            files = {"file": (os.path.basename(pdf_path), f, "application/pdf")}
            r = requests.post(upload_url, files=files, timeout=60)
        r.raise_for_status()
        if "json" in r.headers.get("content-type", "").lower():
            data = r.json()
        else:
            data = {"text": r.text}
    except Exception as e:
        return None, f"MinerU upload failed: {e}. Used local fallback."

    # Direct markdown/text response
    direct = find_text_in_obj(data)
    if direct and len(direct) > 80:
        return direct, "MinerU direct result used."

    # The payload may be a JSON array, and "data" may be null or a non-dict;
    # _first_value() treats those as "field absent" instead of raising.
    nested = data.get("data") if isinstance(data, dict) else None
    task_id = _first_value(data, "task_id", "taskId", "id") or _first_value(nested, "task_id", "taskId")
    poll_url = _first_value(data, "poll_url", "result_url") or _first_value(nested, "poll_url", "result_url")
    if not poll_url and task_id:
        root = base_url.rsplit("/api/v1/agent/parse/file", 1)[0].rstrip("/")
        poll_url = root + f"/api/v1/agent/task/{task_id}"
    if not poll_url:
        return None, "MinerU did not return markdown or task id. Used local fallback."

    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = time.monotonic() + timeout
    last = None
    while time.monotonic() < deadline:
        try:
            rr = requests.get(poll_url, timeout=30)
            rr.raise_for_status()
            if "json" in rr.headers.get("content-type", "").lower():
                last = rr.json()
            else:
                last = {"text": rr.text}
            last_nested = last.get("data") if isinstance(last, dict) else None
            status = str(_first_value(last, "status") or _first_value(last_nested, "status") or "").lower()
            found = find_text_in_obj(last)
            # A missing status ("") is treated like a finished task.
            if found and (status in ("done", "completed", "success", "") or len(found) > 80):
                return found, "MinerU async result used."
            if status in ("failed", "error"):
                return None, f"MinerU task failed: {last}. Used local fallback."
        except Exception as e:
            return None, f"MinerU polling failed: {e}. Used local fallback."
        time.sleep(2)
    return None, f"MinerU timeout. Last={last}. Used local fallback."
def find_text_in_obj(obj: Any) -> Optional[str]:
    """Locate extracted document text inside an API response of unknown shape.

    A string is returned as-is. In a dict, a string longer than 20 characters
    stored under a known text key (markdown, md, text, content, result, html)
    wins; otherwise nested dicts and lists are searched. List results are
    joined with newlines. Returns None when nothing is found.
    """
    keys = {"markdown", "md", "text", "content", "result", "html"}
    if isinstance(obj, str):
        return obj
    if isinstance(obj, dict):
        for k, v in obj.items():
            if isinstance(k, str) and k.lower() in keys and isinstance(v, str) and len(v.strip()) > 20:
                return v
        # Only descend into containers. Bare string values under other keys
        # ("status": "done", ids, messages) are metadata; returning them would
        # shadow real text nested deeper in the payload.
        for v in obj.values():
            if isinstance(v, (dict, list)):
                res = find_text_in_obj(v)
                if res:
                    return res
        return None
    if isinstance(obj, list):
        parts = [p for p in (find_text_in_obj(x) for x in obj) if p]
        if parts:
            return "\n".join(parts)
    return None


# remove_noise() deletes the word "Balance", so the phrase "Opening Balance"
# must be swapped for a marker before noise removal or it can never be seen.
_OPENING_BALANCE_RE = re.compile(r"Opening\s+Balance")
_OPENING_MARK = "\x00OPENING\x00"


def _is_opening_balance_row(rec: str) -> bool:
    """Return True when *rec*, after its leading date(s), starts with the opening-balance marker."""
    rest = rec
    for _ in range(2):
        dm = DATE_RE.match(rest)
        if not dm:
            break
        rest = rest[dm.end():].lstrip(" -:")
    return rest.startswith(_OPENING_MARK)


def split_records(text: str) -> List[str]:
    """Split statement text into one string per dated row.

    A record starts at a date and runs to the next record-starting (or
    label) date. Dates that follow a "from date"/"to date" label are skipped.
    A date that immediately follows a record's first date (only whitespace in
    between) is kept in the same record, so the first date of a
    "date date ..." pair stays attached to its row. Rows that begin with
    "Opening Balance" are dropped.
    """
    text = _OPENING_BALANCE_RE.sub(_OPENING_MARK, text or "")
    text = remove_noise(text)
    matches = list(DATE_RE.finditer(text))

    # Classify every date: "label" (From/To header), "start" (opens a record)
    # or "value" (second date of an adjacent pair, belongs to the open record).
    kinds: List[str] = []
    for i, m in enumerate(matches):
        # Skip account From/To date labels by checking context before date
        before = text[max(0, m.start() - 20):m.start()].lower()
        if "to date" in before or "from date" in before:
            kinds.append("label")
        elif i > 0 and kinds[i - 1] == "start" and not text[matches[i - 1].end():m.start()].strip():
            kinds.append("value")
        else:
            kinds.append("start")

    records = []
    for i, m in enumerate(matches):
        if kinds[i] != "start":
            continue
        nxt = i + 1
        if nxt < len(matches) and kinds[nxt] == "value":
            nxt += 1
        end = matches[nxt].start() if nxt < len(matches) else len(text)
        rec = text[m.start():end].strip()
        if _is_opening_balance_row(rec):
            continue
        if _OPENING_MARK in rec:
            # Phrase appeared mid-record: restore what noise removal would
            # have left behind ("Opening" with "Balance" blanked out).
            rec = re.sub(r"\s+", " ", rec.replace(_OPENING_MARK, "Opening ")).strip()
        records.append(rec)
    return records


def parse_vidarbha_record(rec: str) -> Optional[Dict[str, str]]:
    """Parse one record string into a transaction dict, or None if it is not one.

    The first date becomes ``date``. Of the numbers carrying a Cr/Dr marker,
    the last is the closing balance and the one before it is the transaction
    amount; a "Cr" amount is a "Receipt", anything else a "Payment". The text
    before the amount becomes ``narration``. Records with fewer than two
    marked numbers, or whose narration is empty or an account-header line,
    yield None.
    """
    dates = DATE_RE.findall(rec)
    if not dates:
        return None
    txn_date = normalize_date(dates[0])

    # remove first two dates (txn + value date) when present
    body = rec
    for d in dates[:2]:
        body = body.replace(d, " ", 1)
    body = re.sub(r"\s+", " ", body).strip()

    # Find amounts with Cr/Dr markers. Last is balance, previous is txn amount.
    signed = [m for m in NUM_RE.finditer(body) if m.group(2)]
    if len(signed) < 2:
        return None
    amt_match, bal_match = signed[-2], signed[-1]

    voucher_type = "Receipt" if amt_match.group(2).lower() == "cr" else "Payment"
    amount = clean_number(amt_match.group(1))
    closing_balance = clean_balance(bal_match.group(1), bal_match.group(2))

    narration = body[:amt_match.start()].strip()
    # remove common ref/mode/code fragments at end
    narration = re.sub(r"\b(?:SC:\s*\d+|\d{1,6})\s+(?:By Clg|To Trf)\s+\d{3,6}\s*$", "", narration, flags=re.I).strip()
    narration = re.sub(r"\s+", " ", narration).strip(" -")
    if not narration or narration.lower().startswith(("name :-", "branch name", "account number", "to date", "from date")):
        return None

    return {
        "date": txn_date,
        "narration": narration,
        "voucher_type": voucher_type,
        "amount": amount,
        "closing_balance": closing_balance,
    }


def extract_transactions_from_text(text: str) -> List[Dict[str, str]]:
    """Parse all records in *text*, dropping exact duplicates and keeping first-seen order."""
    out = []
    seen = set()
    for rec in split_records(text):
        tx = parse_vidarbha_record(rec)
        if not tx:
            continue
        key = tuple(tx.values())
        if key not in seen:
            seen.add(key)
            out.append(tx)
    return out


def process_pdf(pdf_file, mineru_base_url: str, use_mineru: bool):
    """Gradio handler: extract transactions from an uploaded PDF.

    Returns ``(result_dict, log_text)``. MinerU is tried first when
    *use_mineru* is set; local pypdf extraction is the fallback. A missing
    upload or an unreadable PDF produces ``{"success": False, "error": ...}``.
    """
    if not pdf_file:
        return {"success": False, "error": "Upload PDF first"}, ""
    # The upload may arrive as a file-like object with .name or as a plain path.
    path = pdf_file.name if hasattr(pdf_file, "name") else str(pdf_file)

    status = []
    text = None
    if use_mineru:
        text, msg = try_mineru_agent(path, mineru_base_url)
        status.append(msg)
    if not text:
        # UI boundary: report a broken/encrypted PDF instead of raising.
        try:
            text = extract_text_local(path)
        except Exception as e:
            status.append(f"Local pypdf text extraction failed: {e}")
            return {"success": False, "error": f"Could not read PDF: {e}"}, "\n".join(status)
        status.append("Local pypdf text extraction used.")

    txs = extract_transactions_from_text(text)
    result = {
        "success": True,
        "count": len(txs),
        "transactions": txs,
    }
    return result, "\n".join(status) + "\n\n--- extracted text preview ---\n" + text[:2500]


# NOTE(review): the original indentation was lost; the nesting below (file
# picker in one row, checkbox + URL box in a second row, remaining widgets at
# Blocks level) is the most natural reading. Confirm the intended layout.
with gr.Blocks(title="MinerU Bank JSON Extractor") as demo:
    gr.Markdown("# MinerU Bank Statement → Transaction JSON")
    gr.Markdown("Output only: `date`, `narration`, `voucher_type`, `amount`, `closing_balance`.")
    with gr.Row():
        pdf = gr.File(label="Upload bank statement PDF", file_types=[".pdf"])
    with gr.Row():
        use_mineru = gr.Checkbox(value=True, label="Use MinerU Agent API first")
        mineru_url = gr.Textbox(
            label="MinerU API Base URL",
            placeholder="https://your-mineru-host.com",
            value=os.getenv("MINERU_API_BASE", ""),
        )
    btn = gr.Button("Extract JSON", variant="primary")
    output = gr.JSON(label="Transactions JSON")
    logs = gr.Textbox(label="Logs / Text preview", lines=14)
    btn.click(process_pdf, inputs=[pdf, mineru_url, use_mineru], outputs=[output, logs])


if __name__ == "__main__":
    demo.launch()