| import os |
| import re |
| import json |
| import time |
| import tempfile |
| from typing import Any, Dict, List, Optional, Tuple |
|
|
| import gradio as gr |
| import requests |
| from pypdf import PdfReader |
|
|
# Statement dates are printed as "Mon/D/YYYY" (e.g. "Jan/5/2024"); month case is ignored.
DATE_RE = re.compile(
    r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)/\d{1,2}/\d{4}\b",
    re.IGNORECASE,
)

# Group 1: a number with optional thousands commas and decimals.
# Group 2: an optional trailing "Cr"/"Dr" marker (any case), None when absent.
NUM_RE = re.compile(r"(?<!\d)(\d[\d,]*(?:\.\d+)?)\s*(Cr|Dr)?\b", re.IGNORECASE)

# Lower-case three-letter month abbreviation -> month number (1-12).
MONTHS = {
    abbreviation: number
    for number, abbreviation in enumerate(
        ("jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"),
        start=1,
    )
}
|
|
|
|
def normalize_date(s: str) -> str:
    """Convert a ``Mon/D/YYYY`` statement date into ISO ``YYYY-MM-DD`` form.

    Args:
        s: Text expected to start (after stripping) with a three-letter month,
            a 1-2 digit day and a 4-digit year separated by ``/``.

    Returns:
        The ISO-formatted date. When the text does not have that shape, or the
        three letters are not a key of ``MONTHS``, the stripped input is
        returned unchanged.
    """
    text = s.strip()
    m = re.match(r"([A-Za-z]{3})/(\d{1,2})/(\d{4})", text)
    if not m:
        return text
    mon, day, year = m.groups()
    # The pattern accepts any three letters, so an unknown abbreviation must be
    # passed through like other unparseable input instead of raising KeyError.
    month_number = MONTHS.get(mon.lower())
    if month_number is None:
        return text
    return f"{int(year):04d}-{month_number:02d}-{int(day):02d}"
|
|
|
|
def clean_number(s: str) -> str:
    """Normalise a numeric string to a plain two-decimal representation.

    Commas and all whitespace are removed first. If the remaining text parses
    as a float it is rendered with exactly two decimals; otherwise the stripped
    text itself is returned. Falsy input yields an empty string.
    """
    compact = re.sub(r"\s+", "", str(s or "").replace(",", ""))
    try:
        parsed = float(compact)
    except Exception:
        # Not a number: hand back the comma/whitespace-free text as-is.
        return compact
    return f"{parsed:.2f}"
|
|
|
|
def clean_balance(num: str, sign: str = "") -> str:
    """Format a balance as ``"<amount> <Marker>"``.

    The amount goes through ``clean_number``; the marker is stripped and
    title-cased (so ``"cr"`` becomes ``"Cr"``). With an empty or missing marker
    only the amount is returned, without trailing whitespace.
    """
    amount_text = clean_number(num)
    marker = (sign or "").strip().title()
    return " ".join((amount_text, marker)).strip()
|
|
|
|
# Letterhead and column-header fragments to strip from extracted statement text.
# Order matters: the longer bank name must be tried before its own prefix.
_NOISE_PHRASES = (
    "Vidarbha Merchants Urban Co-op Bank Ltd.", "Vidarbha Merchants Urban Co-op Bank", "Account Statement",
    "TXN Date", "Value Date", "Narration", "ReferenceNo", "ChequeNo", "Refere nceNo", "Tr Mode",
    "Branch Code", "Branc h Code", "Debit", "Credit", "Balance", "Balan ce", "Balanc e",
)

# One compiled pattern per phrase, applied in the order above.
# * ``(?![a-z])`` keeps a phrase from being cut out of a longer word
#   ("Debited" must not become "ed"), while headers glued together such as
#   "DebitCreditBalance" are still removed piece by piece.
# * "Balance" is left alone directly after "Opening " so that a later
#   ``"Opening Balance" in record`` check can still recognise that row.
_NOISE_PATTERNS = tuple(
    re.compile(
        ("(?<!Opening )" if phrase == "Balance" else "") + re.escape(phrase) + "(?![a-z])"
    )
    for phrase in _NOISE_PHRASES
)


def remove_noise(text: str) -> str:
    """Collapse whitespace and strip known header/letterhead phrases.

    Args:
        text: Raw extracted statement text; ``None`` or empty is accepted.

    Returns:
        The text with every run of whitespace reduced to one space, the noise
        phrases replaced by a space, and leading/trailing whitespace removed.
        Matching is case-sensitive.
    """
    text = re.sub(r"\s+", " ", text or " ").strip()
    for pattern in _NOISE_PATTERNS:
        text = pattern.sub(" ", text)
    # Removal leaves gaps behind; collapse them again.
    return re.sub(r"\s+", " ", text).strip()
|
|
|
|
def extract_text_local(pdf_path: str) -> str:
    """Extract text from a PDF with pypdf, one page per line-separated chunk.

    Pages for which ``extract_text()`` returns nothing contribute an empty
    string, so the number of newline-joined chunks equals the page count.
    """
    document = PdfReader(pdf_path)
    return "\n".join(page.extract_text() or "" for page in document.pages)
|
|
|
|
def _mineru_field(payload: Any, top_keys: Tuple[str, ...], nested_keys: Tuple[str, ...]) -> Any:
    """Return the first truthy value under ``top_keys``, then under ``payload["data"]``.

    Tolerates payloads that are not dicts and a ``"data"`` entry that is missing,
    ``None`` or not a dict; in all those cases the lookup simply yields ``None``.
    """
    if not isinstance(payload, dict):
        return None
    for key in top_keys:
        if payload.get(key):
            return payload[key]
    nested = payload.get("data")
    if isinstance(nested, dict):
        for key in nested_keys:
            if nested.get(key):
                return nested[key]
    return None


def try_mineru_agent(pdf_path: str, base_url: str, timeout: int = 180) -> Tuple[Optional[str], str]:
    """Upload a PDF to a MinerU Agent style API and return the parsed text.

    The file is POSTed to ``<base>/api/v1/agent/parse/file``. If the response
    already carries text longer than 80 characters it is used directly;
    otherwise a poll URL (or one derived from a task id) is polled every two
    seconds until text arrives, the task reports failure, or ``timeout`` expires.

    Args:
        pdf_path: Path of the PDF to upload.
        base_url: API base URL; falls back to the ``MINERU_API_BASE``
            environment variable when empty.
        timeout: Maximum number of seconds to spend polling.

    Returns:
        ``(text, message)``. ``text`` is ``None`` whenever MinerU could not be
        used, and ``message`` explains which path was taken. No exception from
        the HTTP calls or from an unexpected response shape propagates.
    """
    base_url = (base_url or os.getenv("MINERU_API_BASE") or "").strip().rstrip("/")
    if not base_url:
        return None, "MinerU base URL not set. Used local fallback."

    upload_url = base_url
    if not upload_url.endswith("/api/v1/agent/parse/file"):
        upload_url = base_url + "/api/v1/agent/parse/file"

    try:
        with open(pdf_path, "rb") as f:
            files = {"file": (os.path.basename(pdf_path), f, "application/pdf")}
            r = requests.post(upload_url, files=files, timeout=60)
        r.raise_for_status()
        data = r.json() if "json" in r.headers.get("content-type", "").lower() else {"text": r.text}
    except Exception as e:
        return None, f"MinerU upload failed: {e}. Used local fallback."

    direct = find_text_in_obj(data)
    if direct and len(direct) > 80:
        return direct, "MinerU direct result used."

    # These lookups run outside any try block, so they must not assume that the
    # payload (or its "data" entry) is a dict: a JSON array or `"data": null`
    # would otherwise raise AttributeError and defeat the local fallback.
    task_id = _mineru_field(data, ("task_id", "taskId", "id"), ("task_id", "taskId"))
    poll_url = _mineru_field(data, ("poll_url", "result_url"), ("poll_url", "result_url"))
    if not poll_url and task_id:
        # base_url may itself be the full upload endpoint; cut that suffix off first.
        poll_url = base_url.rsplit("/api/v1/agent/parse/file", 1)[0].rstrip("/") + f"/api/v1/agent/task/{task_id}"
    if not poll_url:
        return None, "MinerU did not return markdown or task id. Used local fallback."

    start = time.time()
    last = None
    while time.time() - start < timeout:
        try:
            rr = requests.get(poll_url, timeout=30)
            rr.raise_for_status()
            last = rr.json() if "json" in rr.headers.get("content-type", "").lower() else {"text": rr.text}
            status = str(_mineru_field(last, ("status",), ("status",)) or "").lower()
            found = find_text_in_obj(last)
            # Accept text once the task looks finished (or reports no status at
            # all), or as soon as the text is long enough to be a real document.
            if found and (status in ("done", "completed", "success", "") or len(found) > 80):
                return found, "MinerU async result used."
            if status in ("failed", "error"):
                return None, f"MinerU task failed: {last}. Used local fallback."
        except Exception as e:
            return None, f"MinerU polling failed: {e}. Used local fallback."
        time.sleep(2)
    return None, f"MinerU timeout. Last={last}. Used local fallback."
|
|
|
|
def find_text_in_obj(obj: Any) -> Optional[str]:
    """Search a decoded API response for the document text it carries.

    Rules, applied recursively:
      * a string is returned as-is;
      * in a dict, a string longer than 20 characters (after stripping) under
        one of the keys ``markdown``/``md``/``text``/``content``/``result``/
        ``html`` (case-insensitive) wins; otherwise nested dicts and lists are
        searched in order;
      * in a list, the texts found in its items are joined with newlines.

    Returns:
        The text found, or ``None`` when nothing qualifies.
    """
    keys = {"markdown", "md", "text", "content", "result", "html"}
    if isinstance(obj, str):
        return obj
    if isinstance(obj, dict):
        for k, v in obj.items():
            if isinstance(k, str) and k.lower() in keys and isinstance(v, str) and len(v.strip()) > 20:
                return v
        for v in obj.values():
            # Only descend into containers. A bare string under a non-text key
            # (status, ids, messages) is metadata, and returning it would hide
            # real text nested further along, e.g.
            # {"status": "done", "data": {"markdown": ...}} must not yield "done".
            if isinstance(v, (dict, list)):
                res = find_text_in_obj(v)
                if res:
                    return res
        return None
    if isinstance(obj, list):
        parts = [find_text_in_obj(x) for x in obj]
        parts = [p for p in parts if p]
        if parts:
            return "\n".join(parts)
    return None
|
|
|
|
def split_records(text: str) -> List[str]:
    """Cut noise-stripped statement text into one chunk per date occurrence.

    Every ``DATE_RE`` hit starts a candidate chunk that runs up to the next hit
    (or the end of the text). A candidate is dropped when the 20 characters
    before its date contain "to date" or "from date" (case-insensitive), or
    when the chunk itself contains "Opening Balance".
    """
    cleaned = remove_noise(text)
    hits = list(DATE_RE.finditer(cleaned))
    # A chunk always ends where the following date begins, even if that
    # following date is itself filtered out below.
    stops = [hit.start() for hit in hits[1:]] + [len(cleaned)]

    chunks: List[str] = []
    for hit, stop in zip(hits, stops):
        begin = hit.start()
        lead_in = cleaned[max(0, begin - 20):begin].lower()
        if "to date" in lead_in or "from date" in lead_in:
            continue
        chunk = cleaned[begin:stop].strip()
        if "Opening Balance" not in chunk:
            chunks.append(chunk)
    return chunks
|
|
|
|
def parse_vidarbha_record(rec: str) -> Optional[Dict[str, str]]:
    """Turn one record chunk into a transaction dict, or ``None`` if it is not one.

    The last two numbers carrying a Cr/Dr marker are taken as the transaction
    amount (second to last) and the closing balance (last). Everything before
    the amount, minus the dates, becomes the narration.

    Returns:
        A dict with the keys ``date``, ``narration``, ``voucher_type``,
        ``amount`` and ``closing_balance`` (in that order), or ``None`` when the
        chunk has no date, fewer than two signed numbers, an empty narration,
        or a narration that starts with one of the known header labels.
    """
    found_dates = DATE_RE.findall(rec)
    if not found_dates:
        return None
    iso_date = normalize_date(found_dates[0])

    # Blank out the first occurrence of up to two dates, then tidy whitespace.
    remainder = rec
    for date_text in found_dates[:2]:
        remainder = remainder.replace(date_text, " ", 1)
    remainder = re.sub(r"\s+", " ", remainder).strip()

    # Only numbers that carry a Cr/Dr marker are of interest.
    signed_hits = [hit for hit in NUM_RE.finditer(remainder) if hit.group(2)]
    if len(signed_hits) < 2:
        return None
    amount_hit, balance_hit = signed_hits[-2], signed_hits[-1]

    description = remainder[:amount_hit.start()].strip()
    # Drop a trailing "<SC: n | number> <By Clg | To Trf> <3-6 digits>" tail;
    # presumably the reference / transfer-mode / branch-code columns - TODO confirm.
    description = re.sub(
        r"\b(?:SC:\s*\d+|\d{1,6})\s+(?:By Clg|To Trf)\s+\d{3,6}\s*$",
        "",
        description,
        flags=re.I,
    ).strip()
    description = re.sub(r"\s+", " ", description).strip(" -")

    header_labels = ("name :-", "branch name", "account number", "to date", "from date")
    if not description:
        return None
    if description.lower().startswith(header_labels):
        return None

    is_credit = amount_hit.group(2).lower() == "cr"
    return {
        "date": iso_date,
        "narration": description,
        "voucher_type": "Receipt" if is_credit else "Payment",
        "amount": clean_number(amount_hit.group(1)),
        "closing_balance": clean_balance(balance_hit.group(1), balance_hit.group(2)),
    }
|
|
|
|
def extract_transactions_from_text(text: str) -> List[Dict[str, str]]:
    """Parse every record chunk in ``text`` and return the distinct transactions.

    Chunks that do not parse are skipped. Two transactions count as duplicates
    when all of their field values match; only the first is kept, and the
    original order of first appearance is preserved.
    """
    unique: Dict[Tuple[str, ...], Dict[str, str]] = {}
    for chunk in split_records(text):
        parsed = parse_vidarbha_record(chunk)
        if parsed:
            # setdefault keeps the first occurrence; dicts preserve insertion order.
            unique.setdefault(tuple(parsed.values()), parsed)
    return list(unique.values())
|
|
|
|
def process_pdf(pdf_file, mineru_base_url: str, use_mineru: bool):
    """Gradio handler: extract transactions from an uploaded statement PDF.

    Args:
        pdf_file: The uploaded file; either an object with a ``name`` attribute
            holding its path, or something whose ``str()`` is the path.
        mineru_base_url: Base URL handed to ``try_mineru_agent``.
        use_mineru: When true, MinerU is tried first; local pypdf extraction
            is used when it is off or yields no text.

    Returns:
        ``(result, log)``. ``result`` is ``{"success": True, "count": ...,
        "transactions": [...]}`` on success or ``{"success": False,
        "error": ...}`` when nothing was uploaded or the PDF could not be read.
        ``log`` holds the status messages plus, on success, a preview of the
        first 2500 characters of extracted text.
    """
    if not pdf_file:
        return {"success": False, "error": "Upload PDF first"}, ""
    path = pdf_file.name if hasattr(pdf_file, "name") else str(pdf_file)
    status = []
    text = None
    if use_mineru:
        text, msg = try_mineru_agent(path, mineru_base_url)
        status.append(msg)
    if not text:
        # This is the UI boundary: a corrupt or encrypted PDF should produce the
        # same structured error as a missing upload, not a raw traceback.
        try:
            text = extract_text_local(path)
        except Exception as e:
            status.append(f"Local pypdf text extraction failed: {e}")
            return {"success": False, "error": f"Could not read PDF: {e}"}, "\n".join(status)
        status.append("Local pypdf text extraction used.")

    txs = extract_transactions_from_text(text)
    result = {
        "success": True,
        "count": len(txs),
        "transactions": txs,
    }
    return result, "\n".join(status) + "\n\n--- extracted text preview ---\n" + text[:2500]
|
|
|
|
# Gradio front end: one upload row, one options row, then the action button
# and the two output panes. Components are created in display order.
with gr.Blocks(title="MinerU Bank JSON Extractor") as demo:
    gr.Markdown("# MinerU Bank Statement → Transaction JSON")
    gr.Markdown("Output only: `date`, `narration`, `voucher_type`, `amount`, `closing_balance`.")

    with gr.Row():
        pdf = gr.File(label="Upload bank statement PDF", file_types=[".pdf"])

    with gr.Row():
        use_mineru = gr.Checkbox(value=True, label="Use MinerU Agent API first")
        # Pre-filled from the environment so a deployment can set the URL once.
        mineru_url = gr.Textbox(
            label="MinerU API Base URL",
            placeholder="https://your-mineru-host.com",
            value=os.getenv("MINERU_API_BASE", ""),
        )

    btn = gr.Button("Extract JSON", variant="primary")
    output = gr.JSON(label="Transactions JSON")
    logs = gr.Textbox(label="Logs / Text preview", lines=14)

    # process_pdf returns (result dict, log text), matching the two outputs.
    btn.click(
        process_pdf,
        inputs=[pdf, mineru_url, use_mineru],
        outputs=[output, logs],
    )


# Start the web server only when run as a script, not on import.
if __name__ == "__main__":
    demo.launch()
|
|