Spaces:

deveos
/

mini

Sleeping

App Files Community

deveos committed on May 28

Commit

0be9bd5

verified ·

1 Parent(s): 50a756d

Upload 4 files

Browse files

Files changed (3) hide show

README.md +15 -3
app.py +218 -164
requirements.txt +3 -4

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-title: Bank Statement JSON Extractor
 emoji: 📄
 colorFrom: blue
 colorTo: purple
@@ -10,6 +10,18 @@ app_file: app.py
 pinned: false
 ---
-# Bank Statement JSON Extractor
-Fast PDF bank statement to JSON transaction extractor.

 ---
+title: MinerU Bank Statement JSON Extractor
 emoji: 📄
 colorFrom: blue
 colorTo: purple
 pinned: false
 ---
+# MinerU Bank Statement JSON Extractor
+Upload bank statement PDF and extract clean transaction JSON.
+Output fields:
+```json
+{
+  "date": "",
+  "narration": "",
+  "voucher_type": "Payment/Receipt",
+  "amount": "",
+  "closing_balance": ""
+}
+```

app.py CHANGED Viewed

@@ -1,187 +1,241 @@
 import re
 import json
 import tempfile
-from datetime import datetime
-from typing import Any, List, Dict
 import gradio as gr
-import pdfplumber
-MONTHS = {
-    "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04",
-    "May": "05", "Jun": "06", "Jul": "07", "Aug": "08",
-    "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12",
-}
-def clean_cell(value: Any) -> str:
-    if value is None:
-        return ""
-    return re.sub(r"\s+", " ", str(value).replace("\n", " ")).strip()
-def clean_date(value: str) -> str:
-    value = clean_cell(value)
-    # Fix Apr/01/202 4 -> Apr/01/2024
-    value = re.sub(r"(\d{3})\s+(\d)$", r"\1\2", value)
-    m = re.search(r"\b([A-Za-z]{3})/(\d{1,2})/(\d{4})\b", value)
     if not m:
-        return ""
     mon, day, year = m.groups()
-    mon = mon[:3].title()
-    if mon not in MONTHS:
-        return ""
-    return f"{year}-{MONTHS[mon]}-{int(day):02d}"
-def clean_amount(value: str) -> str:
-    """
-    Fix:
-      650000 0.0 Dr -> 6500000.00
-      648820 2.04 Dr -> 6488202.04
-      11962.9 6 Cr -> 11962.96
-      45.0 Dr -> 45.00
-    """
-    value = clean_cell(value)
-    value = re.sub(r"\b(Cr|Dr)\b", "", value, flags=re.I).strip()
-    value = re.sub(r"[^0-9.\s,-]", "", value).replace(",", "")
-    value = re.sub(r"\s+", "", value)
-    if not value:
-        return ""
-    # keep first numeric-looking value after removing spaces
-    m = re.search(r"-?\d+(?:\.\d+)?", value)
-    if not m:
-        return ""
-    num = m.group(0)
     try:
-        return f"{float(num):.2f}"
     except Exception:
-        return num
-def clean_balance(value: str) -> str:
-    raw = clean_cell(value)
-    suffix = ""
-    if re.search(r"\bDr\b", raw, re.I):
-        suffix = " Dr"
-    elif re.search(r"\bCr\b", raw, re.I):
-        suffix = " Cr"
-    amt = clean_amount(raw)
-    return (amt + suffix).strip() if amt else ""
-def clean_narration(value: str) -> str:
-    text = clean_cell(value)
-    # Fix broken account holder name in this bank format
-    text = text.replace("SHARDAP RASAD", "SHARDAPRASAD")
-    text = text.replace("PARASHA R", "PARASHAR")
-    text = text.replace("Maintainanc e", "Maintenance")
-    text = re.sub(r"\b(\d+)\s+(\d+)\b", r"\1\2", text)
-    text = re.sub(r"\s*/\s*", "/", text)
-    return text.strip()
-def is_header_row(row: List[str]) -> bool:
-    joined = " ".join(row).lower()
-    return "txn date" in joined and "narration" in joined
-def parse_pdfplumber_tables(pdf_path: str) -> List[Dict[str, str]]:
-    txns: List[Dict[str, str]] = []
-    with pdfplumber.open(pdf_path) as pdf:
-        for page in pdf.pages:
-            tables = page.extract_tables({
-                "vertical_strategy": "lines",
-                "horizontal_strategy": "lines",
-                "intersection_tolerance": 5,
-                "snap_tolerance": 3,
-                "join_tolerance": 3,
-            }) or page.extract_tables()
-            for table in tables or []:
-                if not table:
-                    continue
-                for row in table:
-                    cells = [clean_cell(c) for c in row]
-                    if not cells or is_header_row(cells):
-                        continue
-                    # Expected Vidarbha columns:
-                    # 0 TXN Date, 1 Value Date, 2 Narration, 3 Ref, 4 Mode, 5 Branch, 6 Debit, 7 Credit, 8 Balance
-                    if len(cells) < 9:
-                        continue
-                    date = clean_date(cells[0])
-                    if not date:
-                        continue
-                    narration = clean_narration(cells[2])
-                    if not narration or "opening balance" in narration.lower():
-                        continue
-                    debit = clean_amount(cells[6])
-                    credit = clean_amount(cells[7])
-                    balance = clean_balance(cells[8])
-                    if debit:
-                        voucher_type = "Payment"
-                        amount = debit
-                    elif credit:
-                        voucher_type = "Receipt"
-                        amount = credit
-                    else:
-                        continue
-                    txns.append({
-                        "date": date,
-                        "narration": narration,
-                        "voucher_type": voucher_type,
-                        "amount": amount,
-                        "closing_balance": balance,
-                    })
-    return txns
-def extract_transactions(pdf_file):
-    if pdf_file is None:
-        return "[]", {"success": False, "error": "Upload PDF first"}
-    try:
-        pdf_path = pdf_file.name if hasattr(pdf_file, "name") else str(pdf_file)
-        transactions = parse_pdfplumber_tables(pdf_path)
-        result = {
-            "success": True,
-            "count": len(transactions),
-            "transactions": transactions,
-        }
-        return json.dumps(transactions, ensure_ascii=False, indent=2), result
-    except Exception as e:
-        result = {
-            "success": False,
-            "error": str(e),
-            "transactions": [],
-        }
-        return json.dumps([], indent=2), result
-with gr.Blocks(title="Bank Statement JSON Extractor") as demo:
-    gr.Markdown("# Bank Statement PDF → Transaction JSON")
-    gr.Markdown("Fast test app. Output fields: `date`, `narration`, `voucher_type`, `amount`, `closing_balance`.")
-    with gr.Row():
-        pdf = gr.File(label="Upload Bank Statement PDF", file_types=[".pdf"])
-        btn = gr.Button("Extract Transactions", variant="primary")
-    json_out = gr.Code(label="Transactions JSON", language="json", lines=20)
-    status = gr.JSON(label="Status")
-    btn.click(extract_transactions, inputs=[pdf], outputs=[json_out, status])
 if __name__ == "__main__":
     demo.launch()

+import os
 import re
 import json
+import time
 import tempfile
+from typing import Any, Dict, List, Optional, Tuple
 import gradio as gr
+import requests
+from pypdf import PdfReader
+DATE_RE = re.compile(r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)/\d{1,2}/\d{4}\b", re.I)
+NUM_RE = re.compile(r"(?<!\d)(\d[\d,]*(?:\.\d+)?)\s*(Cr|Dr)?\b", re.I)
+MONTHS = {"jan":1,"feb":2,"mar":3,"apr":4,"may":5,"jun":6,"jul":7,"aug":8,"sep":9,"oct":10,"nov":11,"dec":12}
+def normalize_date(s: str) -> str:
+    m = re.match(r"([A-Za-z]{3})/(\d{1,2})/(\d{4})", s.strip())
     if not m:
+        return s.strip()
     mon, day, year = m.groups()
+    return f"{int(year):04d}-{MONTHS[mon.lower()]:02d}-{int(day):02d}"
+def clean_number(s: str) -> str:
+    s = str(s or "").replace(",", "")
+    s = re.sub(r"\s+", "", s)
     try:
+        return f"{float(s):.2f}"
     except Exception:
+        return s
+def clean_balance(num: str, sign: str = "") -> str:
+    val = clean_number(num)
+    sign = (sign or "").strip().title()
+    return f"{val} {sign}".strip()
+def remove_noise(text: str) -> str:
+    text = re.sub(r"\s+", " ", text or " ").strip()
+    noise = [
+        "Vidarbha Merchants Urban Co-op Bank Ltd.", "Vidarbha Merchants Urban Co-op Bank", "Account Statement",
+        "TXN Date", "Value Date", "Narration", "ReferenceNo", "ChequeNo", "Refere nceNo", "Tr Mode",
+        "Branch Code", "Branc h Code", "Debit", "Credit", "Balance", "Balan ce", "Balanc e",
+    ]
+    for n in noise:
+        text = text.replace(n, " ")
+    text = re.sub(r"\s+", " ", text).strip()
+    return text
+def extract_text_local(pdf_path: str) -> str:
+    reader = PdfReader(pdf_path)
+    pages = []
+    for p in reader.pages:
+        pages.append(p.extract_text() or "")
+    return "\n".join(pages)
+def try_mineru_agent(pdf_path: str, base_url: str, timeout: int = 180) -> Tuple[Optional[str], str]:
+    """Call MinerU Agent style API. Base URL must be set by user.
+    Supports common response shapes and async polling URLs/ids.
+    """
+    base_url = (base_url or os.getenv("MINERU_API_BASE") or "").strip().rstrip("/")
+    if not base_url:
+        return None, "MinerU base URL not set. Used local fallback."
+    upload_url = base_url
+    if not upload_url.endswith("/api/v1/agent/parse/file"):
+        upload_url = base_url + "/api/v1/agent/parse/file"
+    try:
+        with open(pdf_path, "rb") as f:
+            files = {"file": (os.path.basename(pdf_path), f, "application/pdf")}
+            r = requests.post(upload_url, files=files, timeout=60)
+        r.raise_for_status()
+        data = r.json() if "json" in r.headers.get("content-type", "").lower() else {"text": r.text}
+    except Exception as e:
+        return None, f"MinerU upload failed: {e}. Used local fallback."
+    # Direct markdown/text response
+    direct = find_text_in_obj(data)
+    if direct and len(direct) > 80:
+        return direct, "MinerU direct result used."
+    task_id = data.get("task_id") or data.get("taskId") or data.get("id") or data.get("data", {}).get("task_id") or data.get("data", {}).get("taskId")
+    poll_url = data.get("poll_url") or data.get("result_url") or data.get("data", {}).get("poll_url") or data.get("data", {}).get("result_url")
+    if not poll_url and task_id:
+        poll_url = base_url.rsplit("/api/v1/agent/parse/file", 1)[0].rstrip("/") + f"/api/v1/agent/task/{task_id}"
+    if not poll_url:
+        return None, "MinerU did not return markdown or task id. Used local fallback."
+    start = time.time()
+    last = None
+    while time.time() - start < timeout:
+        try:
+            rr = requests.get(poll_url, timeout=30)
+            rr.raise_for_status()
+            last = rr.json() if "json" in rr.headers.get("content-type", "").lower() else {"text": rr.text}
+            status = str(last.get("status") or last.get("data", {}).get("status") or "").lower()
+            found = find_text_in_obj(last)
+            if found and (status in ("done", "completed", "success", "") or len(found) > 80):
+                return found, "MinerU async result used."
+            if status in ("failed", "error"):
+                return None, f"MinerU task failed: {last}. Used local fallback."
+        except Exception as e:
+            return None, f"MinerU polling failed: {e}. Used local fallback."
+        time.sleep(2)
+    return None, f"MinerU timeout. Last={last}. Used local fallback."
+def find_text_in_obj(obj: Any) -> Optional[str]:
+    keys = {"markdown", "md", "text", "content", "result", "html"}
+    if isinstance(obj, str):
+        return obj
+    if isinstance(obj, dict):
+        for k, v in obj.items():
+            if k.lower() in keys and isinstance(v, str) and len(v.strip()) > 20:
+                return v
+        for v in obj.values():
+            res = find_text_in_obj(v)
+            if res:
+                return res
+    if isinstance(obj, list):
+        parts = [find_text_in_obj(x) for x in obj]
+        parts = [p for p in parts if p]
+        if parts:
+            return "\n".join(parts)
+    return None
+def split_records(text: str) -> List[str]:
+    text = remove_noise(text)
+    matches = list(DATE_RE.finditer(text))
+    records = []
+    for i, m in enumerate(matches):
+        d = m.group(0)
+        # Skip account From/To date labels by checking context before date
+        before = text[max(0, m.start()-20):m.start()].lower()
+        if "to date" in before or "from date" in before:
+            continue
+        end = matches[i+1].start() if i + 1 < len(matches) else len(text)
+        rec = text[m.start():end].strip()
+        if "Opening Balance" in rec:
+            continue
+        records.append(rec)
+    return records
+def parse_vidarbha_record(rec: str) -> Optional[Dict[str, str]]:
+    dates = DATE_RE.findall(rec)
+    if not dates:
+        return None
+    txn_date = normalize_date(dates[0])
+    # remove first two dates (txn + value date) when present
+    body = rec
+    for d in dates[:2]:
+        body = body.replace(d, " ", 1)
+    body = re.sub(r"\s+", " ", body).strip()
+    # Find amounts with Cr/Dr markers. Last is balance, previous is txn amount.
+    pairs = [(m.group(1), (m.group(2) or ""), m.start(), m.end()) for m in NUM_RE.finditer(body)]
+    signed = [p for p in pairs if p[1]]
+    if len(signed) < 2:
+        return None
+    bal_num, bal_sign, bal_start, bal_end = signed[-1]
+    amt_num, amt_sign, amt_start, amt_end = signed[-2]
+    voucher_type = "Receipt" if amt_sign.lower() == "cr" else "Payment"
+    amount = clean_number(amt_num)
+    closing_balance = clean_balance(bal_num, bal_sign)
+    narration = body[:amt_start].strip()
+    # remove common ref/mode/code fragments at end
+    narration = re.sub(r"\b(?:SC:\s*\d+|\d{1,6})\s+(?:By Clg|To Trf)\s+\d{3,6}\s*$", "", narration, flags=re.I).strip()
+    narration = re.sub(r"\s+", " ", narration).strip(" -")
+    if not narration or narration.lower().startswith(("name :-", "branch name", "account number", "to date", "from date")):
+        return None
+    return {
+        "date": txn_date,
+        "narration": narration,
+        "voucher_type": voucher_type,
+        "amount": amount,
+        "closing_balance": closing_balance,
+    }
+def extract_transactions_from_text(text: str) -> List[Dict[str, str]]:
+    out = []
+    seen = set()
+    for rec in split_records(text):
+        tx = parse_vidarbha_record(rec)
+        if not tx:
+            continue
+        key = tuple(tx.values())
+        if key not in seen:
+            seen.add(key)
+            out.append(tx)
+    return out
+def process_pdf(pdf_file, mineru_base_url: str, use_mineru: bool):
+    if not pdf_file:
+        return {"success": False, "error": "Upload PDF first"}, ""
+    path = pdf_file.name if hasattr(pdf_file, "name") else str(pdf_file)
+    status = []
+    text = None
+    if use_mineru:
+        text, msg = try_mineru_agent(path, mineru_base_url)
+        status.append(msg)
+    if not text:
+        text = extract_text_local(path)
+        status.append("Local pypdf text extraction used.")
+    txs = extract_transactions_from_text(text)
+    result = {
+        "success": True,
+        "count": len(txs),
+        "transactions": txs,
+    }
+    return result, "\n".join(status) + "\n\n--- extracted text preview ---\n" + text[:2500]
+with gr.Blocks(title="MinerU Bank JSON Extractor") as demo:
+    gr.Markdown("# MinerU Bank Statement → Transaction JSON")
+    gr.Markdown("Output only: `date`, `narration`, `voucher_type`, `amount`, `closing_balance`.")
+    with gr.Row():
+        pdf = gr.File(label="Upload bank statement PDF", file_types=[".pdf"])
+    with gr.Row():
+        use_mineru = gr.Checkbox(value=True, label="Use MinerU Agent API first")
+        mineru_url = gr.Textbox(label="MinerU API Base URL", placeholder="https://your-mineru-host.com", value=os.getenv("MINERU_API_BASE", ""))
+    btn = gr.Button("Extract JSON", variant="primary")
+    output = gr.JSON(label="Transactions JSON")
+    logs = gr.Textbox(label="Logs / Text preview", lines=14)
+    btn.click(process_pdf, inputs=[pdf, mineru_url, use_mineru], outputs=[output, logs])
 if __name__ == "__main__":
     demo.launch()

requirements.txt CHANGED Viewed

@@ -1,4 +1,3 @@
-pdfplumber==0.11.4
-pypdfium2==4.30.0
-pillow
-pyaudioop

+requests>=2.31.0
+pypdf>=4.2.0
+pyaudioop>=0.2.0