import os
import re
import json
import time
import tempfile
from typing import Any, Dict, List, Optional, Tuple

import gradio as gr
import requests
from pypdf import PdfReader

# Dates of the form "Mon/DD/YYYY" with a three-letter English month
# abbreviation (1-2 digit day, 4 digit year), matched case-insensitively.
DATE_RE = re.compile(r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)/\d{1,2}/\d{4}\b", re.I)
# A number with optional thousands commas and decimal part (group 1),
# optionally followed by a "Cr"/"Dr" marker (group 2, case-insensitive).
NUM_RE = re.compile(r"(?<!\d)(\d[\d,]*(?:\.\d+)?)\s*(Cr|Dr)?\b", re.I)
# Lower-case month abbreviation -> month number; used by normalize_date().
MONTHS = {"jan":1,"feb":2,"mar":3,"apr":4,"may":5,"jun":6,"jul":7,"aug":8,"sep":9,"oct":10,"nov":11,"dec":12}


def normalize_date(s: str) -> str:
    """Convert a leading ``Mon/DD/YYYY`` date into ISO ``YYYY-MM-DD`` form.

    Args:
        s: Text that starts (after optional whitespace) with a date such as
            ``"Apr/5/2024"``. Anything following the date is ignored.

    Returns:
        The ISO-formatted date. When the text does not start with a
        ``Mon/DD/YYYY`` pattern, or the three letters are not a known month
        abbreviation, the stripped input is returned unchanged.
    """
    text = s.strip()
    m = re.match(r"([A-Za-z]{3})/(\d{1,2})/(\d{4})", text)
    if not m:
        return text
    mon, day, year = m.groups()
    month = MONTHS.get(mon.lower())
    if month is None:
        # The pattern above accepts any three letters; treat a non-month
        # (e.g. "Foo/1/2024") like any other unparseable input instead of
        # raising KeyError.
        return text
    return f"{int(year):04d}-{month:02d}-{int(day):02d}"


def clean_number(s: str) -> str:
    """Normalise a printed amount to a plain two-decimal string.

    Thousands commas and all whitespace are removed before parsing.

    Args:
        s: Amount text such as ``"1,234.5"``. ``None`` is treated as empty;
            other non-string values are converted with ``str()``.

    Returns:
        The amount formatted with two decimals (``"1234.50"``), or the
        comma/whitespace-stripped text unchanged when it is not a number.
    """
    # Only None means "no value": a numeric 0 must still format as "0.00"
    # rather than collapsing to an empty string.
    text = "" if s is None else str(s)
    text = re.sub(r"\s+", "", text.replace(",", ""))
    try:
        return f"{float(text):.2f}"
    except ValueError:
        return text


def clean_balance(num: str, sign: str = "") -> str:
    """Format a balance as ``"<amount> <Sign>"``.

    Args:
        num: Balance amount text, normalised through ``clean_number``.
        sign: Optional marker (e.g. ``"cr"``/``"DR"``); it is stripped and
            title-cased. An empty or missing marker is omitted.

    Returns:
        The amount followed by the marker, with no leading or trailing
        whitespace.
    """
    marker = sign.strip().title() if sign else ""
    amount = clean_number(num)
    return (amount + " " + marker).strip()


def remove_noise(text: str) -> str:
    """Collapse whitespace and strip statement header/column-label phrases.

    Every occurrence of the known bank-name and column-heading phrases
    (including a few variants with a stray space inside the word) is replaced
    by a space, wherever it appears in the text. Matching is case-sensitive.

    The phrase ``"Opening Balance"`` is deliberately left intact: callers use
    it to recognise the opening-balance row, which would be impossible if the
    word ``"Balance"`` were stripped from it.

    Args:
        text: Raw extracted statement text; ``None`` or empty is allowed.

    Returns:
        The cleaned text with single spaces and no leading/trailing space.
    """
    text = re.sub(r"\s+", " ", text or " ").strip()
    noise = [
        "Vidarbha Merchants Urban Co-op Bank Ltd.", "Vidarbha Merchants Urban Co-op Bank", "Account Statement",
        "TXN Date", "Value Date", "Narration", "ReferenceNo", "ChequeNo", "Refere nceNo", "Tr Mode",
        "Branch Code", "Branc h Code", "Debit", "Credit", "Balance", "Balan ce", "Balanc e",
    ]
    for n in noise:
        if n == "Balance":
            # Whitespace was normalised above, so the marker always reads
            # "Opening Balance" with exactly one space; keep that occurrence.
            text = re.sub(r"(?<!Opening )Balance", " ", text)
        else:
            text = text.replace(n, " ")
    return re.sub(r"\s+", " ", text).strip()


def extract_text_local(pdf_path: str) -> str:
    """Extract the text of every page of a PDF with pypdf.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The per-page texts joined with newlines; a page for which pypdf
        returns no text contributes an empty string.
    """
    return "\n".join(page.extract_text() or "" for page in PdfReader(pdf_path).pages)


def _mineru_field(payload: Any, top: Tuple[str, ...], nested: Tuple[str, ...]) -> Any:
    """Return the first truthy value under ``top`` keys, then under ``nested`` keys of ``payload["data"]``.

    Returns ``None`` when nothing is found or when ``payload`` (or its
    ``"data"`` entry) is not a dict, so unexpected response shapes never raise.
    """
    if not isinstance(payload, dict):
        return None
    for name in top:
        if payload.get(name):
            return payload[name]
    inner = payload.get("data")
    if isinstance(inner, dict):
        for name in nested:
            if inner.get(name):
                return inner[name]
    return None


def try_mineru_agent(pdf_path: str, base_url: str, timeout: int = 180) -> Tuple[Optional[str], str]:
    """Call a MinerU Agent style API and return the parsed document text.

    The PDF is uploaded to ``<base>/api/v1/agent/parse/file``. If the upload
    response already contains more than 80 characters of text it is used
    directly; otherwise a poll URL (or one derived from a task id) is polled
    every 2 seconds until text is available, the task fails, or ``timeout``
    seconds have elapsed.

    Args:
        pdf_path: Path of the PDF to upload.
        base_url: API base URL (or the full upload endpoint). Falls back to
            the ``MINERU_API_BASE`` environment variable when empty.
        timeout: Maximum number of seconds to spend polling.

    Returns:
        ``(text, message)``. ``text`` is ``None`` whenever MinerU could not
        provide a result; ``message`` always describes what happened. This
        function is best-effort and does not raise for network, HTTP or
        response-shape problems.
    """
    base_url = (base_url or os.getenv("MINERU_API_BASE") or "").strip().rstrip("/")
    if not base_url:
        return None, "MinerU base URL not set. Used local fallback."

    endpoint = "/api/v1/agent/parse/file"
    upload_url = base_url if base_url.endswith(endpoint) else base_url + endpoint

    try:
        with open(pdf_path, "rb") as f:
            files = {"file": (os.path.basename(pdf_path), f, "application/pdf")}
            r = requests.post(upload_url, files=files, timeout=60)
        r.raise_for_status()
        data = r.json() if "json" in r.headers.get("content-type", "").lower() else {"text": r.text}
    except Exception as e:
        # Deliberately broad: any failure here must degrade to the local
        # fallback rather than abort the caller.
        return None, f"MinerU upload failed: {e}. Used local fallback."

    # Direct markdown/text response
    direct = find_text_in_obj(data)
    if direct and len(direct) > 80:
        return direct, "MinerU direct result used."

    # The payload may be a list, or carry a null/non-dict "data" entry, so
    # fields are read through a shape-tolerant lookup instead of chained .get().
    task_id = _mineru_field(data, ("task_id", "taskId", "id"), ("task_id", "taskId"))
    poll_url = _mineru_field(data, ("poll_url", "result_url"), ("poll_url", "result_url"))
    if not poll_url and task_id:
        poll_url = base_url.rsplit(endpoint, 1)[0].rstrip("/") + f"/api/v1/agent/task/{task_id}"
    if not poll_url:
        return None, "MinerU did not return markdown or task id. Used local fallback."

    # Monotonic clock: the deadline is unaffected by wall-clock adjustments.
    deadline = time.monotonic() + timeout
    last = None
    while time.monotonic() < deadline:
        try:
            rr = requests.get(poll_url, timeout=30)
            rr.raise_for_status()
            last = rr.json() if "json" in rr.headers.get("content-type", "").lower() else {"text": rr.text}
            status = str(_mineru_field(last, ("status",), ("status",)) or "").lower()
            found = find_text_in_obj(last)
            if found and (status in ("done", "completed", "success", "") or len(found) > 80):
                return found, "MinerU async result used."
            if status in ("failed", "error"):
                return None, f"MinerU task failed: {last}. Used local fallback."
        except Exception as e:
            return None, f"MinerU polling failed: {e}. Used local fallback."
        time.sleep(2)
    return None, f"MinerU timeout. Last={last}. Used local fallback."


# Response keys whose values are treated as document text.
_TEXT_KEYS = frozenset({"markdown", "md", "text", "content", "result", "html"})


def find_text_in_obj(obj: Any) -> Optional[str]:
    """Find document text inside a decoded API response.

    Search rules:
      * a string passed directly is returned as-is;
      * in a dict, a string longer than 20 characters (after stripping) under
        one of the recognised text keys wins; otherwise nested dicts and lists
        are searched in insertion order;
      * in a list, the texts found in each element are joined with newlines.

    Strings stored under unrecognised keys (status values, task ids, file
    names, ...) are never returned, so metadata cannot be mistaken for the
    document text.

    Args:
        obj: Any JSON-like value (str, dict, list, or other scalars).

    Returns:
        The text found, or ``None`` when there is none.
    """
    return _find_text(obj, True)


def _find_text(obj: Any, bare_ok: bool) -> Optional[str]:
    """Recursive worker; ``bare_ok`` says whether a plain string at this position counts as text."""
    if isinstance(obj, str):
        return obj if bare_ok else None
    if isinstance(obj, dict):
        for k, v in obj.items():
            if isinstance(k, str) and k.lower() in _TEXT_KEYS and isinstance(v, str) and len(v.strip()) > 20:
                return v
        for k, v in obj.items():
            if isinstance(v, str):
                # Already considered above; a string under another key (or a
                # too-short one) is metadata, not document text.
                continue
            # Bare strings inside a nested list only count when the list sits
            # under a recognised text key, e.g. {"result": ["page 1", ...]}.
            under_text_key = isinstance(k, str) and k.lower() in _TEXT_KEYS
            res = _find_text(v, under_text_key)
            if res:
                return res
        return None
    if isinstance(obj, list):
        parts = [p for p in (_find_text(x, bare_ok) for x in obj) if p]
        if parts:
            return "\n".join(parts)
    return None


def split_records(text: str) -> List[str]:
    """Split statement text into one chunk per dated row.

    The text is first passed through ``remove_noise``. Each date found by
    ``DATE_RE`` starts a record that runs up to the next record's date. A
    date that follows the record's first date with nothing but whitespace in
    between is treated as the same row's value date and stays in the record,
    so the transaction date is not lost.

    Skipped entirely:
      * dates that have "to date" or "from date" within the 20 characters
        before them (statement period labels);
      * records whose text contains "Opening Balance".

    Args:
        text: Raw extracted statement text.

    Returns:
        The record chunks, in document order.
    """
    text = remove_noise(text)
    matches = list(DATE_RE.finditer(text))
    total = len(matches)
    records = []
    i = 0
    while i < total:
        m = matches[i]
        i += 1
        # Skip account From/To date labels by checking context before date
        before = text[max(0, m.start() - 20):m.start()].lower()
        if "to date" in before or "from date" in before:
            continue
        # "txn-date value-date narration ...": keep the adjacent value date
        # inside this record instead of letting it start a record of its own.
        if i < total and not text[m.end():matches[i].start()].strip():
            i += 1
        end = matches[i].start() if i < total else len(text)
        rec = text[m.start():end].strip()
        if "Opening Balance" in rec:
            continue
        records.append(rec)
    return records


def parse_vidarbha_record(rec: str) -> Optional[Dict[str, str]]:
    """Parse one record chunk into a transaction dict.

    The first date in the chunk becomes the transaction date. Among the
    numbers carrying a Cr/Dr marker, the last is taken as the closing balance
    and the one before it as the transaction amount; everything in front of
    the amount (minus trailing reference/mode/code fragments) is the
    narration.

    Args:
        rec: Text of a single record, starting with its date(s).

    Returns:
        A dict with keys ``date``, ``narration``, ``voucher_type``,
        ``amount`` and ``closing_balance``, or ``None`` when the chunk has no
        date, fewer than two Cr/Dr-marked numbers, an empty narration, or a
        narration that starts with a statement-header label.
    """
    found_dates = DATE_RE.findall(rec)
    if not found_dates:
        return None

    # Drop the first occurrence of each of the first two dates (txn + value
    # date) so only narration and amounts remain.
    remainder = rec
    for stamp in found_dates[:2]:
        remainder = remainder.replace(stamp, " ", 1)
    remainder = re.sub(r"\s+", " ", remainder).strip()

    # Only numbers with an explicit Cr/Dr marker are candidates.
    marked = [hit for hit in NUM_RE.finditer(remainder) if hit.group(2)]
    if len(marked) < 2:
        return None
    amount_hit, balance_hit = marked[-2], marked[-1]

    description = remainder[:amount_hit.start()].strip()
    # remove common ref/mode/code fragments at end
    description = re.sub(
        r"\b(?:SC:\s*\d+|\d{1,6})\s+(?:By Clg|To Trf)\s+\d{3,6}\s*$", "", description, flags=re.I
    ).strip()
    description = re.sub(r"\s+", " ", description).strip(" -")

    header_prefixes = ("name :-", "branch name", "account number", "to date", "from date")
    if not description or description.lower().startswith(header_prefixes):
        return None

    is_credit = amount_hit.group(2).lower() == "cr"
    return {
        "date": normalize_date(found_dates[0]),
        "narration": description,
        "voucher_type": "Receipt" if is_credit else "Payment",
        "amount": clean_number(amount_hit.group(1)),
        "closing_balance": clean_balance(balance_hit.group(1), balance_hit.group(2)),
    }


def extract_transactions_from_text(text: str) -> List[Dict[str, str]]:
    """Extract the distinct transactions found in statement text.

    The text is split with ``split_records`` and each chunk parsed with
    ``parse_vidarbha_record``; chunks that do not parse are ignored.

    Args:
        text: Raw extracted statement text.

    Returns:
        Transaction dicts in order of first appearance. Transactions whose
        field values are all identical to an earlier one are dropped.
    """
    # Dicts preserve insertion order, so keying on the value tuple keeps the
    # first occurrence of each transaction in document order.
    unique = {}
    for chunk in split_records(text):
        parsed = parse_vidarbha_record(chunk)
        if parsed:
            unique.setdefault(tuple(parsed.values()), parsed)
    return list(unique.values())


def process_pdf(pdf_file, mineru_base_url: str, use_mineru: bool):
    """Gradio handler: turn an uploaded statement PDF into transaction JSON.

    Text is obtained from MinerU when ``use_mineru`` is set and the call
    yields text; otherwise it is extracted locally with pypdf.

    Args:
        pdf_file: The uploaded file: an object with a ``name`` attribute
            holding its path, or a path-like value. Falsy means no upload.
        mineru_base_url: Base URL forwarded to ``try_mineru_agent``.
        use_mineru: Whether to try MinerU before the local extractor.

    Returns:
        ``(result, log)``. ``result`` is ``{"success": True, "count": ...,
        "transactions": [...]}`` on success or ``{"success": False,
        "error": ...}`` when no file was given or the PDF could not be read.
        ``log`` holds the status messages and, on success, a preview of the
        first 2500 characters of extracted text.
    """
    if not pdf_file:
        return {"success": False, "error": "Upload PDF first"}, ""
    path = pdf_file.name if hasattr(pdf_file, "name") else str(pdf_file)
    status = []
    text = None
    if use_mineru:
        text, msg = try_mineru_agent(path, mineru_base_url)
        status.append(msg)
    if not text:
        try:
            text = extract_text_local(path)
        except Exception as e:
            # UI boundary: report an unreadable/corrupt PDF as a structured
            # failure, like the missing-upload case, instead of raising.
            status.append(f"Local pypdf text extraction failed: {e}")
            return {"success": False, "error": f"Could not read PDF: {e}"}, "\n".join(status)
        status.append("Local pypdf text extraction used.")

    txs = extract_transactions_from_text(text)
    result = {
        "success": True,
        "count": len(txs),
        "transactions": txs,
    }
    return result, "\n".join(status) + "\n\n--- extracted text preview ---\n" + text[:2500]


# Gradio UI definition. Components are created in display order inside the
# Blocks context; the button wires the inputs to process_pdf().
with gr.Blocks(title="MinerU Bank JSON Extractor") as demo:
    gr.Markdown("# MinerU Bank Statement → Transaction JSON")
    gr.Markdown("Output only: `date`, `narration`, `voucher_type`, `amount`, `closing_balance`.")
    with gr.Row():
        pdf = gr.File(label="Upload bank statement PDF", file_types=[".pdf"])
    with gr.Row():
        use_mineru = gr.Checkbox(value=True, label="Use MinerU Agent API first")
        # Pre-filled from the MINERU_API_BASE environment variable when set.
        mineru_url = gr.Textbox(label="MinerU API Base URL", placeholder="https://your-mineru-host.com", value=os.getenv("MINERU_API_BASE", ""))
    btn = gr.Button("Extract JSON", variant="primary")
    output = gr.JSON(label="Transactions JSON")
    logs = gr.Textbox(label="Logs / Text preview", lines=14)
    # Input order must match process_pdf(pdf_file, mineru_base_url, use_mineru);
    # its two return values fill the JSON view and the log box respectively.
    btn.click(process_pdf, inputs=[pdf, mineru_url, use_mineru], outputs=[output, logs])

# Start the web app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()