"""ConvFinQA agent demo, hosted on Hugging Face Spaces with ZeroGPU.

Two modes:
- Replay: pick one of the 421 dev records and watch the fine-tuned agent
  drive through every turn, showing predicted vs gold answers per turn.
- Chat: pick a record's document, then ask your own free-form numerical
  questions over it.

The agent loop, system prompt, tool-call parser, and calculator are the
same code that ships in the model card's reference loop, so this Space
runs the model the way the README says it should.
"""

import ast
import json
import re

import gradio as gr
import spaces
import torch
from huggingface_hub import hf_hub_download
from transformers import AutoModelForImageTextToText, AutoTokenizer

# Hub repo id of the checkpoint used for both the tokenizer and the model.
MODEL_ID = "sharick008/convfinqa-qwen3.5-4b"
# Hub dataset repo, and the JSON file inside it, that hold the records.
DATASET_REPO = "sharick008/convfinqa"
DATASET_FILENAME = "convfinqa_dataset.json"

# Fixed instruction text for the system message. Callers concatenate the
# rendered document (see render_document) directly after the trailing
# "## Document" heading, so the blank line that ends the literal is significant.
SYSTEM_PROMPT_PREFIX = """You are a financial analyst reading one page from a 10-K annual report filed
with the U.S. Securities and Exchange Commission. Your job is to answer
numerical questions about this page.

## Conversation format

Each turn asks one question. Later turns in the same conversation may
reference earlier answers (for example "the difference", "that amount").
Reuse numbers you have already derived.

## Tools

- `calculate(expression)`: evaluate an arithmetic expression. Call this
  for every +, -, *, /, ** or percentage step. Copy the returned number
  through verbatim; it is already at the precision the grader expects.
- `submit_answer(value, unit)`: submit the final answer. Call this once
  you have the answer for the current question. The conversation does
  not advance until you call `submit_answer`.

## Table units and scale

Tables often state their scale in headers like
"(in millions, except per share data)" or "(amounts in thousands)".
When the question asks for a value that appears in such a table,
return the cell value AS WRITTEN. Do NOT multiply it out into raw
units. Gold answers in this dataset are in the table's stated scale.

Example: if the table caption says "(in millions)" and the cell
shows 29,500, the answer is 29500 (unit `absolute`), not
29,500,000,000.

If a calculation combines two values from a "(in millions)" table,
the magnitude is preserved: both inputs are in millions and so is the
result.

## Answer units

When you call `submit_answer`, choose the `unit` that matches your
`value`:

- `fraction`: a decimal ratio, e.g. 0.14136 for 14.136%.
- `percent`: a percentage value, e.g. 14.136 for 14.136%.
- `absolute`: a raw count or currency amount with no unit symbol,
  e.g. 206588.
- `count`: a whole-number count, e.g. 4.
- `yes_no`: the string 'yes' or 'no'.

## Worked examples

Percentage change, answered as a fraction.
Question: net cash was 206588 in 2009 and 181001 in 2008; what is the
percentage change? Call `calculate("(206588 - 181001) / 181001")`
which returns 0.14136. Then call
`submit_answer(value=0.14136, unit="fraction")`.

Ratio, answered as a percent.
Question: what amortisation rate does an 8-year useful life represent?
Call `calculate("100 / 8")` which returns 12.5. Then call
`submit_answer(value=12.5, unit="percent")`.

Raw currency amount, no arithmetic needed.
Question: what long-term debt matures in 2017? Read the cell directly
from the table and call `submit_answer(value=307403, unit="absolute")`.

Scaled-table value, no arithmetic.
Question: what was Net cash from financing activities in 2014?
Table caption says "(in millions)" and the cell reads 29,500. Call
`submit_answer(value=29500, unit="absolute")`. Do NOT submit
29500000000 or 29.5 billion. Keep the value in the table's stated
scale.

Boolean.
Question: is net income higher in 2009 than in 2008? Call
`calculate("103102 - 104222")` which returns -1120. Then call
`submit_answer(value="no", unit="yes_no")`.

Unit mismatch.
For the percentage-change question above, do not submit value=14.136
with unit="absolute". That would be flagged as a unit mismatch.
Either submit 0.14136 as `fraction`, or 14.136 as `percent`.

## Document

"""


def render_document(pre_text, table, post_text):
    """Render one record as the XML-wrapped document placed in the system prompt.

    ``table`` maps column name -> {row label -> cell value}. The table is
    emitted as a pipe table whose first (unnamed) column holds the row
    labels. Row labels are the union over all columns in first-seen order;
    a cell missing from a column is rendered as an empty string. Cell text is
    not escaped.

    Returns a single string: ``<document>`` wrapping ``<pre_text>``,
    ``<table>`` and ``<post_text>`` sections.
    """
    column_names = list(table)

    # A dict doubles as an insertion-ordered set of row labels.
    row_labels = {}
    for column in table.values():
        for label in column:
            row_labels.setdefault(label, None)

    header = "| | " + " | ".join(column_names) + " |"
    divider = "|---" * (len(column_names) + 1) + "|"
    body = [
        f"| {label} | "
        + " | ".join(str(table[name].get(label, "")) for name in column_names)
        + " |"
        for label in row_labels
    ]
    table_md = "\n".join([header, divider, *body])

    return (
        f"<document>\n<pre_text>\n{pre_text}\n</pre_text>\n"
        f"<table>\n{table_md}\n</table>\n"
        f"<post_text>\n{post_text}\n</post_text>\n</document>"
    )


def render_document_markdown(record):
    """Build the Markdown shown in the UI for a record's document.

    Reads ``record["doc"]`` (``pre_text``, ``table``, ``post_text``) and the
    optional ``record["features"]`` flags. The table is laid out with the same
    pipe-table scheme used for the model prompt, without the XML wrapper.
    When ``has_non_numeric_values`` or ``has_duplicate_columns`` is set, an
    italic note describing the artefact is placed before the document.
    """
    doc = record["doc"]
    features = record.get("features", {})
    table = doc["table"]
    column_names = list(table)

    # Row labels: union over all columns, keeping first-seen order.
    row_labels = {}
    for column in table.values():
        for label in column:
            row_labels.setdefault(label, None)

    lines = [
        "| | " + " | ".join(column_names) + " |",
        "|---" * (len(column_names) + 1) + "|",
    ]
    lines.extend(
        f"| {label} | "
        + " | ".join(str(table[name].get(label, "")) for name in column_names)
        + " |"
        for label in row_labels
    )

    # Feature flag -> human-readable caveat, in display order.
    caveats = {
        "has_non_numeric_values": (
            "non-numeric cells (the upstream cleaner sometimes folds first-row "
            "values into the column-name string)"
        ),
        "has_duplicate_columns": "duplicate column headers not fully disambiguated by cleaning",
    }
    notes = [text for flag, text in caveats.items() if features.get(flag)]

    parts = []
    if notes:
        parts.append(
            "_Note: this record's `features` flags it as having "
            + " and ".join(notes)
            + ". The table below is exactly what the model sees, artefacts and all._\n\n"
        )
    parts += [
        "**Pre-text:**  \n",
        doc["pre_text"],
        "\n\n**Table:**  \n",
        "\n".join(lines),
        "\n\n**Post-text:**  \n",
        doc["post_text"],
    ]
    return "".join(parts)


# One tool invocation: <tool_call><function=NAME> ...body... </function></tool_call>.
# Group 1 is the function name, group 2 the body holding the parameters.
TOOL_CALL_RE = re.compile(
    r"<tool_call>\s*<function=([^>]+)>\s*(.*?)\s*</function>\s*</tool_call>",
    flags=re.DOTALL,
)
# One parameter inside a call body: <parameter=KEY> VALUE </parameter>.
PARAM_RE = re.compile(r"<parameter=([^>]+)>\s*(.*?)\s*</parameter>", flags=re.DOTALL)


def parse_tool_calls(text):
    """Extract every tool call found in ``text``, in order of appearance.

    Returns a list of ``{"name": str, "arguments": {key: value}}`` dicts.
    Names, keys and values are whitespace-stripped strings; when a parameter
    key repeats within one call the last value wins. Text outside
    ``<tool_call>`` blocks is ignored, so plain prose yields ``[]``.
    """
    calls = []
    for call_match in TOOL_CALL_RE.finditer(text):
        function_name, body = call_match.groups()
        arguments = {}
        for param_match in PARAM_RE.finditer(body):
            key, value = param_match.groups()
            arguments[key.strip()] = value.strip()
        calls.append({"name": function_name.strip(), "arguments": arguments})
    return calls


# Whitelisted operator node types; everything else is rejected.
_BIN = (ast.Add, ast.Sub, ast.Mult, ast.Div, ast.Pow)
_UN = (ast.UAdd, ast.USub)

# Implementation for each whitelisted binary operator, keyed by AST node type.
_BIN_IMPL = {
    ast.Add: lambda a, b: a + b,
    ast.Sub: lambda a, b: a - b,
    ast.Mult: lambda a, b: a * b,
    ast.Div: lambda a, b: a / b,
    ast.Pow: lambda a, b: a ** b,
}


def _eval_node(node):
    """Recursively evaluate a parsed arithmetic expression node.

    Only int/float literals (not bools), unary ``+``/``-`` and the binary
    operators ``+ - * / **`` are accepted; literals are converted to float.

    Raises:
        ValueError: for any other literal or syntax construct.
    Arithmetic errors such as ``ZeroDivisionError`` propagate unchanged.
    """
    if isinstance(node, ast.Constant):
        literal = node.value
        # bool is a subclass of int, so it has to be excluded explicitly.
        is_number = isinstance(literal, (int, float)) and not isinstance(literal, bool)
        if not is_number:
            raise ValueError(f"non-numeric literal: {node.value!r}")
        return float(literal)

    if isinstance(node, ast.UnaryOp) and isinstance(node.op, _UN):
        operand = _eval_node(node.operand)
        if isinstance(node.op, ast.USub):
            return -operand
        return +operand

    if isinstance(node, ast.BinOp) and isinstance(node.op, _BIN):
        lhs = _eval_node(node.left)
        rhs = _eval_node(node.right)
        return _BIN_IMPL[type(node.op)](lhs, rhs)

    raise ValueError(f"disallowed construct: {type(node).__name__}")


def calculate(expression):
    """Evaluate an arithmetic expression string and round the result to 5 places.

    Commas are removed first (so ``"1,000"`` reads as ``1000``) and
    surrounding whitespace is stripped. The text is parsed with ``ast`` and
    evaluated by ``_eval_node``; nothing is ever passed to ``eval``.

    Raises ``SyntaxError`` for unparsable text, ``ValueError`` for disallowed
    constructs, and whatever arithmetic error the evaluation itself raises.
    """
    text = str(expression).replace(",", "").strip()
    tree = ast.parse(text, mode="eval")
    value = _eval_node(tree.body)
    return round(float(value), 5)


# Resolve the dataset JSON through the Hub client; this runs at import time.
_dataset_path = hf_hub_download(repo_id=DATASET_REPO, filename=DATASET_FILENAME, repo_type="dataset")
# Pin UTF-8 so decoding does not depend on the host's locale default encoding
# (JSON interchange text is UTF-8, and filing text can contain non-ASCII characters).
with open(_dataset_path, encoding="utf-8") as _f:
    DATASET = json.load(_f)
# Index the dev split by record id; the sorted ids feed the UI dropdowns.
DEV_RECORDS = {r["id"]: r for r in DATASET["dev"]}
DEV_IDS = sorted(DEV_RECORDS)

# The tokenizer is loaded at import time; the model itself is loaded lazily
# by _ensure_model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Token id of "<|im_end|>", passed to generate() as both the EOS and the pad id.
IM_END_ID = tokenizer.convert_tokens_to_ids("<|im_end|>")

# Module-level cache for the loaded model; populated on first use.
_model = None


def _ensure_model():
    """Return the shared model, loading it on the first call.

    The checkpoint named by ``MODEL_ID`` is loaded in bfloat16 with
    ``device_map="cuda"`` and put in eval mode. Later calls return the cached
    instance without reloading.
    """
    global _model
    if _model is not None:
        return _model
    _model = AutoModelForImageTextToText.from_pretrained(
        MODEL_ID,
        device_map="cuda",
        dtype=torch.bfloat16,
    )
    _model.eval()
    return _model


@spaces.GPU(duration=120)
def _run_turn_gpu(messages):
    """Drive the agent for one question until it calls ``submit_answer``.

    ``messages`` is the running chat history and is appended to in place:
    each generated assistant reply, each calculator result (role ``tool``),
    and a user nudge whenever a reply contains no tool call. At most six
    generations are attempted, each greedy and capped at 512 new tokens.

    Returns the ``arguments`` dict of the first ``submit_answer`` call.

    Raises:
        RuntimeError: if no ``submit_answer`` arrives within the budget.
    Errors from the calculator (bad expression, missing ``expression``
    argument, division by zero) are not caught here and abort the turn.
    """
    model = _ensure_model()
    attempts_left = 6
    while attempts_left > 0:
        attempts_left -= 1

        rendered = tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, enable_thinking=False, tokenize=False
        )
        encoded = tokenizer(rendered, return_tensors="pt").to(model.device)
        prompt_len = encoded.input_ids.shape[1]
        generated = model.generate(
            **encoded,
            max_new_tokens=512,
            do_sample=False,
            eos_token_id=IM_END_ID,
            pad_token_id=IM_END_ID,
        )
        # Decode only the newly generated tokens, not the echoed prompt.
        reply = tokenizer.decode(generated[0, prompt_len:], skip_special_tokens=True)
        messages.append({"role": "assistant", "content": reply})

        tool_calls = parse_tool_calls(reply)
        if tool_calls:
            for call in tool_calls:
                tool = call["name"]
                if tool == "submit_answer":
                    return call["arguments"]
                if tool != "calculate":
                    # NOTE(review): unrecognised tool names get no response
                    # message; the next generation simply follows this reply.
                    continue
                outcome = calculate(call["arguments"]["expression"])
                messages.append({"role": "tool", "content": str(outcome)})
        else:
            messages.append(
                {"role": "user", "content": "Please call submit_answer with the final answer for the current question."}
            )
    raise RuntimeError("no submit_answer after max_iterations")


def grade(pred_value, pred_unit, gold):
    """Compare a submitted answer with the gold answer.

    Returns ``(correct, hint)`` where ``hint`` is a short reason for some
    failures and ``""`` otherwise.

    * String gold: the prediction must use unit ``yes_no`` and match the gold
      text ignoring case and surrounding whitespace.
    * Numeric gold: ``pred_value`` is parsed as a float; ``percent`` values
      are divided by 100, while ``fraction``, ``count`` and ``absolute`` are
      used as given. The result is rounded to 5 places and must equal the
      gold within 1e-9.
    """
    if isinstance(gold, str):
        if pred_unit == "yes_no":
            return str(pred_value).strip().lower() == gold.strip().lower(), ""
        return False, "expected yes_no"

    try:
        number = float(pred_value)
    except (TypeError, ValueError):
        return False, "non-numeric"

    if pred_unit == "percent":
        # Percent predictions are compared after scaling by 1/100.
        number = number / 100
    elif pred_unit not in ("fraction", "count", "absolute"):
        return False, f"unknown unit {pred_unit!r}"

    delta = round(number, 5) - float(gold)
    return abs(delta) < 1e-9, ""


def replay_record(record_id, progress=gr.Progress()):
    """Run the agent over every question of one dev record and grade each turn.

    Returns ``(document_markdown, rows, summary)`` for the three Replay
    outputs. Each row is ``[turn_index, question, predicted, gold, verdict]``.
    A turn that raises is recorded with verdict ``"error"`` and the replay
    carries on with the next question. With no ``record_id`` a prompt to pick
    one is returned instead.
    """
    if not record_id:
        return "Pick a record from the dropdown.", None, "—"

    record = DEV_RECORDS[record_id]
    doc_md = render_document_markdown(record)
    doc = record["doc"]
    system_content = SYSTEM_PROMPT_PREFIX + render_document(
        doc["pre_text"], doc["table"], doc["post_text"]
    )
    # One history list is shared by all turns of the conversation.
    # NOTE(review): later turns rely on _run_turn_gpu appending the assistant
    # and tool messages to this same list in place. If the @spaces.GPU wrapper
    # runs the call in another process, those appends would not be visible
    # here - confirm on ZeroGPU.
    messages = [{"role": "system", "content": system_content}]

    dialogue = record["dialogue"]
    questions = dialogue["conv_questions"]
    golds = dialogue["executed_answers"]

    rows = []
    n_correct = 0
    for i, question in enumerate(progress.tqdm(questions, desc="Running turns")):
        messages.append({"role": "user", "content": question})
        try:
            answer = _run_turn_gpu(messages)
            value = answer.get("value", "")
            unit = answer.get("unit", "")
            predicted = f"{value} ({unit})"
            ok, hint = grade(value, unit, golds[i])
            if ok:
                n_correct += 1
                verdict = "correct"
            elif hint:
                verdict = "wrong — " + hint
            else:
                verdict = "wrong"
        except Exception as e:
            # Best effort: surface the failure in the table and continue.
            predicted = f"FAILED: {e}"
            verdict = "error"
        rows.append([i, question, predicted, str(golds[i]), verdict])

    summary = f"{n_correct}/{len(questions)} correct on this record."
    return doc_md, rows, summary


def chat_load_doc(record_id):
    """Return ``(document_markdown, [])`` for the chosen record.

    The empty list is the new chat history, so switching documents clears the
    conversation. With no ``record_id`` a prompt to pick one is returned in
    place of the document.
    """
    if record_id:
        return render_document_markdown(DEV_RECORDS[record_id]), []
    return "Pick a record from the dropdown.", []


def chat_respond(user_msg, history, record_id):
    """Answer one free-form question about the selected document.

    Returns ``(new_history, "")``; the empty string clears the input box.
    ``history`` is a list of ``{"role", "content"}`` dicts and is never
    mutated. With no record selected the user is told to pick one; a blank
    message leaves the history untouched. Otherwise the system prompt plus
    the prior user/assistant turns and the new question are sent to the
    agent, and its submitted answer (or the failure message) becomes the
    assistant reply.
    """
    if not record_id:
        pick_first = [
            {"role": "user", "content": user_msg},
            {"role": "assistant", "content": "Please pick a document from the dropdown first."},
        ]
        return history + pick_first, ""
    if not (user_msg and user_msg.strip()):
        return history, ""

    doc = DEV_RECORDS[record_id]["doc"]
    sys_content = SYSTEM_PROMPT_PREFIX + render_document(
        doc["pre_text"], doc["table"], doc["post_text"]
    )
    messages = [{"role": "system", "content": sys_content}]
    # Forward only user/assistant turns, reduced to role and content.
    messages.extend(
        {"role": turn["role"], "content": turn["content"]}
        for turn in history
        if turn["role"] in ("user", "assistant")
    )
    messages.append({"role": "user", "content": user_msg})

    try:
        answer = _run_turn_gpu(messages)
        value = answer.get("value", "")
        unit = answer.get("unit", "")
        reply = f"**{value}** _(unit: {unit})_"
    except Exception as e:
        # Show the failure in the chat instead of raising into the UI.
        reply = f"_Failed: {e}_"

    exchange = [
        {"role": "user", "content": user_msg},
        {"role": "assistant", "content": reply},
    ]
    return history + exchange, ""


# UI layout and event wiring. Components are created inside nested context
# managers, so the statement order below determines the on-page layout.
with gr.Blocks(title="ConvFinQA agent") as demo:
    # Page header: model/dataset links and a cold-start warning.
    gr.Markdown(
        "# ConvFinQA agent\n"
        "Fine-tuned [Qwen/Qwen3.5-4B](https://huggingface.co/Qwen/Qwen3.5-4B) "
        "(adapter at [sharick008/convfinqa-qwen3.5-4b-lora](https://huggingface.co/sharick008/convfinqa-qwen3.5-4b-lora), "
        "merged at [sharick008/convfinqa-qwen3.5-4b](https://huggingface.co/sharick008/convfinqa-qwen3.5-4b)) "
        "answering multi-turn numerical questions over single-page financial documents from "
        "[ConvFinQA](https://aclanthology.org/2022.emnlp-main.421/) (Chen et al., 2022)."
        "\n\n"
        "First call wakes the GPU and downloads weights — expect ~60s. Subsequent turns are fast."
    )
    with gr.Tabs():
        # Replay tab: run the agent over a record's original questions and
        # show predicted vs gold per turn.
        with gr.Tab("Replay"):
            gr.Markdown(
                "Pick one of the 421 dev records. The agent runs through every turn of "
                "the original conversation and shows the predicted answer next to the "
                "dataset's gold."
            )
            replay_dropdown = gr.Dropdown(
                DEV_IDS, label=f"Dev record ({len(DEV_IDS)} available)", value=DEV_IDS[0]
            )
            replay_run_btn = gr.Button("Run agent through all turns", variant="primary")
            replay_summary = gr.Markdown()
            # Column headers match the row layout produced by replay_record.
            replay_results = gr.Dataframe(
                headers=["turn", "question", "predicted", "gold", "verdict"],
                wrap=True,
                column_widths=["5%", "45%", "15%", "15%", "20%"],
            )
            with gr.Accordion("Document", open=False):
                replay_doc = gr.Markdown()
            # Output order matches replay_record's (document, rows, summary) return.
            replay_run_btn.click(
                replay_record,
                inputs=[replay_dropdown],
                outputs=[replay_doc, replay_results, replay_summary],
            )
        # Chat tab: free-form questions over a chosen record's document.
        with gr.Tab("Chat"):
            gr.Markdown(
                "Pick a document, then ask any numerical question over it. Answers come "
                "back as `value (unit)` from the agent's `submit_answer` call."
            )
            chat_dropdown = gr.Dropdown(DEV_IDS, label="Pick a document", value=DEV_IDS[0])
            with gr.Accordion("Document", open=True):
                chat_doc = gr.Markdown()
            chatbot = gr.Chatbot(label="Conversation", type="messages", height=320)
            with gr.Row():
                chat_input = gr.Textbox(
                    placeholder="Ask a numerical question about the document above...",
                    show_label=False,
                    scale=8,
                )
                chat_send = gr.Button("Send", scale=1, variant="primary")
                chat_clear = gr.Button("Clear", scale=1)
            # Switching documents re-renders the document and resets the chat
            # (chat_load_doc returns an empty history).
            chat_dropdown.change(chat_load_doc, [chat_dropdown], [chat_doc, chatbot])
            # Enter in the textbox and the Send button trigger the same handler;
            # its second return value ("") clears the textbox.
            chat_input.submit(chat_respond, [chat_input, chatbot, chat_dropdown], [chatbot, chat_input])
            chat_send.click(chat_respond, [chat_input, chatbot, chat_dropdown], [chatbot, chat_input])
            # Clear empties the conversation without touching the document.
            chat_clear.click(lambda: [], None, chatbot)
            # Render the default document in the Chat tab via the app's load event.
            demo.load(chat_load_doc, [chat_dropdown], [chat_doc, chatbot])

# Launch only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()