"""ConvFinQA agent demo, hosted on Hugging Face Spaces with ZeroGPU. Two modes: - Replay: pick one of the 421 dev records and watch the fine-tuned agent drive through every turn, showing predicted vs gold answers per turn. - Chat: pick a record's document, then ask your own free-form numerical questions over it. The agent loop, system prompt, tool-call parser, and calculator are the same code that ships in the model card's reference loop, so this Space runs the model the way the README says it should. """ import ast import json import re import gradio as gr import spaces import torch from huggingface_hub import hf_hub_download from transformers import AutoModelForImageTextToText, AutoTokenizer MODEL_ID = "sharick008/convfinqa-qwen3.5-4b" DATASET_REPO = "sharick008/convfinqa" DATASET_FILENAME = "convfinqa_dataset.json" SYSTEM_PROMPT_PREFIX = """You are a financial analyst reading one page from a 10-K annual report filed with the U.S. Securities and Exchange Commission. Your job is to answer numerical questions about this page. ## Conversation format Each turn asks one question. Later turns in the same conversation may reference earlier answers (for example "the difference", "that amount"). Reuse numbers you have already derived. ## Tools - `calculate(expression)`: evaluate an arithmetic expression. Call this for every +, -, *, /, ** or percentage step. Copy the returned number through verbatim; it is already at the precision the grader expects. - `submit_answer(value, unit)`: submit the final answer. Call this once you have the answer for the current question. The conversation does not advance until you call `submit_answer`. ## Table units and scale Tables often state their scale in headers like "(in millions, except per share data)" or "(amounts in thousands)". When the question asks for a value that appears in such a table, return the cell value AS WRITTEN. Do NOT multiply it out into raw units. Gold answers in this dataset are in the table's stated scale. Example: if the table caption says "(in millions)" and the cell shows 29,500, the answer is 29500 (unit `absolute`), not 29,500,000,000. If a calculation combines two values from a "(in millions)" table, the magnitude is preserved: both inputs are in millions and so is the result. ## Answer units When you call `submit_answer`, choose the `unit` that matches your `value`: - `fraction`: a decimal ratio, e.g. 0.14136 for 14.136%. - `percent`: a percentage value, e.g. 14.136 for 14.136%. - `absolute`: a raw count or currency amount with no unit symbol, e.g. 206588. - `count`: a whole-number count, e.g. 4. - `yes_no`: the string 'yes' or 'no'. ## Worked examples Percentage change, answered as a fraction. Question: net cash was 206588 in 2009 and 181001 in 2008; what is the percentage change? Call `calculate("(206588 - 181001) / 181001")` which returns 0.14136. Then call `submit_answer(value=0.14136, unit="fraction")`. Ratio, answered as a percent. Question: what amortisation rate does an 8-year useful life represent? Call `calculate("100 / 8")` which returns 12.5. Then call `submit_answer(value=12.5, unit="percent")`. Raw currency amount, no arithmetic needed. Question: what long-term debt matures in 2017? Read the cell directly from the table and call `submit_answer(value=307403, unit="absolute")`. Scaled-table value, no arithmetic. Question: what was Net cash from financing activities in 2014? Table caption says "(in millions)" and the cell reads 29,500. Call `submit_answer(value=29500, unit="absolute")`. Do NOT submit 29500000000 or 29.5 billion. Keep the value in the table's stated scale. Boolean. Question: is net income higher in 2009 than in 2008? Call `calculate("103102 - 104222")` which returns -1120. Then call `submit_answer(value="no", unit="yes_no")`. Unit mismatch. For the percentage-change question above, do not submit value=14.136 with unit="absolute". That would be flagged as a unit mismatch. Either submit 0.14136 as `fraction`, or 14.136 as `percent`. ## Document """ def render_document(pre_text, table, post_text): cols = list(table) rows = list(dict.fromkeys(r for col in table.values() for r in col)) md = ["| | " + " | ".join(cols) + " |", "|---" * (len(cols) + 1) + "|"] for row in rows: cells = [str(table[c].get(row, "")) for c in cols] md.append(f"| {row} | " + " | ".join(cells) + " |") return ( "\n" f"\n{pre_text}\n\n" "\n" + "\n".join(md) + "\n
\n" f"\n{post_text}\n\n" "
" ) def render_document_markdown(record): """Markdown view for the UI. Cleaner than the XML wrapping the model sees.""" doc = record["doc"] feats = record.get("features", {}) table, cols = doc["table"], list(doc["table"]) rows = list(dict.fromkeys(r for col in table.values() for r in col)) md = ["| | " + " | ".join(cols) + " |", "|---" * (len(cols) + 1) + "|"] for row in rows: cells = [str(table[c].get(row, "")) for c in cols] md.append(f"| {row} | " + " | ".join(cells) + " |") notes = [] if feats.get("has_non_numeric_values"): notes.append( "non-numeric cells (the upstream cleaner sometimes folds first-row " "values into the column-name string)" ) if feats.get("has_duplicate_columns"): notes.append("duplicate column headers not fully disambiguated by cleaning") warning = "" if notes: warning = ( "_Note: this record's `features` flags it as having " + " and ".join(notes) + ". The table below is exactly what the model sees, artefacts and all._\n\n" ) return ( warning + "**Pre-text:** \n" + doc["pre_text"] + "\n\n**Table:** \n" + "\n".join(md) + "\n\n**Post-text:** \n" + doc["post_text"] ) TOOL_CALL_RE = re.compile( r"\s*]+)>\s*(.*?)\s*\s*", re.DOTALL, ) PARAM_RE = re.compile(r"]+)>\s*(.*?)\s*", re.DOTALL) def parse_tool_calls(text): out = [] for m in TOOL_CALL_RE.finditer(text): params = {p.group(1).strip(): p.group(2).strip() for p in PARAM_RE.finditer(m.group(2))} out.append({"name": m.group(1).strip(), "arguments": params}) return out _BIN = (ast.Add, ast.Sub, ast.Mult, ast.Div, ast.Pow) _UN = (ast.UAdd, ast.USub) def _eval_node(node): if isinstance(node, ast.Constant): if isinstance(node.value, bool) or not isinstance(node.value, (int, float)): raise ValueError(f"non-numeric literal: {node.value!r}") return float(node.value) if isinstance(node, ast.UnaryOp) and isinstance(node.op, _UN): v = _eval_node(node.operand) return -v if isinstance(node.op, ast.USub) else +v if isinstance(node, ast.BinOp) and isinstance(node.op, _BIN): l, r = _eval_node(node.left), _eval_node(node.right) if isinstance(node.op, ast.Add): return l + r if isinstance(node.op, ast.Sub): return l - r if isinstance(node.op, ast.Mult): return l * r if isinstance(node.op, ast.Div): return l / r return l ** r raise ValueError(f"disallowed construct: {type(node).__name__}") def calculate(expression): cleaned = str(expression).replace(",", "").strip() return round(float(_eval_node(ast.parse(cleaned, mode="eval").body)), 5) _dataset_path = hf_hub_download(repo_id=DATASET_REPO, filename=DATASET_FILENAME, repo_type="dataset") with open(_dataset_path) as _f: DATASET = json.load(_f) DEV_RECORDS = {r["id"]: r for r in DATASET["dev"]} DEV_IDS = sorted(DEV_RECORDS.keys()) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) IM_END_ID = tokenizer.convert_tokens_to_ids("<|im_end|>") _model = None def _ensure_model(): global _model if _model is None: _model = AutoModelForImageTextToText.from_pretrained( MODEL_ID, dtype=torch.bfloat16, device_map="cuda" ) _model.eval() return _model @spaces.GPU(duration=120) def _run_turn_gpu(messages): """One question turn until submit_answer or budget exhausts. Runs on ZeroGPU.""" model = _ensure_model() for _ in range(6): prompt = tokenizer.apply_chat_template( messages, add_generation_prompt=True, enable_thinking=False, tokenize=False ) inputs = tokenizer(prompt, return_tensors="pt").to(model.device) out = model.generate( **inputs, max_new_tokens=512, do_sample=False, eos_token_id=IM_END_ID, pad_token_id=IM_END_ID, ) text = tokenizer.decode(out[0, inputs.input_ids.shape[1]:], skip_special_tokens=True) messages.append({"role": "assistant", "content": text}) calls = parse_tool_calls(text) if not calls: messages.append( {"role": "user", "content": "Please call submit_answer with the final answer for the current question."} ) continue for c in calls: if c["name"] == "submit_answer": return c["arguments"] if c["name"] == "calculate": result = calculate(c["arguments"]["expression"]) messages.append({"role": "tool", "content": str(result)}) raise RuntimeError("no submit_answer after max_iterations") def grade(pred_value, pred_unit, gold): """Match a predicted answer against the gold executed answer. Returns (correct, hint).""" if isinstance(gold, str): if pred_unit != "yes_no": return False, "expected yes_no" return str(pred_value).strip().lower() == gold.strip().lower(), "" try: v = float(pred_value) except (TypeError, ValueError): return False, "non-numeric" if pred_unit == "percent": v = v / 100 elif pred_unit not in ("fraction", "count", "absolute"): return False, f"unknown unit {pred_unit!r}" return abs(round(v, 5) - float(gold)) < 1e-9, "" def replay_record(record_id, progress=gr.Progress()): if not record_id: return "Pick a record from the dropdown.", None, "—" record = DEV_RECORDS[record_id] doc_md = render_document_markdown(record) sys_msg = { "role": "system", "content": SYSTEM_PROMPT_PREFIX + render_document( record["doc"]["pre_text"], record["doc"]["table"], record["doc"]["post_text"] ), } messages = [sys_msg] rows = [] questions = record["dialogue"]["conv_questions"] golds = record["dialogue"]["executed_answers"] n_correct = 0 for i, q in enumerate(progress.tqdm(questions, desc="Running turns")): messages.append({"role": "user", "content": q}) try: answer = _run_turn_gpu(messages) value, unit = answer.get("value", ""), answer.get("unit", "") predicted = f"{value} ({unit})" ok, hint = grade(value, unit, golds[i]) if ok: n_correct += 1 verdict = "correct" if ok else f"wrong{(' — ' + hint) if hint else ''}" except Exception as e: predicted = f"FAILED: {e}" verdict = "error" rows.append([i, q, predicted, str(golds[i]), verdict]) summary = f"{n_correct}/{len(questions)} correct on this record." return doc_md, rows, summary def chat_load_doc(record_id): if not record_id: return "Pick a record from the dropdown.", [] record = DEV_RECORDS[record_id] return render_document_markdown(record), [] def chat_respond(user_msg, history, record_id): if not record_id: return history + [ {"role": "user", "content": user_msg}, {"role": "assistant", "content": "Please pick a document from the dropdown first."}, ], "" if not user_msg or not user_msg.strip(): return history, "" record = DEV_RECORDS[record_id] sys_content = SYSTEM_PROMPT_PREFIX + render_document( record["doc"]["pre_text"], record["doc"]["table"], record["doc"]["post_text"] ) messages = [{"role": "system", "content": sys_content}] for h in history: if h["role"] in ("user", "assistant"): messages.append({"role": h["role"], "content": h["content"]}) messages.append({"role": "user", "content": user_msg}) try: answer = _run_turn_gpu(messages) value, unit = answer.get("value", ""), answer.get("unit", "") reply = f"**{value}** _(unit: {unit})_" except Exception as e: reply = f"_Failed: {e}_" new_history = history + [ {"role": "user", "content": user_msg}, {"role": "assistant", "content": reply}, ] return new_history, "" with gr.Blocks(title="ConvFinQA agent") as demo: gr.Markdown( "# ConvFinQA agent\n" "Fine-tuned [Qwen/Qwen3.5-4B](https://huggingface.co/Qwen/Qwen3.5-4B) " "(adapter at [sharick008/convfinqa-qwen3.5-4b-lora](https://huggingface.co/sharick008/convfinqa-qwen3.5-4b-lora), " "merged at [sharick008/convfinqa-qwen3.5-4b](https://huggingface.co/sharick008/convfinqa-qwen3.5-4b)) " "answering multi-turn numerical questions over single-page financial documents from " "[ConvFinQA](https://aclanthology.org/2022.emnlp-main.421/) (Chen et al., 2022)." "\n\n" "First call wakes the GPU and downloads weights — expect ~60s. Subsequent turns are fast." ) with gr.Tabs(): with gr.Tab("Replay"): gr.Markdown( "Pick one of the 421 dev records. The agent runs through every turn of " "the original conversation and shows the predicted answer next to the " "dataset's gold." ) replay_dropdown = gr.Dropdown( DEV_IDS, label=f"Dev record ({len(DEV_IDS)} available)", value=DEV_IDS[0] ) replay_run_btn = gr.Button("Run agent through all turns", variant="primary") replay_summary = gr.Markdown() replay_results = gr.Dataframe( headers=["turn", "question", "predicted", "gold", "verdict"], wrap=True, column_widths=["5%", "45%", "15%", "15%", "20%"], ) with gr.Accordion("Document", open=False): replay_doc = gr.Markdown() replay_run_btn.click( replay_record, inputs=[replay_dropdown], outputs=[replay_doc, replay_results, replay_summary], ) with gr.Tab("Chat"): gr.Markdown( "Pick a document, then ask any numerical question over it. Answers come " "back as `value (unit)` from the agent's `submit_answer` call." ) chat_dropdown = gr.Dropdown(DEV_IDS, label="Pick a document", value=DEV_IDS[0]) with gr.Accordion("Document", open=True): chat_doc = gr.Markdown() chatbot = gr.Chatbot(label="Conversation", type="messages", height=320) with gr.Row(): chat_input = gr.Textbox( placeholder="Ask a numerical question about the document above...", show_label=False, scale=8, ) chat_send = gr.Button("Send", scale=1, variant="primary") chat_clear = gr.Button("Clear", scale=1) chat_dropdown.change(chat_load_doc, [chat_dropdown], [chat_doc, chatbot]) chat_input.submit(chat_respond, [chat_input, chatbot, chat_dropdown], [chatbot, chat_input]) chat_send.click(chat_respond, [chat_input, chatbot, chat_dropdown], [chatbot, chat_input]) chat_clear.click(lambda: [], None, chatbot) demo.load(chat_load_doc, [chat_dropdown], [chat_doc, chatbot]) if __name__ == "__main__": demo.launch()