Spaces:
Sleeping
Sleeping
ui: warn when a record's table has cleaning artefacts (non-numeric cells, duplicate columns)
e7e92c8 verified | """ConvFinQA agent demo, hosted on Hugging Face Spaces with ZeroGPU. | |
| Two modes: | |
| - Replay: pick one of the 421 dev records and watch the fine-tuned agent | |
| drive through every turn, showing predicted vs gold answers per turn. | |
| - Chat: pick a record's document, then ask your own free-form numerical | |
| questions over it. | |
| The agent loop, system prompt, tool-call parser, and calculator are the | |
| same code that ships in the model card's reference loop, so this Space | |
| runs the model the way the README says it should. | |
| """ | |
| import ast | |
| import json | |
| import re | |
| import gradio as gr | |
| import spaces | |
| import torch | |
| from huggingface_hub import hf_hub_download | |
| from transformers import AutoModelForImageTextToText, AutoTokenizer | |
# Hub repo id passed to both the tokenizer and the model loader.
MODEL_ID = "sharick008/convfinqa-qwen3.5-4b"
# Hub dataset repo (downloaded with repo_type="dataset") and the JSON file inside it.
DATASET_REPO = "sharick008/convfinqa"
DATASET_FILENAME = "convfinqa_dataset.json"
| SYSTEM_PROMPT_PREFIX = """You are a financial analyst reading one page from a 10-K annual report filed | |
| with the U.S. Securities and Exchange Commission. Your job is to answer | |
| numerical questions about this page. | |
| ## Conversation format | |
| Each turn asks one question. Later turns in the same conversation may | |
| reference earlier answers (for example "the difference", "that amount"). | |
| Reuse numbers you have already derived. | |
| ## Tools | |
| - `calculate(expression)`: evaluate an arithmetic expression. Call this | |
| for every +, -, *, /, ** or percentage step. Copy the returned number | |
| through verbatim; it is already at the precision the grader expects. | |
| - `submit_answer(value, unit)`: submit the final answer. Call this once | |
| you have the answer for the current question. The conversation does | |
| not advance until you call `submit_answer`. | |
| ## Table units and scale | |
| Tables often state their scale in headers like | |
| "(in millions, except per share data)" or "(amounts in thousands)". | |
| When the question asks for a value that appears in such a table, | |
| return the cell value AS WRITTEN. Do NOT multiply it out into raw | |
| units. Gold answers in this dataset are in the table's stated scale. | |
| Example: if the table caption says "(in millions)" and the cell | |
| shows 29,500, the answer is 29500 (unit `absolute`), not | |
| 29,500,000,000. | |
| If a calculation combines two values from a "(in millions)" table, | |
| the magnitude is preserved: both inputs are in millions and so is the | |
| result. | |
| ## Answer units | |
| When you call `submit_answer`, choose the `unit` that matches your | |
| `value`: | |
| - `fraction`: a decimal ratio, e.g. 0.14136 for 14.136%. | |
| - `percent`: a percentage value, e.g. 14.136 for 14.136%. | |
| - `absolute`: a raw count or currency amount with no unit symbol, | |
| e.g. 206588. | |
| - `count`: a whole-number count, e.g. 4. | |
| - `yes_no`: the string 'yes' or 'no'. | |
| ## Worked examples | |
| Percentage change, answered as a fraction. | |
| Question: net cash was 206588 in 2009 and 181001 in 2008; what is the | |
| percentage change? Call `calculate("(206588 - 181001) / 181001")` | |
| which returns 0.14136. Then call | |
| `submit_answer(value=0.14136, unit="fraction")`. | |
| Ratio, answered as a percent. | |
| Question: what amortisation rate does an 8-year useful life represent? | |
| Call `calculate("100 / 8")` which returns 12.5. Then call | |
| `submit_answer(value=12.5, unit="percent")`. | |
| Raw currency amount, no arithmetic needed. | |
| Question: what long-term debt matures in 2017? Read the cell directly | |
| from the table and call `submit_answer(value=307403, unit="absolute")`. | |
| Scaled-table value, no arithmetic. | |
| Question: what was Net cash from financing activities in 2014? | |
| Table caption says "(in millions)" and the cell reads 29,500. Call | |
| `submit_answer(value=29500, unit="absolute")`. Do NOT submit | |
| 29500000000 or 29.5 billion. Keep the value in the table's stated | |
| scale. | |
| Boolean. | |
| Question: is net income higher in 2009 than in 2008? Call | |
| `calculate("103102 - 104222")` which returns -1120. Then call | |
| `submit_answer(value="no", unit="yes_no")`. | |
| Unit mismatch. | |
| For the percentage-change question above, do not submit value=14.136 | |
| with unit="absolute". That would be flagged as a unit mismatch. | |
| Either submit 0.14136 as `fraction`, or 14.136 as `percent`. | |
| ## Document | |
| """ | |
def render_document(pre_text, table, post_text):
    """Build the tagged document string appended to the system prompt.

    ``table`` is read as a mapping of column name -> {row label -> cell}.
    Columns keep the mapping's order; rows are the union of every column's
    row labels in first-seen order.  A cell missing from a column renders as
    an empty string.

    Returns the pre-text, a pipe-delimited table and the post-text, each
    wrapped in its own tag inside a ``<document>`` element.
    """
    headers = list(table)

    # A dict doubles as an insertion-ordered set of row labels.
    row_labels = {}
    for column in table.values():
        for label in column:
            row_labels.setdefault(label, None)

    lines = [
        "| | " + " | ".join(headers) + " |",
        "|---" * (len(headers) + 1) + "|",
    ]
    for label in row_labels:
        rendered_cells = " | ".join(str(table[h].get(label, "")) for h in headers)
        lines.append(f"| {label} | " + rendered_cells + " |")
    table_md = "\n".join(lines)

    return (
        f"<document>\n<pre_text>\n{pre_text}\n</pre_text>\n"
        f"<table>\n{table_md}\n</table>\n"
        f"<post_text>\n{post_text}\n</post_text>\n</document>"
    )
def render_document_markdown(record):
    """Render a record's document as Markdown for display in the UI.

    Produces the same pipe-delimited table layout as the model-facing
    renderer, but with bold section labels instead of tags.  When the
    record's ``features`` mapping sets ``has_non_numeric_values`` or
    ``has_duplicate_columns``, an italic note describing the flagged
    artefacts is placed before the document.
    """
    doc = record["doc"]
    flags = record.get("features", {})
    table = doc["table"]
    headers = list(table)

    # A dict doubles as an insertion-ordered set of row labels.
    row_labels = {}
    for column in table.values():
        for label in column:
            row_labels.setdefault(label, None)

    lines = [
        "| | " + " | ".join(headers) + " |",
        "|---" * (len(headers) + 1) + "|",
    ]
    for label in row_labels:
        rendered_cells = " | ".join(str(table[h].get(label, "")) for h in headers)
        lines.append(f"| {label} | " + rendered_cells + " |")

    # (feature flag, human-readable description) in the order they are reported.
    caveat_texts = (
        (
            "has_non_numeric_values",
            "non-numeric cells (the upstream cleaner sometimes folds first-row "
            "values into the column-name string)",
        ),
        (
            "has_duplicate_columns",
            "duplicate column headers not fully disambiguated by cleaning",
        ),
    )
    caveats = [text for flag, text in caveat_texts if flags.get(flag)]

    if caveats:
        warning = (
            "_Note: this record's `features` flags it as having "
            + " and ".join(caveats)
            + ". The table below is exactly what the model sees, artefacts and all._\n\n"
        )
    else:
        warning = ""

    sections = [
        warning,
        "**Pre-text:** \n" + doc["pre_text"],
        "\n\n**Table:** \n" + "\n".join(lines),
        "\n\n**Post-text:** \n" + doc["post_text"],
    ]
    return "".join(sections)
# One whole tool-call element: group 1 is the function name, group 2 the inner body.
TOOL_CALL_RE = re.compile(
    r"<tool_call>\s*<function=([^>]+)>\s*(.*?)\s*</function>\s*</tool_call>",
    flags=re.DOTALL,
)
# One parameter element inside a call body: group 1 is the name, group 2 the value.
PARAM_RE = re.compile(
    r"<parameter=([^>]+)>\s*(.*?)\s*</parameter>",
    flags=re.DOTALL,
)


def parse_tool_calls(text):
    """Extract every tool call from a model reply, in order of appearance.

    Returns a list of ``{"name": str, "arguments": {param: value}}`` dicts.
    Names and values are whitespace-stripped and values stay as strings.
    Text with no tool-call element yields an empty list.
    """
    calls = []
    for call_match in TOOL_CALL_RE.finditer(text):
        function_name, body = call_match.groups()
        arguments = {}
        for key, value in PARAM_RE.findall(body):
            arguments[key.strip()] = value.strip()
        calls.append({"name": function_name.strip(), "arguments": arguments})
    return calls
# Whitelisted operator node types; every other AST construct is rejected.
_BIN = (ast.Add, ast.Sub, ast.Mult, ast.Div, ast.Pow)
_UN = (ast.UAdd, ast.USub)


def _eval_node(node):
    """Recursively evaluate a whitelisted arithmetic AST node to a float.

    Only numeric literals, unary +/- and the binary operators in ``_BIN``
    are accepted.

    Raises:
        ValueError: for a non-numeric literal, a disallowed construct, or a
            power whose result is not a real number.
        ZeroDivisionError, OverflowError: propagated from the arithmetic.
    """
    if isinstance(node, ast.Constant):
        # bool is a subclass of int, so it has to be rejected explicitly.
        if isinstance(node.value, bool) or not isinstance(node.value, (int, float)):
            raise ValueError(f"non-numeric literal: {node.value!r}")
        return float(node.value)
    if isinstance(node, ast.UnaryOp) and isinstance(node.op, _UN):
        operand = _eval_node(node.operand)
        return -operand if isinstance(node.op, ast.USub) else +operand
    if isinstance(node, ast.BinOp) and isinstance(node.op, _BIN):
        lhs, rhs = _eval_node(node.left), _eval_node(node.right)
        if isinstance(node.op, ast.Add):
            return lhs + rhs
        if isinstance(node.op, ast.Sub):
            return lhs - rhs
        if isinstance(node.op, ast.Mult):
            return lhs * rhs
        if isinstance(node.op, ast.Div):
            return lhs / rhs
        result = lhs ** rhs
        # A negative base with a fractional exponent yields a complex number
        # in Python; reject it here instead of letting it flow through further
        # arithmetic and fail later with an unrelated TypeError.
        if isinstance(result, complex):
            raise ValueError(f"non-real result: {lhs!r} ** {rhs!r}")
        return result
    raise ValueError(f"disallowed construct: {type(node).__name__}")


def calculate(expression):
    """Evaluate an arithmetic expression and round the result to 5 decimals.

    Commas are removed before parsing, so thousands separators such as
    ``"1,000"`` are accepted.  The expression is parsed with ``ast`` and
    walked by ``_eval_node``; it is never passed to ``eval``.

    Raises:
        SyntaxError: if the text is not a valid Python expression.
        ValueError: if it uses anything beyond numeric literals and
            ``+ - * / **``, or if a power has no real result.
        ZeroDivisionError, OverflowError: propagated from the arithmetic.
    """
    cleaned = str(expression).replace(",", "").strip()
    tree = ast.parse(cleaned, mode="eval")
    return round(float(_eval_node(tree.body)), 5)
# Fetch the dataset JSON from the Hub at import time.
_dataset_path = hf_hub_download(
    repo_id=DATASET_REPO, filename=DATASET_FILENAME, repo_type="dataset"
)
# Be explicit about the encoding instead of relying on the platform default.
with open(_dataset_path, encoding="utf-8") as _f:
    DATASET = json.load(_f)

# Index the "dev" split by record id; the sorted ids populate the UI dropdowns.
DEV_RECORDS = {r["id"]: r for r in DATASET["dev"]}
DEV_IDS = sorted(DEV_RECORDS)

# The tokenizer is loaded eagerly; the model itself is loaded lazily on first use.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Token id used as both end-of-sequence and padding during generation.
IM_END_ID = tokenizer.convert_tokens_to_ids("<|im_end|>")
# Module-level cache for the loaded model; filled on first use.
_model = None


def _ensure_model():
    """Return the cached model, loading it in bfloat16 onto CUDA on first call."""
    global _model
    if _model is not None:
        return _model
    _model = AutoModelForImageTextToText.from_pretrained(
        MODEL_ID,
        dtype=torch.bfloat16,
        device_map="cuda",
    )
    _model.eval()
    return _model
def _run_turn_gpu(messages, max_iterations=6, max_new_tokens=512):
    """One question turn until submit_answer or budget exhausts. Runs on ZeroGPU.

    Each round renders ``messages`` through the chat template, generates
    greedily, and dispatches the tool calls found in the reply.  ``messages``
    is extended in place with every assistant reply, tool result and nudge,
    so a caller that reuses the list carries the transcript into later turns.

    Calculator failures and unknown tool names are reported back to the model
    as tool results instead of aborting the turn, so it can correct itself
    within the remaining budget.

    Args:
        messages: chat transcript ending with the current user question.
        max_iterations: number of generation rounds allowed for this turn.
        max_new_tokens: per-round cap passed to ``generate``.

    Returns:
        The ``arguments`` dict of the first ``submit_answer`` call; values are
        the raw strings extracted by ``parse_tool_calls``.

    Raises:
        RuntimeError: if no ``submit_answer`` call appears within the budget.

    NOTE(review): no ``@spaces.GPU`` decorator is visible on this function in
    the reviewed text even though ``spaces`` is imported -- confirm that it is
    applied, since the model is loaded onto CUDA from inside this call.
    """
    model = _ensure_model()
    for _ in range(max_iterations):
        prompt = tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, enable_thinking=False, tokenize=False
        )
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            eos_token_id=IM_END_ID,
            pad_token_id=IM_END_ID,
        )
        # Decode only the newly generated tokens, not the echoed prompt.
        prompt_len = inputs.input_ids.shape[1]
        text = tokenizer.decode(out[0, prompt_len:], skip_special_tokens=True)
        messages.append({"role": "assistant", "content": text})

        calls = parse_tool_calls(text)
        if not calls:
            # The model answered in prose; nudge it towards the required tool call.
            messages.append(
                {"role": "user", "content": "Please call submit_answer with the final answer for the current question."}
            )
            continue

        for call in calls:
            name = call["name"]
            if name == "submit_answer":
                return call["arguments"]
            if name == "calculate":
                try:
                    result = calculate(call["arguments"]["expression"])
                except (KeyError, SyntaxError, ValueError, TypeError, ArithmeticError) as exc:
                    # Missing parameter, malformed or disallowed expression,
                    # division by zero, overflow: let the model see the problem.
                    result = f"error: {type(exc).__name__}: {exc}"
            else:
                result = f"error: unknown tool {name!r}"
            messages.append({"role": "tool", "content": str(result)})
    raise RuntimeError("no submit_answer after max_iterations")
def grade(pred_value, pred_unit, gold):
    """Compare a predicted answer with the gold answer.

    Returns ``(correct, hint)``; ``hint`` is a short reason when the
    prediction could not be compared, and an empty string otherwise.

    A string ``gold`` is treated as a yes/no answer: the prediction must use
    unit ``yes_no`` and match case-insensitively after stripping.  Any other
    gold is compared numerically: a ``percent`` prediction is divided by 100,
    ``fraction`` / ``count`` / ``absolute`` are used as given, and the value
    is rounded to 5 decimals before being compared with ``float(gold)``.
    """
    if isinstance(gold, str):
        if pred_unit == "yes_no":
            same = str(pred_value).strip().lower() == gold.strip().lower()
            return same, ""
        return False, "expected yes_no"

    try:
        number = float(pred_value)
    except (TypeError, ValueError):
        return False, "non-numeric"

    if pred_unit == "percent":
        number = number / 100
    elif pred_unit not in ("fraction", "count", "absolute"):
        return False, f"unknown unit {pred_unit!r}"

    difference = abs(round(number, 5) - float(gold))
    return difference < 1e-9, ""
def replay_record(record_id, progress=gr.Progress()):
    """Run the agent through every question of one dev record and grade each turn.

    Returns ``(document_markdown, rows, summary)``.  ``rows`` holds one
    ``[turn index, question, predicted, gold, verdict]`` list per question.
    With no record selected it returns a prompt message, ``None`` and a dash.
    A turn that raises is recorded with verdict ``"error"`` and the loop
    moves on to the next question using the same transcript.
    """
    if not record_id:
        return "Pick a record from the dropdown.", None, "—"

    record = DEV_RECORDS[record_id]
    doc_md = render_document_markdown(record)

    doc = record["doc"]
    system_text = SYSTEM_PROMPT_PREFIX + render_document(
        doc["pre_text"], doc["table"], doc["post_text"]
    )
    # One transcript is shared by all turns so later questions see earlier ones.
    messages = [{"role": "system", "content": system_text}]

    dialogue = record["dialogue"]
    questions = dialogue["conv_questions"]
    golds = dialogue["executed_answers"]

    rows = []
    n_correct = 0
    for turn, question in enumerate(progress.tqdm(questions, desc="Running turns")):
        messages.append({"role": "user", "content": question})
        try:
            answer = _run_turn_gpu(messages)
            value = answer.get("value", "")
            unit = answer.get("unit", "")
            predicted = f"{value} ({unit})"
            ok, hint = grade(value, unit, golds[turn])
            if ok:
                n_correct += 1
                verdict = "correct"
            elif hint:
                verdict = "wrong — " + hint
            else:
                verdict = "wrong"
        except Exception as e:
            predicted = f"FAILED: {e}"
            verdict = "error"
        rows.append([turn, question, predicted, str(golds[turn]), verdict])

    summary = f"{n_correct}/{len(questions)} correct on this record."
    return doc_md, rows, summary
def chat_load_doc(record_id):
    """Return ``(document_markdown, empty_history)`` for the selected record.

    The empty list is the second output, so the caller's conversation is
    reset whenever a document is (re)loaded.  With no record selected the
    first element is a prompt to pick one.
    """
    if record_id:
        return render_document_markdown(DEV_RECORDS[record_id]), []
    return "Pick a record from the dropdown.", []
def chat_respond(user_msg, history, record_id):
    """Answer one free-form chat question about the selected document.

    Returns ``(new_history, "")``; the empty string is the second output,
    which the UI wires to the input textbox.  ``history`` is a list of
    ``{"role", "content"}`` messages and is never mutated.

    With no record selected, a reminder is appended instead of an answer.
    A blank message leaves the history unchanged.  Any exception raised
    while running the agent is shown as the assistant's reply.
    """
    if not record_id:
        reminder = [
            {"role": "user", "content": user_msg},
            {"role": "assistant", "content": "Please pick a document from the dropdown first."},
        ]
        return history + reminder, ""
    if not (user_msg and user_msg.strip()):
        return history, ""

    doc = DEV_RECORDS[record_id]["doc"]
    sys_content = SYSTEM_PROMPT_PREFIX + render_document(
        doc["pre_text"], doc["table"], doc["post_text"]
    )

    # Rebuild the transcript from the visible chat, keeping only user and assistant turns.
    messages = [{"role": "system", "content": sys_content}]
    messages.extend(
        {"role": past["role"], "content": past["content"]}
        for past in history
        if past["role"] in ("user", "assistant")
    )
    messages.append({"role": "user", "content": user_msg})

    try:
        answer = _run_turn_gpu(messages)
        value = answer.get("value", "")
        unit = answer.get("unit", "")
        reply = f"**{value}** _(unit: {unit})_"
    except Exception as e:
        reply = f"_Failed: {e}_"

    exchange = [
        {"role": "user", "content": user_msg},
        {"role": "assistant", "content": reply},
    ]
    return history + exchange, ""
# UI definition. Components are laid out in the order they are created, so
# statement order inside each `with` block is significant.
with gr.Blocks(title="ConvFinQA agent") as demo:
    gr.Markdown(
        "# ConvFinQA agent\n"
        "Fine-tuned [Qwen/Qwen3.5-4B](https://huggingface.co/Qwen/Qwen3.5-4B) "
        "(adapter at [sharick008/convfinqa-qwen3.5-4b-lora](https://huggingface.co/sharick008/convfinqa-qwen3.5-4b-lora), "
        "merged at [sharick008/convfinqa-qwen3.5-4b](https://huggingface.co/sharick008/convfinqa-qwen3.5-4b)) "
        "answering multi-turn numerical questions over single-page financial documents from "
        "[ConvFinQA](https://aclanthology.org/2022.emnlp-main.421/) (Chen et al., 2022)."
        "\n\n"
        "First call wakes the GPU and downloads weights — expect ~60s. Subsequent turns are fast."
    )
    with gr.Tabs():
        # Replay tab: run the agent over a record's own questions and grade each turn.
        with gr.Tab("Replay"):
            gr.Markdown(
                "Pick one of the 421 dev records. The agent runs through every turn of "
                "the original conversation and shows the predicted answer next to the "
                "dataset's gold."
            )
            replay_dropdown = gr.Dropdown(
                DEV_IDS, label=f"Dev record ({len(DEV_IDS)} available)", value=DEV_IDS[0]
            )
            replay_run_btn = gr.Button("Run agent through all turns", variant="primary")
            replay_summary = gr.Markdown()
            replay_results = gr.Dataframe(
                headers=["turn", "question", "predicted", "gold", "verdict"],
                wrap=True,
                column_widths=["5%", "45%", "15%", "15%", "20%"],
            )
            with gr.Accordion("Document", open=False):
                replay_doc = gr.Markdown()
            # Output order matches replay_record's return: (document, rows, summary).
            replay_run_btn.click(
                replay_record,
                inputs=[replay_dropdown],
                outputs=[replay_doc, replay_results, replay_summary],
            )
        # Chat tab: free-form questions over a chosen record's document.
        with gr.Tab("Chat"):
            gr.Markdown(
                "Pick a document, then ask any numerical question over it. Answers come "
                "back as `value (unit)` from the agent's `submit_answer` call."
            )
            chat_dropdown = gr.Dropdown(DEV_IDS, label="Pick a document", value=DEV_IDS[0])
            with gr.Accordion("Document", open=True):
                chat_doc = gr.Markdown()
            chatbot = gr.Chatbot(label="Conversation", type="messages", height=320)
            with gr.Row():
                chat_input = gr.Textbox(
                    placeholder="Ask a numerical question about the document above...",
                    show_label=False,
                    scale=8,
                )
                chat_send = gr.Button("Send", scale=1, variant="primary")
                chat_clear = gr.Button("Clear", scale=1)
            # Switching documents re-renders the document and resets the conversation
            # (chat_load_doc returns an empty history as its second output).
            chat_dropdown.change(chat_load_doc, [chat_dropdown], [chat_doc, chatbot])
            # Enter in the textbox and the Send button share one handler; its second
            # output ("") clears the textbox.
            chat_input.submit(chat_respond, [chat_input, chatbot, chat_dropdown], [chatbot, chat_input])
            chat_send.click(chat_respond, [chat_input, chatbot, chat_dropdown], [chatbot, chat_input])
            chat_clear.click(lambda: [], None, chatbot)
    # Render the Chat tab's document for the dropdown's initial value when the app loads.
    demo.load(chat_load_doc, [chat_dropdown], [chat_doc, chatbot])
# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()