Spaces:
Running
Running
| """Task definitions with PIL-generated synthetic document images and ground truth.""" | |
| from __future__ import annotations | |
| import base64 | |
| import io | |
| import math | |
| import random | |
| try: | |
| from PIL import Image, ImageDraw, ImageFont, ImageFilter | |
| PIL_AVAILABLE = True | |
| except ImportError: | |
| PIL_AVAILABLE = False | |
| # --------------------------------------------------------------------------- | |
| # Helpers | |
| # --------------------------------------------------------------------------- | |
def _b64_png(img) -> str:
    """Encode *img* as PNG and return the bytes as a base64 text string.

    Args:
        img: Any object exposing ``save(fileobj, format=...)``; in this
            module that is a PIL image.

    Returns:
        The base64 representation of the PNG data, decoded to ``str``.
    """
    with io.BytesIO() as stream:
        img.save(stream, format="PNG")
        png_bytes = stream.getvalue()
    encoded = base64.b64encode(png_bytes)
    return encoded.decode("utf-8")
def _try_font(size: int = 14):
    """Return a usable font object, or ``None`` when Pillow is unavailable.

    The candidates are tried in order: ``arial.ttf``, then the DejaVu Sans
    path used on many Linux systems.  If neither can be loaded the Pillow
    built-in default font is returned; note that this fallback is requested
    without a size, so ``size`` only applies to the TrueType candidates.

    Args:
        size: Point size passed to ``ImageFont.truetype``.
    """
    if not PIL_AVAILABLE:
        return None

    candidates = (
        "arial.ttf",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
    )
    for font_path in candidates:
        try:
            return ImageFont.truetype(font_path, size)
        except Exception:
            # Deliberately best-effort: any failure moves on to the next font.
            continue
    return ImageFont.load_default()
def _add_noise(img, intensity: float = 30.0, rng=None):
    """Add Gaussian noise to a three-channel image in place and return it.

    One sample with mean 0 and standard deviation ``intensity`` is drawn per
    pixel, truncated to an integer, and added to all three channels of that
    pixel (so the noise shifts brightness rather than colour).  Every channel
    is clamped to the 0..255 range.

    Args:
        img: Image whose ``load()`` pixel access yields ``(r, g, b)`` tuples
            and whose ``size`` is ``(width, height)``.  Other pixel layouts
            are not supported because each pixel is unpacked into exactly
            three channels.
        intensity: Standard deviation of the noise distribution.
        rng: Optional generator exposing ``gauss(mu, sigma)``, e.g. a seeded
            ``random.Random``.  Pass one to obtain reproducible noise; when
            omitted the shared module-level ``random`` generator is used.

    Returns:
        The same ``img`` object that was passed in.
    """
    # Use the module-level ``random`` import instead of re-importing locally.
    gauss = (random if rng is None else rng).gauss
    pixels = img.load()
    width, height = img.size
    for x in range(width):
        for y in range(height):
            r, g, b = pixels[x, y]
            # A single draw per pixel keeps the three channels in step.
            offset = int(gauss(0, intensity))
            pixels[x, y] = (
                max(0, min(255, r + offset)),
                max(0, min(255, g + offset)),
                max(0, min(255, b + offset)),
            )
    return img
def _rotate_image(img, degrees: float):
    """Return ``img.rotate(...)`` by *degrees* with an expanded canvas.

    The rotation is requested with ``expand=True`` and the uncovered area is
    filled with white.
    """
    white = (255, 255, 255)
    rotated = img.rotate(degrees, expand=True, fillcolor=white)
    return rotated
| # --------------------------------------------------------------------------- | |
| # Task 1 — Clean Printed Table | |
| # --------------------------------------------------------------------------- | |
# Ground truth for task 1: one clean sales table plus the KPIs derived from it.
# All values are strings because they are rendered and compared as text.
TASK1_DATA = {
    "headers": ["Product", "Q1 Sales", "Q2 Sales", "Q3 Sales", "Q4 Sales"],
    "rows": [
        ["Widget A", "1200", "1350", "1100", "1500"],
        ["Widget B", "800", "950", "870", "920"],
        ["Widget C", "2100", "2300", "2050", "2400"],
        ["Widget D", "450", "480", "510", "490"],
        ["Widget E", "3200", "3100", "3300", "3500"],
        ["Widget F", "670", "720", "690", "750"],
        ["Widget G", "1800", "1950", "1870", "2000"],
    ],
    "kpis": {
        # Sum of the "Q1 Sales" column.
        "total_q1": "10220",
        # Sum of the "Q4 Sales" column:
        # 1500 + 920 + 2400 + 490 + 3500 + 750 + 2000 = 11560.
        "total_q4": "11560",
        # Product with the highest sales in every quarter.
        "best_product": "Widget E",
    },
}
def _md_from_data(data: dict) -> str:
    """Render a table description as a Markdown pipe table.

    Args:
        data: Mapping with a ``"headers"`` list of strings and a ``"rows"``
            list of string lists.

    Returns:
        The header line, a ``---`` separator line with one cell per header,
        and one line per row, joined with newlines (no trailing newline).
    """

    def as_line(cells) -> str:
        return "| " + " | ".join(cells) + " |"

    column_names = data["headers"]
    body_rows = data["rows"]
    table_lines = [as_line(column_names), as_line(["---"] * len(column_names))]
    table_lines.extend(as_line(cells) for cells in body_rows)
    return "\n".join(table_lines)
def generate_task1() -> dict:
    """Build task 1: a clean, un-degraded rendering of the sales table.

    Returns:
        A dict with the keys ``image_b64`` (base64 PNG, or ``None`` when
        Pillow is not installed), ``text_hint`` (the exact ground-truth
        Markdown for this task), ``gt_md``, ``gt_kpis``, ``gt_cells`` (empty),
        ``max_steps`` (5) and ``task_id`` (1).
    """
    markdown = _md_from_data(TASK1_DATA)
    kpis = TASK1_DATA["kpis"]
    task = {
        "image_b64": None,
        "text_hint": markdown,  # clean hint for task 1
        "gt_md": markdown,
        "gt_kpis": kpis,
        "gt_cells": {},
        "max_steps": 5,
        "task_id": 1,
    }
    if not PIL_AVAILABLE:
        # Text-only fallback: same payload, no image.
        return task

    column_names = TASK1_DATA["headers"]
    table_rows = TASK1_DATA["rows"]

    widths = [110, 80, 80, 80, 80]
    row_height = 28
    margin = 20
    # Left edge of every column, measured from the left margin.
    lefts = [margin + sum(widths[:i]) for i in range(len(widths))]

    # Two extra row slots: one for the title, one for the header row.
    canvas_w = sum(widths) + margin * 2
    canvas_h = row_height * (len(table_rows) + 2) + margin * 2
    canvas = Image.new("RGB", (canvas_w, canvas_h), (255, 255, 255))
    pen = ImageDraw.Draw(canvas)
    # Title and cells use the same 13pt font.
    font = _try_font(13)

    # Title
    pen.text(
        (margin, margin),
        "Sales Report — Annual Summary",
        fill=(30, 30, 30),
        font=font,
    )
    top = margin + row_height

    # Header row: white text on solid blue cells.
    for left, cell_w, label in zip(lefts, widths, column_names):
        pen.rectangle(
            [left, top, left + cell_w, top + row_height],
            fill=(50, 100, 180),
        )
        pen.text((left + 4, top + 6), label, fill=(255, 255, 255), font=font)
    top += row_height

    # Data rows with alternating background shades.
    for index, cells in enumerate(table_rows):
        shade = (255, 255, 255) if index % 2 else (240, 245, 255)
        for left, cell_w, value in zip(lefts, widths, cells):
            pen.rectangle(
                [left, top, left + cell_w, top + row_height],
                fill=shade,
                outline=(200, 200, 200),
            )
            pen.text((left + 4, top + 7), value, fill=(20, 20, 20), font=font)
        top += row_height

    # KPI summary line directly under the table.
    top += 5
    summary = f"Total Q1: {kpis['total_q1']} | Total Q4: {kpis['total_q4']} | Best: {kpis['best_product']}"
    pen.text((margin, top), summary, fill=(80, 80, 80), font=font)

    task["image_b64"] = _b64_png(canvas)
    return task
| # --------------------------------------------------------------------------- | |
| # Task 2 — Noisy Financial Statement | |
| # --------------------------------------------------------------------------- | |
# Ground truth for task 2: a five-column financial statement, its KPIs and a
# set of flagged cells.
TASK2_DATA = {
    "headers": ["Metric", "FY2022", "FY2023", "FY2024", "YoY%"],
    "rows": [
        ["Revenue", "$4,200K", "$5,100K", "$6,300K", "+23.5%"],
        ["Gross Profit", "$1,890K", "$2,346K", "$3,024K", "+28.9%"],
        ["EBITDA", "$840K", "$1,020K", "$1,386K", "+35.9%"],
        ["Net Income", "$504K", "$663K", "$945K", "+42.5%"],
        ["EPS", "$1.26", "$1.66", "$2.36", "+42.2%"],
    ],
    "kpis": {
        "revenue_fy2024": "$6,300K",
        # FY2024 EBITDA over FY2024 revenue: 1,386 / 6,300.
        "ebitda_margin": "22.0%",
        # YoY% of the Revenue row.
        "yoy_growth_pct": "+23.5%",
        "net_income_fy2024": "$945K",
    },
    # Keys are "(row,col)" strings; presumably these index the cells that are
    # scored individually -- confirm against the grader.
    "gt_cells": {
        f"({row},{col})": True
        for row, col in (
            (0, 0), (0, 1), (0, 2),
            (1, 0), (1, 3),
            (2, 0), (2, 3),
        )
    },
}
def generate_task2() -> dict:
    """Build task 2: a noisy, slightly rotated financial statement.

    The text hint is the ground-truth Markdown with a fixed set of character
    substitutions applied to imitate OCR mistakes.  When Pillow is available
    the table is also rendered, then degraded with pixel noise
    (``intensity=18``), a 0.4-radius Gaussian blur and a -1.5 degree rotation.

    Returns:
        A dict with the keys ``image_b64`` (base64 PNG, or ``None`` without
        Pillow), ``text_hint``, ``gt_md``, ``gt_kpis``, ``gt_cells``,
        ``max_steps`` (10) and ``task_id`` (2).
    """
    markdown = _md_from_data(TASK2_DATA)

    # Noisy text hint — simulate OCR errors (applied in this exact order).
    garbled = markdown
    for clean, misread in (("$", "S"), ("%", "°/o"), ("K", "lK"), ("+", "÷")):
        garbled = garbled.replace(clean, misread)

    task = {
        "image_b64": None,
        "text_hint": garbled,
        "gt_md": markdown,
        "gt_kpis": TASK2_DATA["kpis"],
        "gt_cells": TASK2_DATA["gt_cells"],
        "max_steps": 10,
        "task_id": 2,
    }
    if not PIL_AVAILABLE:
        return task

    column_names = TASK2_DATA["headers"]
    table_rows = TASK2_DATA["rows"]

    widths = [120, 80, 80, 80, 70]
    row_height = 30
    margin = 20
    lefts = [margin + sum(widths[:i]) for i in range(len(widths))]

    # Three extra row slots: title, group caption and header row.
    canvas_w = sum(widths) + margin * 2
    canvas_h = row_height * (len(table_rows) + 3) + margin * 2
    canvas = Image.new("RGB", (canvas_w, canvas_h), (252, 252, 248))
    pen = ImageDraw.Draw(canvas)
    font = _try_font(13)

    pen.text(
        (margin, margin),
        "Annual Financial Statement — CONFIDENTIAL",
        fill=(20, 20, 80),
        font=font,
    )
    top = margin + row_height

    # Two-level header simulation: a caption half a row above the real header.
    pen.text((margin, top), "Financials (USD)", fill=(100, 100, 100), font=font)
    top += row_height // 2
    for left, cell_w, label in zip(lefts, widths, column_names):
        pen.rectangle(
            [left, top, left + cell_w, top + row_height],
            fill=(30, 60, 120),
        )
        pen.text((left + 4, top + 7), label, fill=(255, 255, 255), font=font)
    top += row_height

    for index, cells in enumerate(table_rows):
        shade = (255, 255, 255) if index % 2 else (245, 248, 255)
        for left, cell_w, value in zip(lefts, widths, cells):
            pen.rectangle(
                [left, top, left + cell_w, top + row_height],
                fill=shade,
                outline=(180, 180, 200),
            )
            # Green for values containing "+", red for "-", near-black otherwise.
            if "+" in value:
                ink = (0, 120, 0)
            elif "-" in value:
                ink = (180, 20, 20)
            else:
                ink = (20, 20, 20)
            pen.text((left + 4, top + 8), value, fill=ink, font=font)
        top += row_height

    # Add noise + slight rotation
    canvas = _add_noise(canvas, intensity=18)
    canvas = canvas.filter(ImageFilter.GaussianBlur(radius=0.4))
    canvas = _rotate_image(canvas, -1.5)

    task["image_b64"] = _b64_png(canvas)
    return task
| # --------------------------------------------------------------------------- | |
| # Task 3 — Degraded Multi-Table Report | |
| # --------------------------------------------------------------------------- | |
# Ground truth for task 3: two independent tables, each with its own KPIs.
TASK3_DATA = {
    # Operational KPIs; "Variance" is Actual minus Target.
    "table1": {
        "headers": ["KPI", "Target", "Actual", "Variance"],
        "rows": [
            ["Customer Satisfaction", "90%", "87%", "-3%"],
            ["Ticket Resolution Rate", "95%", "93%", "-2%"],
            ["Avg Handle Time", "4.5min", "5.1min", "+0.6min"],
            ["First Call Resolution", "80%", "78%", "-2%"],
        ],
        "kpis": {
            "csat_actual": "87%",
            "ticket_resolution": "93%",
            "avg_handle_time": "5.1min",
        },
    },
    # Budget summary; "Remaining" is Budget minus Spent.
    "table2": {
        "headers": ["Category", "Budget", "Spent", "Remaining"],
        "rows": [
            ["Operations", "$500K", "$472K", "$28K"],
            ["Marketing", "$200K", "$198K", "$2K"],
            ["R&D", "$300K", "$275K", "$25K"],
            ["Support", "$150K", "$163K", "-$13K"],
        ],
        "kpis": {
            "operations_budget": "$500K",
            "support_overspend": "-$13K",
            # 28 + 2 + 25 - 13 = 42
            "total_remaining": "$42K",
        },
    },
}
def generate_task3() -> dict:
    """Build task 3: two tables on one heavily degraded, rotated page.

    The ground-truth Markdown is both tables joined by a ``---`` line.  The
    text hint is that Markdown with a fixed set of character substitutions
    applied.  When Pillow is available the tables are drawn one under the
    other, separated by a rule, and the page is degraded with pixel noise
    (``intensity=35``), a 0.7-radius Gaussian blur and a 3.5 degree rotation.

    Returns:
        A dict with the keys ``image_b64`` (base64 PNG, or ``None`` without
        Pillow), ``text_hint``, ``gt_md``, ``gt_kpis`` (one entry per table),
        ``gt_cells`` (empty), ``max_steps`` (15) and ``task_id`` (3).
    """
    first = TASK3_DATA["table1"]
    second = TASK3_DATA["table2"]

    markdown = "\n---\n".join((_md_from_data(first), _md_from_data(second)))
    kpis = {
        "table1": first["kpis"],
        "table2": second["kpis"],
    }

    # Very noisy hint: substitutions are applied in this exact order.
    garbled = markdown
    for clean, misread in (
        ("$", "S"),
        ("%", "°/o"),
        ("K", "lK"),
        ("-", "—"),
        ("+", "÷"),
    ):
        garbled = garbled.replace(clean, misread)

    task = {
        "image_b64": None,
        "text_hint": garbled,
        "gt_md": markdown,
        "gt_kpis": kpis,
        "gt_cells": {},
        "max_steps": 15,
        "task_id": 3,
    }
    if not PIL_AVAILABLE:
        return task

    widths_first = [180, 80, 80, 80]
    widths_second = [120, 80, 80, 80]
    row_height = 28
    margin = 20
    canvas_w = max(sum(widths_first), sum(widths_second)) + margin * 2
    canvas_h = (
        row_height * (len(first["rows"]) + len(second["rows"]) + 7)
        + margin * 2
    )
    canvas = Image.new("RGB", (canvas_w, canvas_h), (248, 245, 240))
    pen = ImageDraw.Draw(canvas)
    font = _try_font(12)

    def render_table(table, widths, caption, top):
        """Draw one captioned table starting at *top*; return the next free y."""
        pen.text((margin, top), caption, fill=(50, 30, 10), font=font)
        top += row_height

        lefts = [margin + sum(widths[:i]) for i in range(len(widths))]
        for left, cell_w, label in zip(lefts, widths, table["headers"]):
            pen.rectangle(
                [left, top, left + cell_w, top + row_height],
                fill=(80, 50, 20),
            )
            pen.text((left + 3, top + 7), label, fill=(255, 240, 200), font=font)
        top += row_height

        for index, cells in enumerate(table["rows"]):
            shade = (240, 235, 225) if index % 2 else (250, 245, 235)
            for left, cell_w, value in zip(lefts, widths, cells):
                pen.rectangle(
                    [left, top, left + cell_w, top + row_height],
                    fill=shade,
                    outline=(170, 150, 130),
                )
                pen.text((left + 3, top + 8), value, fill=(30, 20, 10), font=font)
            top += row_height

        # Leave a small gap below the table.
        return top + 10

    cursor = render_table(first, widths_first, "Table 1: Operational KPIs", margin)
    # Horizontal rule between the two tables.
    pen.line(
        [(margin, cursor), (canvas_w - margin, cursor)],
        fill=(100, 80, 60),
        width=2,
    )
    cursor += 10
    render_table(second, widths_second, "Table 2: Financial Summary", cursor)

    # Heavy degradation
    canvas = _add_noise(canvas, intensity=35)
    canvas = canvas.filter(ImageFilter.GaussianBlur(radius=0.7))
    canvas = _rotate_image(canvas, 3.5)

    task["image_b64"] = _b64_png(canvas)
    return task
| # --------------------------------------------------------------------------- | |
| # Public registry | |
| # --------------------------------------------------------------------------- | |
# Maps each task id (the "id" values in TASK_METADATA) to the zero-argument
# function that builds that task's payload.
TASK_REGISTRY = {
    "clean_table": generate_task1,  # easy
    "noisy_financial": generate_task2,  # medium
    "degraded_report": generate_task3,  # hard
}
# Human-readable catalogue of the tasks, ordered from easiest to hardest.
# Each "max_steps" matches the value returned by the corresponding generator.
TASK_METADATA = [
    {
        "id": "clean_table",
        "title": "Clean Printed Table",
        "difficulty": "easy",
        "description": (
            "Extract a clean single-table sales report into Markdown "
            "+ JSON KPIs."
        ),
        "optimal_steps": 2,
        "max_steps": 5,
    },
    {
        "id": "noisy_financial",
        "title": "Noisy Financial Statement",
        "difficulty": "medium",
        "description": (
            "Extract a noisy multi-header financial table with merged "
            "context. Output Markdown + labeled KPIs. Optional per-cell "
            "confidence scoring."
        ),
        "optimal_steps": 4,
        "max_steps": 10,
    },
    {
        "id": "degraded_report",
        "title": "Degraded Multi-Table Report",
        "difficulty": "hard",
        "description": (
            "Extract two tables from a heavily degraded rotated document. "
            "Output separate Markdowns + cross-table KPI JSON."
        ),
        "optimal_steps": 8,
        "max_steps": 15,
    },
]