from __future__ import annotations import csv import importlib import json import os import secrets from datetime import datetime, timezone from pathlib import Path gr = importlib.import_module("gradio") SPACE_ROOT = Path(__file__).resolve().parent DATA_DIR = SPACE_ROOT / "data" RESPONSES_DIR = SPACE_ROOT / "responses" TASK_A_PATH = DATA_DIR / "task_a_items.jsonl" TASK_B_PATH = DATA_DIR / "task_b_pairs.jsonl" TASK_A_RESPONSES = RESPONSES_DIR / "task_a_responses.jsonl" TASK_B_RESPONSES = RESPONSES_DIR / "task_b_responses.jsonl" TASK_A_CSV = RESPONSES_DIR / "task_a_responses.csv" TASK_B_CSV = RESPONSES_DIR / "task_b_responses.csv" TASK_A_LABELS = ["Correct", "Ambiguous", "Incorrect"] TASK_B_LABELS = ["Plausible", "Implausible", "Unclear"] def load_jsonl(path: Path) -> list[dict[str, object]]: if not path.exists(): return [] with path.open() as handle: return [json.loads(line) for line in handle if line.strip()] TASK_A_ITEMS = load_jsonl(TASK_A_PATH) TASK_B_ITEMS = load_jsonl(TASK_B_PATH) def ensure_dirs() -> None: RESPONSES_DIR.mkdir(parents=True, exist_ok=True) def infer_modality(question_id: str) -> str: return question_id.split("_", 1)[0] def candidate_image_roots() -> list[Path]: roots = [] env_root = os.environ.get("STRUCTVIZ_IMAGE_ROOT") if env_root: roots.append(Path(env_root)) roots.extend( [ DATA_DIR / "images", SPACE_ROOT / "benchmark" / "rendered" / "benchmark" / "rendered", SPACE_ROOT / "benchmark" / "rendered", ] ) return roots def resolve_image_path(question_id: str, viz_type: str) -> str | None: modality = infer_modality(question_id) filename = f"{question_id}_{viz_type}.png" for root in candidate_image_roots(): candidate = root / modality / filename if candidate.exists(): return str(candidate) return None def resolve_task_a_image(item: dict[str, object]) -> str | None: preset = item.get("image_path") if isinstance(preset, str) and preset: candidate = SPACE_ROOT / preset if candidate.exists(): return str(candidate) return resolve_image_path(str(item["question_id"]), str(item["viz_type"])) def resolve_task_b_images(item: dict[str, object]) -> tuple[str | None, str | None]: preset_a = item.get("image_a_path") preset_b = item.get("image_b_path") image_a: str | None = None image_b: str | None = None if isinstance(preset_a, str) and preset_a: candidate_a = SPACE_ROOT / preset_a if candidate_a.exists(): image_a = str(candidate_a) if isinstance(preset_b, str) and preset_b: candidate_b = SPACE_ROOT / preset_b if candidate_b.exists(): image_b = str(candidate_b) if image_a is None: image_a = resolve_image_path(str(item["question_id"]), str(item["viz_a"])) if image_b is None: image_b = resolve_image_path(str(item["question_id"]), str(item["viz_b"])) return image_a, image_b def save_record(record: dict[str, object], jsonl_path: Path, csv_path: Path) -> None: ensure_dirs() with jsonl_path.open("a") as handle: handle.write(json.dumps(record, ensure_ascii=True) + "\n") write_header = not csv_path.exists() with csv_path.open("a", newline="") as handle: writer = csv.DictWriter(handle, fieldnames=list(record.keys())) if write_header: writer.writeheader() writer.writerow(record) def progress_text(index: int, total: int) -> str: if total == 0: return "No items loaded" current = min(index + 1, total) return f"Item {current} / {total}" def image_status_text(image_path: str | None, label: str) -> str: if image_path: return f"{label}: image loaded" return f"{label}: image missing in this Space bundle; use question/answer metadata only or upload assets later" def task_a_payload(index): if not TASK_A_ITEMS: return None, "No Task A items found.", "", "", "", "", "", "", "" item = TASK_A_ITEMS[index % len(TASK_A_ITEMS)] image_path = resolve_task_a_image(item) return ( image_path, image_status_text(image_path, "Task A"), progress_text(index, len(TASK_A_ITEMS)), str(item["question_id"]), str(item["question"]), str(item["answer"]), f"{item['modality']} / {item['difficulty']} / {item['source']}", str(item["viz_type"]), "", ) def task_b_payload(index): if not TASK_B_ITEMS: return None, None, "No Task B items found.", "", "", "", "", "", "" item = TASK_B_ITEMS[index % len(TASK_B_ITEMS)] image_a, image_b = resolve_task_b_images(item) return ( image_a, image_b, f"{image_status_text(image_a, 'Task B image A')} | {image_status_text(image_b, 'Task B image B')}", progress_text(index, len(TASK_B_ITEMS)), str(item["question_id"]), str(item["question"]), str(item["answer"]), f"A: {item['viz_a']} (EM {item['em_a']}) | B: {item['viz_b']} (EM {item['em_b']})", "", ) def start_session(evaluator_name, current_session): cleaned = evaluator_name.strip() if not cleaned: cleaned = "anonymous" if current_session: return current_session, f"Session ready: {cleaned} ({current_session})" session_id = secrets.token_hex(8) return session_id, f"Session ready: {cleaned} ({session_id})" def submit_task_a(index, evaluator_name, session_id, rating, notes): if not TASK_A_ITEMS: return index, "No Task A items available.", None, "", "", "", "", "", "", "", "" if not evaluator_name.strip() or not session_id.strip(): raise gr.Error("Enter evaluator name and click Start Session first.") if rating not in TASK_A_LABELS: raise gr.Error("Select a Task A rating before submitting.") item = TASK_A_ITEMS[index % len(TASK_A_ITEMS)] record = { "timestamp": datetime.now(timezone.utc).isoformat(), "session_id": session_id, "evaluator": evaluator_name.strip(), "task": "task_a", "item_index": index, "question_id": item["question_id"], "question": item["question"], "answer": item["answer"], "modality": item["modality"], "difficulty": item["difficulty"], "source": item["source"], "viz_type": item["viz_type"], "rating": rating, "notes": notes.strip(), } save_record(record, TASK_A_RESPONSES, TASK_A_CSV) next_index = (index + 1) % len(TASK_A_ITEMS) image_path, image_status, progress, qid, question, answer, meta, viz_type, _ = ( task_a_payload(next_index) ) return ( next_index, f"Saved Task A response for {record['question_id']}.", image_path, image_status, progress, qid, question, answer, meta, viz_type, "", ) def submit_task_b(index, evaluator_name, session_id, rating, notes): if not TASK_B_ITEMS: return ( index, "No Task B items available.", None, None, "", "", "", "", "", "", "", ) if not evaluator_name.strip() or not session_id.strip(): raise gr.Error("Enter evaluator name and click Start Session first.") if rating not in TASK_B_LABELS: raise gr.Error("Select a Task B rating before submitting.") item = TASK_B_ITEMS[index % len(TASK_B_ITEMS)] record = { "timestamp": datetime.now(timezone.utc).isoformat(), "session_id": session_id, "evaluator": evaluator_name.strip(), "task": "task_b", "item_index": index, "question_id": item["question_id"], "question": item["question"], "answer": item["answer"], "viz_a": item["viz_a"], "viz_b": item["viz_b"], "em_a": item["em_a"], "em_b": item["em_b"], "rating": rating, "notes": notes.strip(), } save_record(record, TASK_B_RESPONSES, TASK_B_CSV) next_index = (index + 1) % len(TASK_B_ITEMS) image_a, image_b, image_status, progress, qid, question, answer, meta, _ = ( task_b_payload(next_index) ) return ( next_index, f"Saved Task B response for {record['question_id']}.", image_a, image_b, image_status, progress, qid, question, answer, meta, "", ) with gr.Blocks(title="StructViz-Bench Human Evaluation") as demo: gr.Markdown( "# StructViz-Bench Human Evaluation\n" "Use Task A to verify answer correctness and Task B to judge whether " "visualization-sensitive failures look plausible. Responses are saved to " "local JSONL and CSV files in `responses/`." ) with gr.Row(): evaluator_name = gr.Textbox( label="Evaluator Name", placeholder="e.g. evaluator_1" ) session_id = gr.Textbox(label="Session ID", interactive=False) start_button = gr.Button("Start Session", variant="primary") session_status = gr.Markdown("Enter your name and start a session.") with gr.Tab("Task A - Answer Correctness"): task_a_index = gr.State(0) task_a_image = gr.Image(label="Visualization", type="filepath", height=420) task_a_image_status = gr.Markdown() task_a_progress = gr.Textbox(label="Progress", interactive=False) task_a_qid = gr.Textbox(label="Question ID", interactive=False) task_a_question = gr.Textbox(label="Question", interactive=False, lines=2) task_a_answer = gr.Textbox(label="Ground-Truth Answer", interactive=False) task_a_meta = gr.Textbox(label="Metadata", interactive=False) task_a_viz = gr.Textbox(label="Visualization Type", interactive=False) task_a_rating = gr.Radio(TASK_A_LABELS, label="Judgment") task_a_notes = gr.Textbox(label="Notes", lines=3) task_a_submit = gr.Button("Submit Task A", variant="primary") task_a_status = gr.Markdown() with gr.Tab("Task B - Sensitivity Plausibility"): task_b_index = gr.State(0) with gr.Row(): task_b_image_a = gr.Image( label="Visualization A", type="filepath", height=360 ) task_b_image_b = gr.Image( label="Visualization B", type="filepath", height=360 ) task_b_image_status = gr.Markdown() task_b_progress = gr.Textbox(label="Progress", interactive=False) task_b_qid = gr.Textbox(label="Question ID", interactive=False) task_b_question = gr.Textbox(label="Question", interactive=False, lines=2) task_b_answer = gr.Textbox(label="Ground-Truth Answer", interactive=False) task_b_meta = gr.Textbox(label="Pair Metadata", interactive=False) task_b_rating = gr.Radio(TASK_B_LABELS, label="Judgment") task_b_notes = gr.Textbox(label="Notes", lines=3) task_b_submit = gr.Button("Submit Task B", variant="primary") task_b_status = gr.Markdown() start_button.click( start_session, inputs=[evaluator_name, session_id], outputs=[session_id, session_status], api_name="start_session", ) demo.load( task_a_payload, inputs=task_a_index, outputs=[ task_a_image, task_a_image_status, task_a_progress, task_a_qid, task_a_question, task_a_answer, task_a_meta, task_a_viz, task_a_notes, ], api_name="load_task_a", ) demo.load( task_b_payload, inputs=task_b_index, outputs=[ task_b_image_a, task_b_image_b, task_b_image_status, task_b_progress, task_b_qid, task_b_question, task_b_answer, task_b_meta, task_b_notes, ], api_name="load_task_b", ) task_a_submit.click( submit_task_a, inputs=[task_a_index, evaluator_name, session_id, task_a_rating, task_a_notes], outputs=[ task_a_index, task_a_status, task_a_image, task_a_image_status, task_a_progress, task_a_qid, task_a_question, task_a_answer, task_a_meta, task_a_viz, task_a_notes, ], api_name="submit_task_a", ) task_b_submit.click( submit_task_b, inputs=[task_b_index, evaluator_name, session_id, task_b_rating, task_b_notes], outputs=[ task_b_index, task_b_status, task_b_image_a, task_b_image_b, task_b_image_status, task_b_progress, task_b_qid, task_b_question, task_b_answer, task_b_meta, task_b_notes, ], api_name="submit_task_b", ) demo.get_api_info = lambda: { "named_endpoints": {}, "unnamed_endpoints": {}, } if __name__ == "__main__": demo.launch(server_name="0.0.0.0", share=True, show_api=False)