Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import csv | |
| import importlib | |
| import json | |
| import os | |
| import secrets | |
| from datetime import datetime, timezone | |
| from pathlib import Path | |
| gr = importlib.import_module("gradio") | |
# Filesystem layout: every path is derived from the directory holding this file.
SPACE_ROOT = Path(__file__).resolve().parent
DATA_DIR = SPACE_ROOT.joinpath("data")
RESPONSES_DIR = SPACE_ROOT.joinpath("responses")

# Input item files for the two tasks.
TASK_A_PATH = DATA_DIR.joinpath("task_a_items.jsonl")
TASK_B_PATH = DATA_DIR.joinpath("task_b_pairs.jsonl")

# Output logs: each task has one JSONL file and one CSV file.
TASK_A_RESPONSES = RESPONSES_DIR.joinpath("task_a_responses.jsonl")
TASK_B_RESPONSES = RESPONSES_DIR.joinpath("task_b_responses.jsonl")
TASK_A_CSV = RESPONSES_DIR.joinpath("task_a_responses.csv")
TASK_B_CSV = RESPONSES_DIR.joinpath("task_b_responses.csv")

# Rating choices offered for each task; submissions are validated against these.
TASK_A_LABELS = ["Correct", "Ambiguous", "Incorrect"]
TASK_B_LABELS = ["Plausible", "Implausible", "Unclear"]
def load_jsonl(path: Path) -> list[dict[str, object]]:
    """Load a JSON Lines file into a list of parsed records.

    Args:
        path: Location of the ``.jsonl`` file to read.

    Returns:
        One parsed value per non-blank line, in file order. An empty list is
        returned when ``path`` does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    try:
        # Pin the encoding: without it the platform locale decides how the
        # bytes are decoded, which corrupts non-ASCII text on non-UTF-8 hosts.
        with path.open(encoding="utf-8") as handle:
            return [json.loads(line) for line in handle if line.strip()]
    except FileNotFoundError:
        # Opening directly (instead of checking exists() first) avoids a race
        # between the check and the open.
        return []
# Both item sets are read once, when the module is imported.
TASK_A_ITEMS, TASK_B_ITEMS = map(load_jsonl, (TASK_A_PATH, TASK_B_PATH))
def ensure_dirs() -> None:
    """Create the responses directory, and any missing parents, if absent."""
    RESPONSES_DIR.mkdir(exist_ok=True, parents=True)
def infer_modality(question_id: str) -> str:
    """Return the portion of ``question_id`` that precedes its first underscore.

    When the ID contains no underscore the whole ID is returned.
    """
    prefix, _separator, _rest = question_id.partition("_")
    return prefix
def candidate_image_roots() -> list[Path]:
    """List the directories to search for rendered images, in priority order.

    The directory named by the ``STRUCTVIZ_IMAGE_ROOT`` environment variable
    comes first when that variable is set to a non-empty value; the three
    locations inside the Space follow.
    """
    bundled_roots = [
        DATA_DIR / "images",
        SPACE_ROOT / "benchmark" / "rendered" / "benchmark" / "rendered",
        SPACE_ROOT / "benchmark" / "rendered",
    ]
    override = os.environ.get("STRUCTVIZ_IMAGE_ROOT")
    if not override:
        return bundled_roots
    return [Path(override), *bundled_roots]
def resolve_image_path(question_id: str, viz_type: str) -> str | None:
    """Find the rendered image for a question/visualization pair.

    Each candidate root is probed for
    ``<modality>/<question_id>_<viz_type>.png``; the first existing path is
    returned as a string, or ``None`` when no root contains the file.
    """
    modality = infer_modality(question_id)
    filename = f"{question_id}_{viz_type}.png"
    existing = (
        path
        for path in (root / modality / filename for root in candidate_image_roots())
        if path.exists()
    )
    first_hit = next(existing, None)
    return None if first_hit is None else str(first_hit)
def resolve_task_a_image(item: dict[str, object]) -> str | None:
    """Resolve the image shown for a Task A item.

    A non-empty string under ``"image_path"`` is tried first, relative to the
    Space root. If it is absent or does not exist, the image is looked up by
    the item's ``"question_id"`` and ``"viz_type"``.
    """
    declared = item.get("image_path")
    has_declared = isinstance(declared, str) and bool(declared)
    if has_declared:
        bundled = SPACE_ROOT / declared
        if bundled.exists():
            return str(bundled)
    question_id = str(item["question_id"])
    viz_type = str(item["viz_type"])
    return resolve_image_path(question_id, viz_type)
def resolve_task_b_images(item: dict[str, object]) -> tuple[str | None, str | None]:
    """Resolve the two images (A and B) shown for a Task B pair.

    For each side, a non-empty string under ``"image_a_path"`` /
    ``"image_b_path"`` is tried first, relative to the Space root. A side whose
    preset is absent or missing on disk is looked up by ``"question_id"`` and
    its ``"viz_a"`` / ``"viz_b"`` value.

    Returns:
        ``(image_a, image_b)``; either entry is ``None`` when unresolved.
    """
    resolved: list[str | None] = []
    for preset_key, viz_key in (("image_a_path", "viz_a"), ("image_b_path", "viz_b")):
        found: str | None = None
        preset = item.get(preset_key)
        if isinstance(preset, str) and preset:
            bundled = SPACE_ROOT / preset
            if bundled.exists():
                found = str(bundled)
        if found is None:
            found = resolve_image_path(str(item["question_id"]), str(item[viz_key]))
        resolved.append(found)
    image_a, image_b = resolved
    return image_a, image_b
def save_record(record: dict[str, object], jsonl_path: Path, csv_path: Path) -> None:
    """Append one response record to both the JSONL log and the CSV log.

    Args:
        record: Flat mapping of field name to value. Its key order defines the
            CSV column order.
        jsonl_path: JSON Lines file to append to (created if missing).
        csv_path: CSV file to append to (created if missing). A header row is
            written whenever the file is missing or empty.

    Raises:
        OSError: If a directory or file cannot be created or written.
    """
    # Create the directories the caller actually asked for, rather than one
    # hard-coded location, so the function works for any destination.
    for target in (jsonl_path, csv_path):
        target.parent.mkdir(parents=True, exist_ok=True)

    # Explicit UTF-8 keeps both logs independent of the host locale.
    with jsonl_path.open("a", encoding="utf-8") as handle:
        handle.write(json.dumps(record, ensure_ascii=True) + "\n")

    # Decide on the header by size, not mere existence: an empty file left
    # behind by an earlier failure must still receive its header row.
    try:
        needs_header = csv_path.stat().st_size == 0
    except FileNotFoundError:
        needs_header = True

    # newline="" lets the csv module control line endings itself.
    with csv_path.open("a", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=list(record))
        if needs_header:
            writer.writeheader()
        writer.writerow(record)
def progress_text(index: int, total: int) -> str:
    """Format a 1-based "Item i / n" progress label.

    The displayed position is capped at ``total``. When ``total`` is zero a
    fixed "No items loaded" message is returned instead.
    """
    if total != 0:
        position = min(total, index + 1)
        return f"Item {position} / {total}"
    return "No items loaded"
def image_status_text(image_path: str | None, label: str) -> str:
    """Describe whether an image was found, prefixed with ``label``.

    Any falsy ``image_path`` (``None`` or an empty string) is reported as
    missing.
    """
    if not image_path:
        detail = (
            "image missing in this Space bundle; "
            "use question/answer metadata only or upload assets later"
        )
    else:
        detail = "image loaded"
    return f"{label}: {detail}"
def task_a_payload(index):
    """Build the nine display values for the Task A item at ``index``.

    ``index`` wraps around the item list. The returned tuple is, in order:
    image path (or ``None``), image status, progress label, question ID,
    question, answer, metadata summary, visualization type, and an empty
    string. With no items loaded, a placeholder tuple of the same length is
    returned.
    """
    total = len(TASK_A_ITEMS)
    if total == 0:
        return None, "No Task A items found.", "", "", "", "", "", "", ""

    item = TASK_A_ITEMS[index % total]
    image_path = resolve_task_a_image(item)
    status = image_status_text(image_path, "Task A")
    progress = progress_text(index, total)
    question_id = str(item["question_id"])
    question = str(item["question"])
    answer = str(item["answer"])
    metadata = f"{item['modality']} / {item['difficulty']} / {item['source']}"
    viz_type = str(item["viz_type"])
    return (
        image_path,
        status,
        progress,
        question_id,
        question,
        answer,
        metadata,
        viz_type,
        "",
    )
def task_b_payload(index):
    """Build the nine display values for the Task B pair at ``index``.

    ``index`` wraps around the pair list. The returned tuple is, in order:
    image A path, image B path (either may be ``None``), combined image
    status, progress label, question ID, question, answer, pair metadata, and
    an empty string. With no pairs loaded, a placeholder tuple of the same
    length is returned.
    """
    total = len(TASK_B_ITEMS)
    if total == 0:
        return None, None, "No Task B items found.", "", "", "", "", "", ""

    item = TASK_B_ITEMS[index % total]
    image_a, image_b = resolve_task_b_images(item)
    status = " | ".join(
        [
            image_status_text(image_a, "Task B image A"),
            image_status_text(image_b, "Task B image B"),
        ]
    )
    progress = progress_text(index, total)
    question_id = str(item["question_id"])
    question = str(item["question"])
    answer = str(item["answer"])
    pair_meta = (
        f"A: {item['viz_a']} (EM {item['em_a']}) | B: {item['viz_b']} (EM {item['em_b']})"
    )
    return (
        image_a,
        image_b,
        status,
        progress,
        question_id,
        question,
        answer,
        pair_meta,
        "",
    )
def start_session(evaluator_name, current_session):
    """Return the session ID to use and a status message for the evaluator.

    Args:
        evaluator_name: Text entered as the evaluator's name; may be ``None``
            or blank, in which case the message shows "anonymous".
        current_session: Existing session ID, or a falsy value when no session
            has been started yet.

    Returns:
        ``(session_id, status_message)``. An existing session ID is reused
        unchanged; otherwise a new 16-hex-character ID is generated.

    NOTE(review): a blank name is accepted here as "anonymous", but the submit
    handlers in this file reject a blank evaluator name. Confirm which
    behaviour is intended.
    """
    # Tolerate None (e.g. a null sent by an API client) instead of raising
    # AttributeError on .strip().
    display_name = (evaluator_name or "").strip() or "anonymous"
    session_id = current_session or secrets.token_hex(8)
    return session_id, f"Session ready: {display_name} ({session_id})"
def submit_task_a(index, evaluator_name, session_id, rating, notes):
    """Validate and save a Task A judgment, then advance to the next item.

    Args:
        index: Position of the item being judged (wraps around the item list).
        evaluator_name: Evaluator's name; must be non-blank.
        session_id: Session identifier; must be non-blank.
        rating: Selected judgment; must be one of ``TASK_A_LABELS``.
        notes: Free-text notes; ``None`` is treated as empty.

    Returns:
        An 11-tuple: the next index, a status message, then the eight display
        values for the next item (image path, image status, progress, question
        ID, question, answer, metadata, visualization type), and an empty
        string that clears the notes box. With no items loaded, a placeholder
        tuple of the same length is returned and nothing is saved.

    Raises:
        gr.Error: If the evaluator name or session ID is blank, or the rating
            is not a valid Task A label.
    """
    if not TASK_A_ITEMS:
        return index, "No Task A items available.", None, "", "", "", "", "", "", "", ""

    # Treat None like an empty string so a missing value produces the intended
    # gr.Error instead of an AttributeError from .strip().
    evaluator = (evaluator_name or "").strip()
    if not evaluator or not (session_id or "").strip():
        raise gr.Error("Enter evaluator name and click Start Session first.")
    if rating not in TASK_A_LABELS:
        raise gr.Error("Select a Task A rating before submitting.")

    total = len(TASK_A_ITEMS)
    item = TASK_A_ITEMS[index % total]
    record = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "session_id": session_id,
        "evaluator": evaluator,
        "task": "task_a",
        "item_index": index,
        "question_id": item["question_id"],
        "question": item["question"],
        "answer": item["answer"],
        "modality": item["modality"],
        "difficulty": item["difficulty"],
        "source": item["source"],
        "viz_type": item["viz_type"],
        "rating": rating,
        "notes": (notes or "").strip(),
    }
    save_record(record, TASK_A_RESPONSES, TASK_A_CSV)

    next_index = (index + 1) % total
    # The payload's last element (its notes placeholder) is replaced by an
    # explicit empty string so the notes box is cleared after each submission.
    *next_view, _ = task_a_payload(next_index)
    return (
        next_index,
        f"Saved Task A response for {record['question_id']}.",
        *next_view,
        "",
    )
def submit_task_b(index, evaluator_name, session_id, rating, notes):
    """Validate and save a Task B judgment, then advance to the next pair.

    Args:
        index: Position of the pair being judged (wraps around the pair list).
        evaluator_name: Evaluator's name; must be non-blank.
        session_id: Session identifier; must be non-blank.
        rating: Selected judgment; must be one of ``TASK_B_LABELS``.
        notes: Free-text notes; ``None`` is treated as empty.

    Returns:
        An 11-tuple: the next index, a status message, then the eight display
        values for the next pair (image A path, image B path, image status,
        progress, question ID, question, answer, pair metadata), and an empty
        string that clears the notes box. With no pairs loaded, a placeholder
        tuple of the same length is returned and nothing is saved.

    Raises:
        gr.Error: If the evaluator name or session ID is blank, or the rating
            is not a valid Task B label.
    """
    if not TASK_B_ITEMS:
        return (
            index,
            "No Task B items available.",
            None,
            None,
            "",
            "",
            "",
            "",
            "",
            "",
            "",
        )

    # Treat None like an empty string so a missing value produces the intended
    # gr.Error instead of an AttributeError from .strip().
    evaluator = (evaluator_name or "").strip()
    if not evaluator or not (session_id or "").strip():
        raise gr.Error("Enter evaluator name and click Start Session first.")
    if rating not in TASK_B_LABELS:
        raise gr.Error("Select a Task B rating before submitting.")

    total = len(TASK_B_ITEMS)
    item = TASK_B_ITEMS[index % total]
    record = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "session_id": session_id,
        "evaluator": evaluator,
        "task": "task_b",
        "item_index": index,
        "question_id": item["question_id"],
        "question": item["question"],
        "answer": item["answer"],
        "viz_a": item["viz_a"],
        "viz_b": item["viz_b"],
        "em_a": item["em_a"],
        "em_b": item["em_b"],
        "rating": rating,
        "notes": (notes or "").strip(),
    }
    save_record(record, TASK_B_RESPONSES, TASK_B_CSV)

    next_index = (index + 1) % total
    # The payload's last element (its notes placeholder) is replaced by an
    # explicit empty string so the notes box is cleared after each submission.
    *next_view, _ = task_b_payload(next_index)
    return (
        next_index,
        f"Saved Task B response for {record['question_id']}.",
        *next_view,
        "",
    )
# NOTE(review): indentation was lost in the copy this block was restored from;
# the Row/Tab nesting below is a reconstruction — confirm against the deployed
# layout. Components are created in their original order.
with gr.Blocks(title="StructViz-Bench Human Evaluation") as demo:
    gr.Markdown(
        "# StructViz-Bench Human Evaluation\n"
        "Use Task A to verify answer correctness and Task B to judge whether "
        "visualization-sensitive failures look plausible. Responses are saved to "
        "local JSONL and CSV files in `responses/`."
    )

    # --- Session controls -------------------------------------------------
    with gr.Row():
        evaluator_name = gr.Textbox(
            label="Evaluator Name", placeholder="e.g. evaluator_1"
        )
        session_id = gr.Textbox(label="Session ID", interactive=False)
        start_button = gr.Button("Start Session", variant="primary")
    session_status = gr.Markdown("Enter your name and start a session.")

    # --- Task A: one visualization, one correctness judgment ---------------
    with gr.Tab("Task A - Answer Correctness"):
        task_a_index = gr.State(0)
        task_a_image = gr.Image(label="Visualization", type="filepath", height=420)
        task_a_image_status = gr.Markdown()
        task_a_progress = gr.Textbox(label="Progress", interactive=False)
        task_a_qid = gr.Textbox(label="Question ID", interactive=False)
        task_a_question = gr.Textbox(label="Question", interactive=False, lines=2)
        task_a_answer = gr.Textbox(label="Ground-Truth Answer", interactive=False)
        task_a_meta = gr.Textbox(label="Metadata", interactive=False)
        task_a_viz = gr.Textbox(label="Visualization Type", interactive=False)
        task_a_rating = gr.Radio(TASK_A_LABELS, label="Judgment")
        task_a_notes = gr.Textbox(label="Notes", lines=3)
        task_a_submit = gr.Button("Submit Task A", variant="primary")
        task_a_status = gr.Markdown()

    # --- Task B: two visualizations side by side, one plausibility judgment -
    with gr.Tab("Task B - Sensitivity Plausibility"):
        task_b_index = gr.State(0)
        with gr.Row():
            task_b_image_a = gr.Image(
                label="Visualization A", type="filepath", height=360
            )
            task_b_image_b = gr.Image(
                label="Visualization B", type="filepath", height=360
            )
        task_b_image_status = gr.Markdown()
        task_b_progress = gr.Textbox(label="Progress", interactive=False)
        task_b_qid = gr.Textbox(label="Question ID", interactive=False)
        task_b_question = gr.Textbox(label="Question", interactive=False, lines=2)
        task_b_answer = gr.Textbox(label="Ground-Truth Answer", interactive=False)
        task_b_meta = gr.Textbox(label="Pair Metadata", interactive=False)
        task_b_rating = gr.Radio(TASK_B_LABELS, label="Judgment")
        task_b_notes = gr.Textbox(label="Notes", lines=3)
        task_b_submit = gr.Button("Submit Task B", variant="primary")
        task_b_status = gr.Markdown()

    # Components refreshed whenever an item is (re)loaded; the order matches
    # the tuples returned by task_a_payload / task_b_payload.
    task_a_view = [
        task_a_image,
        task_a_image_status,
        task_a_progress,
        task_a_qid,
        task_a_question,
        task_a_answer,
        task_a_meta,
        task_a_viz,
        task_a_notes,
    ]
    task_b_view = [
        task_b_image_a,
        task_b_image_b,
        task_b_image_status,
        task_b_progress,
        task_b_qid,
        task_b_question,
        task_b_answer,
        task_b_meta,
        task_b_notes,
    ]

    # --- Event wiring (registration order kept as in the original) ---------
    start_button.click(
        start_session,
        inputs=[evaluator_name, session_id],
        outputs=[session_id, session_status],
        api_name="start_session",
    )

    # Populate both tabs with their first item when the page loads.
    demo.load(
        task_a_payload,
        inputs=task_a_index,
        outputs=task_a_view,
        api_name="load_task_a",
    )
    demo.load(
        task_b_payload,
        inputs=task_b_index,
        outputs=task_b_view,
        api_name="load_task_b",
    )

    # NOTE(review): the rating radios are not in any outputs list, so the
    # previous selection stays in place after a submission.
    task_a_submit.click(
        submit_task_a,
        inputs=[task_a_index, evaluator_name, session_id, task_a_rating, task_a_notes],
        outputs=[task_a_index, task_a_status, *task_a_view],
        api_name="submit_task_a",
    )
    task_b_submit.click(
        submit_task_b,
        inputs=[task_b_index, evaluator_name, session_id, task_b_rating, task_b_notes],
        outputs=[task_b_index, task_b_status, *task_b_view],
        api_name="submit_task_b",
    )
def _empty_api_info():
    """Return an API description that lists no endpoints."""
    return {
        "named_endpoints": {},
        "unnamed_endpoints": {},
    }


# Override the app's API-info hook so it always reports an empty endpoint set.
# NOTE(review): presumably a workaround for API schema generation — confirm
# before removing.
demo.get_api_info = _empty_api_info
if __name__ == "__main__":
    # Launch only when run as a script: bind to all interfaces, request a
    # share link, and hide the API docs.
    launch_options = {
        "server_name": "0.0.0.0",
        "share": True,
        "show_api": False,
    }
    demo.launch(**launch_options)