suanlab's picture
return structured empty api info
4d93cac
from __future__ import annotations
import csv
import importlib
import json
import os
import secrets
from datetime import datetime, timezone
from pathlib import Path
# Gradio is loaded by module name at runtime and bound to the usual `gr` alias.
# NOTE(review): presumably done instead of a plain `import gradio as gr` to keep
# static import checks quiet — confirm before changing.
gr = importlib.import_module("gradio")

# Directory layout, anchored at the folder that contains this file.
SPACE_ROOT = Path(__file__).resolve().parent
DATA_DIR = SPACE_ROOT / "data"
RESPONSES_DIR = SPACE_ROOT / "responses"

# Input item files; load_jsonl parses them one JSON document per line.
TASK_A_PATH = DATA_DIR / "task_a_items.jsonl"
TASK_B_PATH = DATA_DIR / "task_b_pairs.jsonl"

# Output files: every submitted response is appended to a JSONL file and to a
# CSV file for the corresponding task.
TASK_A_RESPONSES = RESPONSES_DIR / "task_a_responses.jsonl"
TASK_B_RESPONSES = RESPONSES_DIR / "task_b_responses.jsonl"
TASK_A_CSV = RESPONSES_DIR / "task_a_responses.csv"
TASK_B_CSV = RESPONSES_DIR / "task_b_responses.csv"

# Choices offered by the rating radios; the submit handlers reject any other value.
TASK_A_LABELS = ["Correct", "Ambiguous", "Incorrect"]
TASK_B_LABELS = ["Plausible", "Implausible", "Unclear"]
def load_jsonl(path: Path) -> list[dict[str, object]]:
    """Read a JSON Lines file into a list of decoded records.

    Args:
        path: Location of the JSONL file.

    Returns:
        One decoded JSON value per non-blank line, in file order. An empty
        list is returned when ``path`` does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not path.exists():
        return []
    # Pin the encoding: without it the host locale decides how the bytes are
    # decoded, so non-ASCII questions/answers could be garbled or fail to load.
    with path.open(encoding="utf-8") as handle:
        return [json.loads(line) for line in handle if line.strip()]
# Both item sets are loaded once, when the module is imported. A missing data
# file yields an empty list, which the payload and submit functions report as
# "no items" instead of failing.
TASK_A_ITEMS = load_jsonl(TASK_A_PATH)
TASK_B_ITEMS = load_jsonl(TASK_B_PATH)
def ensure_dirs() -> None:
    """Create the responses directory, including missing parents, if absent."""
    RESPONSES_DIR.mkdir(exist_ok=True, parents=True)
def infer_modality(question_id: str) -> str:
    """Return the part of ``question_id`` that precedes its first underscore.

    When the ID contains no underscore, the whole ID is returned.
    """
    prefix, _separator, _remainder = question_id.partition("_")
    return prefix
def candidate_image_roots() -> list[Path]:
    """List the directories searched for rendered images, in priority order.

    A non-empty ``STRUCTVIZ_IMAGE_ROOT`` environment variable is searched
    first; the locations inside the Space follow.
    """
    bundled = [
        DATA_DIR / "images",
        SPACE_ROOT / "benchmark" / "rendered" / "benchmark" / "rendered",
        SPACE_ROOT / "benchmark" / "rendered",
    ]
    override = os.environ.get("STRUCTVIZ_IMAGE_ROOT")
    if not override:
        return bundled
    return [Path(override), *bundled]
def resolve_image_path(question_id: str, viz_type: str) -> str | None:
    """Locate the rendered PNG for a question/visualization combination.

    The file ``<question_id>_<viz_type>.png`` is looked up inside the
    modality sub-folder of each candidate root, in order.

    Returns:
        The first existing path as a string, or ``None`` when no root has it.
    """
    subdir = infer_modality(question_id)
    image_name = f"{question_id}_{viz_type}.png"
    existing = (
        path
        for path in (root / subdir / image_name for root in candidate_image_roots())
        if path.exists()
    )
    first_hit = next(existing, None)
    return str(first_hit) if first_hit is not None else None
def resolve_task_a_image(item: dict[str, object]) -> str | None:
    """Find the image for a Task A item.

    A non-empty string under ``image_path`` is tried first, relative to the
    Space root. If it is absent or the file does not exist, the image is
    searched by ``question_id`` and ``viz_type`` instead.

    Returns:
        The image path as a string, or ``None`` when nothing was found.
    """
    explicit = item.get("image_path")
    if isinstance(explicit, str) and explicit != "":
        bundled = SPACE_ROOT / explicit
        if bundled.exists():
            return str(bundled)
    # No usable preset: fall back to the conventional file layout.
    question_id = str(item["question_id"])
    viz_type = str(item["viz_type"])
    return resolve_image_path(question_id, viz_type)
def resolve_task_b_images(item: dict[str, object]) -> tuple[str | None, str | None]:
    """Find both images of a Task B pair.

    For each side, a non-empty string under ``image_a_path`` /
    ``image_b_path`` is tried first, relative to the Space root. A side whose
    preset is missing or does not exist is searched by ``question_id`` and
    ``viz_a`` / ``viz_b`` instead.

    Returns:
        ``(image_a, image_b)``; each entry is a path string or ``None``.
    """

    def bundled(key: str) -> str | None:
        """Return the preset path stored under ``key`` if that file exists."""
        preset = item.get(key)
        if not isinstance(preset, str) or not preset:
            return None
        located = SPACE_ROOT / preset
        return str(located) if located.exists() else None

    # Presets for both sides are checked before any fallback lookup runs.
    resolved = {"a": bundled("image_a_path"), "b": bundled("image_b_path")}
    for side in ("a", "b"):
        if resolved[side] is None:
            resolved[side] = resolve_image_path(
                str(item["question_id"]), str(item[f"viz_{side}"])
            )
    return resolved["a"], resolved["b"]
def save_record(record: dict[str, object], jsonl_path: Path, csv_path: Path) -> None:
    """Append one response record to a JSONL log and to a CSV log.

    Args:
        record: Flat mapping of column name to value. Its key order defines
            the CSV column order.
        jsonl_path: JSONL file that receives the record as one JSON line.
        csv_path: CSV file that receives the record as one row. A header row
            is written first when the file is missing or still empty.
    """
    # Create the folders of the files actually being written, so the function
    # works for any target paths and not only for the default responses folder.
    for target in (jsonl_path, csv_path):
        target.parent.mkdir(parents=True, exist_ok=True)

    # Explicit UTF-8 keeps the output independent of the host locale.
    with jsonl_path.open("a", encoding="utf-8") as handle:
        handle.write(json.dumps(record, ensure_ascii=True) + "\n")

    # An existing but empty CSV still needs its header row.
    needs_header = not csv_path.exists() or csv_path.stat().st_size == 0
    with csv_path.open("a", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=list(record))
        if needs_header:
            writer.writeheader()
        writer.writerow(record)
def progress_text(index: int, total: int) -> str:
    """Format a 1-based "Item i / n" progress label.

    Args:
        index: Zero-based position of the current item.
        total: Number of items available.

    Returns:
        ``"No items loaded"`` when ``total`` is zero; otherwise the label,
        with the displayed position capped at ``total``.
    """
    if total != 0:
        position = index + 1
        shown = total if total < position else position
        return f"Item {shown} / {total}"
    return "No items loaded"
def image_status_text(image_path: str | None, label: str) -> str:
    """Describe, for display, whether an image was found.

    Args:
        image_path: Resolved image path; ``None`` or an empty string counts
            as missing.
        label: Prefix that identifies which image the message refers to.
    """
    if not image_path:
        return f"{label}: image missing in this Space bundle; use question/answer metadata only or upload assets later"
    return f"{label}: image loaded"
def task_a_payload(index):
    """Build the nine display values for the Task A item at ``index``.

    The index wraps around the item list. The returned tuple holds, in order:
    image path, image status message, progress label, question ID, question,
    answer, "modality / difficulty / source" summary, visualization type, and
    an empty string (wired to the notes box by the callers in this file).
    When no items are loaded, a placeholder tuple of the same length is
    returned.
    """
    if not TASK_A_ITEMS:
        return (None, "No Task A items found.", *[""] * 7)

    total = len(TASK_A_ITEMS)
    entry = TASK_A_ITEMS[index % total]
    picture = resolve_task_a_image(entry)
    status = image_status_text(picture, "Task A")
    progress = progress_text(index, total)
    summary = f"{entry['modality']} / {entry['difficulty']} / {entry['source']}"
    return (
        picture,
        status,
        progress,
        str(entry["question_id"]),
        str(entry["question"]),
        str(entry["answer"]),
        summary,
        str(entry["viz_type"]),
        "",
    )
def task_b_payload(index):
    """Build the nine display values for the Task B pair at ``index``.

    The index wraps around the pair list. The returned tuple holds, in order:
    image A path, image B path, combined image status message, progress
    label, question ID, question, answer, a summary of both visualization
    types with their EM values, and an empty string (wired to the notes box
    by the callers in this file). When no pairs are loaded, a placeholder
    tuple of the same length is returned.
    """
    if not TASK_B_ITEMS:
        return (None, None, "No Task B items found.", *[""] * 6)

    total = len(TASK_B_ITEMS)
    pair = TASK_B_ITEMS[index % total]
    left, right = resolve_task_b_images(pair)
    status = f"{image_status_text(left, 'Task B image A')} | {image_status_text(right, 'Task B image B')}"
    progress = progress_text(index, total)
    summary = f"A: {pair['viz_a']} (EM {pair['em_a']}) | B: {pair['viz_b']} (EM {pair['em_b']})"
    return (
        left,
        right,
        status,
        progress,
        str(pair["question_id"]),
        str(pair["question"]),
        str(pair["answer"]),
        summary,
        "",
    )
def start_session(evaluator_name, current_session):
    """Return the session ID to use and a status line for the UI.

    Args:
        evaluator_name: Text from the evaluator-name box. ``None`` or a blank
            value is displayed as ``"anonymous"``.
        current_session: Session ID already shown in the UI, if any. A
            non-empty value is reused unchanged.

    Returns:
        ``(session_id, status_message)``. When no session exists yet, the ID
        is a fresh random token of 16 hexadecimal characters.
    """
    # A textbox value can be None rather than "" (for example a null sent to
    # the named API endpoint); treat that like a blank name instead of
    # crashing on ``None.strip()``.
    display_name = (evaluator_name or "").strip() or "anonymous"
    session_id = current_session or secrets.token_hex(8)
    return session_id, f"Session ready: {display_name} ({session_id})"
def submit_task_a(index, evaluator_name, session_id, rating, notes):
    """Validate and store a Task A judgment, then advance to the next item.

    Args:
        index: Position of the item being rated; wraps around the item list.
        evaluator_name: Text from the evaluator-name box.
        session_id: Session ID produced by ``start_session``.
        rating: Selected judgment; must be one of ``TASK_A_LABELS``.
        notes: Free-text notes; may be empty or ``None``.

    Returns:
        An 11-tuple: next index, status message, then the first eight values
        of ``task_a_payload`` for the next item, then an empty string that
        clears the notes box. When no items are loaded, a placeholder tuple
        of the same length is returned and nothing is saved.

    Raises:
        gr.Error: If the evaluator name or session ID is blank or missing,
            or if no valid rating was selected.
    """
    if not TASK_A_ITEMS:
        return (index, "No Task A items available.", None, *[""] * 8)

    # Textbox values can arrive as None rather than "" (for example a null
    # sent to the named API endpoint). Normalise first so the user sees the
    # intended gr.Error instead of an AttributeError from ``None.strip()``.
    evaluator = (evaluator_name or "").strip()
    if not evaluator or not (session_id or "").strip():
        raise gr.Error("Enter evaluator name and click Start Session first.")
    if rating not in TASK_A_LABELS:
        raise gr.Error("Select a Task A rating before submitting.")

    total = len(TASK_A_ITEMS)
    item = TASK_A_ITEMS[index % total]
    record = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "session_id": session_id,
        "evaluator": evaluator,
        "task": "task_a",
        "item_index": index,
        "question_id": item["question_id"],
        "question": item["question"],
        "answer": item["answer"],
        "modality": item["modality"],
        "difficulty": item["difficulty"],
        "source": item["source"],
        "viz_type": item["viz_type"],
        "rating": rating,
        "notes": (notes or "").strip(),
    }
    save_record(record, TASK_A_RESPONSES, TASK_A_CSV)

    next_index = (index + 1) % total
    # The payload's last element (the notes value) is replaced by "" below.
    *display_values, _ = task_a_payload(next_index)
    return (
        next_index,
        f"Saved Task A response for {record['question_id']}.",
        *display_values,
        "",
    )
def submit_task_b(index, evaluator_name, session_id, rating, notes):
    """Validate and store a Task B judgment, then advance to the next pair.

    Args:
        index: Position of the pair being rated; wraps around the pair list.
        evaluator_name: Text from the evaluator-name box.
        session_id: Session ID produced by ``start_session``.
        rating: Selected judgment; must be one of ``TASK_B_LABELS``.
        notes: Free-text notes; may be empty or ``None``.

    Returns:
        An 11-tuple: next index, status message, then the first eight values
        of ``task_b_payload`` for the next pair, then an empty string that
        clears the notes box. When no pairs are loaded, a placeholder tuple
        of the same length is returned and nothing is saved.

    Raises:
        gr.Error: If the evaluator name or session ID is blank or missing,
            or if no valid rating was selected.
    """
    if not TASK_B_ITEMS:
        return (index, "No Task B items available.", None, None, *[""] * 7)

    # Textbox values can arrive as None rather than "" (for example a null
    # sent to the named API endpoint). Normalise first so the user sees the
    # intended gr.Error instead of an AttributeError from ``None.strip()``.
    evaluator = (evaluator_name or "").strip()
    if not evaluator or not (session_id or "").strip():
        raise gr.Error("Enter evaluator name and click Start Session first.")
    if rating not in TASK_B_LABELS:
        raise gr.Error("Select a Task B rating before submitting.")

    total = len(TASK_B_ITEMS)
    item = TASK_B_ITEMS[index % total]
    record = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "session_id": session_id,
        "evaluator": evaluator,
        "task": "task_b",
        "item_index": index,
        "question_id": item["question_id"],
        "question": item["question"],
        "answer": item["answer"],
        "viz_a": item["viz_a"],
        "viz_b": item["viz_b"],
        "em_a": item["em_a"],
        "em_b": item["em_b"],
        "rating": rating,
        "notes": (notes or "").strip(),
    }
    save_record(record, TASK_B_RESPONSES, TASK_B_CSV)

    next_index = (index + 1) % total
    # The payload's last element (the notes value) is replaced by "" below.
    *display_values, _ = task_b_payload(next_index)
    return (
        next_index,
        f"Saved Task B response for {record['question_id']}.",
        *display_values,
        "",
    )
# Gradio UI: a shared session header, one tab per task, and the event wiring.
# NOTE(review): the original indentation was lost; the nesting below (what sits
# inside each Row/Tab) is reconstructed — confirm against the deployed layout.
with gr.Blocks(title="StructViz-Bench Human Evaluation") as demo:
    gr.Markdown(
        "# StructViz-Bench Human Evaluation\n"
        "Use Task A to verify answer correctness and Task B to judge whether "
        "visualization-sensitive failures look plausible. Responses are saved to "
        "local JSONL and CSV files in `responses/`."
    )

    # Session header: evaluator name, read-only session ID, and start button.
    with gr.Row():
        evaluator_name = gr.Textbox(
            label="Evaluator Name", placeholder="e.g. evaluator_1"
        )
        session_id = gr.Textbox(label="Session ID", interactive=False)
        start_button = gr.Button("Start Session", variant="primary")
    session_status = gr.Markdown("Enter your name and start a session.")

    with gr.Tab("Task A - Answer Correctness"):
        # Position of the item currently shown, starting at 0.
        task_a_index = gr.State(0)
        task_a_image = gr.Image(label="Visualization", type="filepath", height=420)
        task_a_image_status = gr.Markdown()
        task_a_progress = gr.Textbox(label="Progress", interactive=False)
        task_a_qid = gr.Textbox(label="Question ID", interactive=False)
        task_a_question = gr.Textbox(label="Question", interactive=False, lines=2)
        task_a_answer = gr.Textbox(label="Ground-Truth Answer", interactive=False)
        task_a_meta = gr.Textbox(label="Metadata", interactive=False)
        task_a_viz = gr.Textbox(label="Visualization Type", interactive=False)
        task_a_rating = gr.Radio(TASK_A_LABELS, label="Judgment")
        task_a_notes = gr.Textbox(label="Notes", lines=3)
        task_a_submit = gr.Button("Submit Task A", variant="primary")
        task_a_status = gr.Markdown()

    with gr.Tab("Task B - Sensitivity Plausibility"):
        # Position of the pair currently shown, starting at 0.
        task_b_index = gr.State(0)
        with gr.Row():
            task_b_image_a = gr.Image(
                label="Visualization A", type="filepath", height=360
            )
            task_b_image_b = gr.Image(
                label="Visualization B", type="filepath", height=360
            )
        task_b_image_status = gr.Markdown()
        task_b_progress = gr.Textbox(label="Progress", interactive=False)
        task_b_qid = gr.Textbox(label="Question ID", interactive=False)
        task_b_question = gr.Textbox(label="Question", interactive=False, lines=2)
        task_b_answer = gr.Textbox(label="Ground-Truth Answer", interactive=False)
        task_b_meta = gr.Textbox(label="Pair Metadata", interactive=False)
        task_b_rating = gr.Radio(TASK_B_LABELS, label="Judgment")
        task_b_notes = gr.Textbox(label="Notes", lines=3)
        task_b_submit = gr.Button("Submit Task B", variant="primary")
        task_b_status = gr.Markdown()

    # The current session ID is passed in so an existing session is reused.
    start_button.click(
        start_session,
        inputs=[evaluator_name, session_id],
        outputs=[session_id, session_status],
        api_name="start_session",
    )

    # Fill both tabs on load. The order of `outputs` must match the order of
    # the nine values returned by the corresponding payload function.
    demo.load(
        task_a_payload,
        inputs=task_a_index,
        outputs=[
            task_a_image,
            task_a_image_status,
            task_a_progress,
            task_a_qid,
            task_a_question,
            task_a_answer,
            task_a_meta,
            task_a_viz,
            task_a_notes,
        ],
        api_name="load_task_a",
    )
    demo.load(
        task_b_payload,
        inputs=task_b_index,
        outputs=[
            task_b_image_a,
            task_b_image_b,
            task_b_image_status,
            task_b_progress,
            task_b_qid,
            task_b_question,
            task_b_answer,
            task_b_meta,
            task_b_notes,
        ],
        api_name="load_task_b",
    )

    # Submit handlers return eleven values: the new index and a status message,
    # followed by the same nine display values as the load handlers above.
    task_a_submit.click(
        submit_task_a,
        inputs=[task_a_index, evaluator_name, session_id, task_a_rating, task_a_notes],
        outputs=[
            task_a_index,
            task_a_status,
            task_a_image,
            task_a_image_status,
            task_a_progress,
            task_a_qid,
            task_a_question,
            task_a_answer,
            task_a_meta,
            task_a_viz,
            task_a_notes,
        ],
        api_name="submit_task_a",
    )
    task_b_submit.click(
        submit_task_b,
        inputs=[task_b_index, evaluator_name, session_id, task_b_rating, task_b_notes],
        outputs=[
            task_b_index,
            task_b_status,
            task_b_image_a,
            task_b_image_b,
            task_b_image_status,
            task_b_progress,
            task_b_qid,
            task_b_question,
            task_b_answer,
            task_b_meta,
            task_b_notes,
        ],
        api_name="submit_task_b",
    )
def _empty_api_info(*_args, **_kwargs):
    """Return an API description that lists no endpoints.

    Positional and keyword arguments are accepted and ignored so that the
    replacement keeps working when the framework calls ``get_api_info`` with
    options such as ``all_endpoints=True``. A new dict is built on each call.
    """
    return {
        "named_endpoints": {},
        "unnamed_endpoints": {},
    }


# Override the app's API-info generation with the fixed, empty structure.
# NOTE(review): presumably a workaround for API schema generation failing in
# the installed Gradio version — confirm it is still needed after an upgrade.
demo.get_api_info = _empty_api_info

if __name__ == "__main__":
    # NOTE(review): server_name="0.0.0.0" together with share=True exposes the
    # app beyond localhost — confirm this is intended for the deployment target.
    demo.launch(server_name="0.0.0.0", share=True, show_api=False)