Spaces:

peiranli0930
/

VisArena

Sleeping

App Files Files Community

Peiran committed on Oct 23

Commit

6109248

1 Parent(s): 1688aaf

Login required + per-session batch (20), min raters filter (20), periodic reload, spaced repeats, and metrics logging (duration, flat scores); add hidden states and CSV fields; enforce HF-auth annotator id

Browse files

Files changed (2) hide show

__pycache__/app.cpython-311.pyc +0 -0
app.py +215 -9

__pycache__/app.cpython-311.pyc CHANGED Viewed

Binary files a/__pycache__/app.cpython-311.pyc and b/__pycache__/app.cpython-311.pyc differ

app.py CHANGED Viewed

@@ -4,7 +4,7 @@ import random
 import json
 import os
 import uuid
-from datetime import datetime
 from io import BytesIO
 from typing import Dict, List, Tuple, Optional
@@ -19,6 +19,13 @@ BASE_DIR = os.path.dirname(__file__)
 PERSIST_DIR = os.environ.get("PERSIST_DIR", "/data")
 # Persistent local storage inside HF Spaces
 PERSIST_DIR = os.environ.get("PERSIST_DIR", "/data")
 TASK_CONFIG = {
     "Scene Composition & Object Insertion": {
         "folder": "scene_composition_and_object_insertion",
@@ -171,6 +178,39 @@ def _read_user_done_keys(task_name: str, annotator_id: str) -> set:
     return keys
 def _schedule_round_robin_by_test_id(pairs: List[Dict[str, str]], seed: Optional[int] = None) -> List[Dict[str, str]]:
     """Interleave pairs across test_ids for balanced coverage; shuffle within each group.
     """
@@ -203,8 +243,13 @@ def load_task(task_name: str, annotator_id: str = ""):
     def key_of(p: Dict[str, str]):
         return (p["test_id"], frozenset({p["model1_name"], p["model2_name"]}), p["org_img"])
     user_done_keys = _read_user_done_keys(task_name, annotator_id)
     global_counts = _read_eval_counts(task_name)
-    pairs = [p for p in pairs_all if key_of(p) not in user_done_keys]
     # Balanced schedule: prioritize low-count pairs, and within same count do round-robin by test_id
     seed_env = os.environ.get("SCHEDULE_SEED")
@@ -219,6 +264,50 @@ def load_task(task_name: str, annotator_id: str = ""):
         ordered.extend(_schedule_round_robin_by_test_id(buckets[c], seed=seed))
     pairs = ordered
     # Assign A/B order to counteract position bias: alternate after scheduling
     for idx, p in enumerate(pairs):
         p["swap"] = bool(idx % 2)  # True -> A=B's image; False -> A=A's image
@@ -270,6 +359,13 @@ def _append_local_persist_csv(task_name: str, row: Dict[str, object]) -> bool:
     fieldnames = [
         "eval_date",
         "annotator_id",
         "test_id",
         "model1_name",
         "model2_name",
@@ -358,8 +454,26 @@ def _extract_annotator_id(request: Optional[gr.Request]) -> str:
     return ""
-def on_task_change(task_name: str, _state_pairs: List[Dict[str, str]], request: gr.Request):
     annotator_id = _extract_annotator_id(request)
     pairs = load_task(task_name, annotator_id)
     # Defaults for A and B (8 sliders total)
     default_scores = [3, 3, 3, 3, 3, 3, 3, 3]
@@ -372,7 +486,11 @@ def on_task_change(task_name: str, _state_pairs: List[Dict[str, str]], request:
             gr.update(value=None),
             gr.update(value=None),
             *default_scores,
-            gr.update(value="No pending pairs. Either all pairs are already evaluated or data paths are invalid."),
         )
     pair = pairs[0]
@@ -389,11 +507,15 @@ def on_task_change(task_name: str, _state_pairs: List[Dict[str, str]], request:
         _resolve_image_path(a_path),
         _resolve_image_path(b_path),
         *default_scores,
-        gr.update(value=f"Total {len(pairs)} pairs pending evaluation."),
     )
-def on_pair_navigate(index: int, pairs: List[Dict[str, str]]):
     if not pairs:
         # Gracefully no-op when no pairs
         return (
@@ -404,6 +526,7 @@ def on_pair_navigate(index: int, pairs: List[Dict[str, str]]):
             gr.update(value=None),
             3, 3, 3, 3,  # A
             3, 3, 3, 3,  # B
         )
     index = int(index)
     index = max(0, min(index, len(pairs) - 1))
@@ -419,6 +542,7 @@ def on_pair_navigate(index: int, pairs: List[Dict[str, str]]):
         _resolve_image_path(b_path),
         3, 3, 3, 3,  # A
         3, 3, 3, 3,  # B
     )
@@ -435,6 +559,10 @@ def on_submit(
     b_semantic_score: int,
     b_overall_score: int,
     request: gr.Request,
 ):
     if not task_name:
         return (
@@ -447,6 +575,10 @@ def on_submit(
             3, 3, 3, 3,
             3, 3, 3, 3,
             gr.update(value="Please select a task first."),
         )
     if not pairs:
@@ -460,6 +592,10 @@ def on_submit(
             3, 3, 3, 3,
             3, 3, 3, 3,
             gr.update(value="No pending pairs to submit."),
         )
     # Resolve annotator id from request
@@ -494,6 +630,20 @@ def on_submit(
     # Build record
     row = _build_eval_row(pair, score_map)
     row["annotator_id"] = annotator_id
     # Idempotency: check if this pair already evaluated; if so, skip writing
     done_keys = _read_user_done_keys(task_name, annotator_id)
@@ -518,6 +668,32 @@ def on_submit(
     info = f"{info_prefix} Local persistence " + ("succeeded" if ok_local else "skipped/failed") + "."
     info += " Dataset upload " + ("succeeded" if ok_hub else "failed") + (f" ({hub_msg})" if hub_msg else "") + "."
     if remaining_pairs:
         next_index = min(index, len(remaining_pairs) - 1)
         pair = remaining_pairs[next_index]
@@ -534,6 +710,10 @@ def on_submit(
             3, 3, 3, 3,
             3, 3, 3, 3,
             gr.update(value=info + f" Next pair ({next_index + 1}/{len(remaining_pairs)})."),
         )
     # No remaining pairs: clear UI, hide slider, and return updated empty state
@@ -547,6 +727,10 @@ def on_submit(
         3, 3, 3, 3,
         3, 3, 3, 3,
         gr.update(value=info + " All pairs completed."),
     )
@@ -576,6 +760,11 @@ with gr.Blocks(title="VisArena Human Evaluation") as demo:
         )
     pair_state = gr.State([])
     pair_header = gr.Markdown("")
@@ -604,7 +793,7 @@ with gr.Blocks(title="VisArena Human Evaluation") as demo:
         # Event bindings
         task_selector.change(
             fn=on_task_change,
-            inputs=[task_selector, pair_state],
             outputs=[
                 pair_state,
                 index_slider,
@@ -621,12 +810,16 @@ with gr.Blocks(title="VisArena Human Evaluation") as demo:
                 b_semantic_input,
                 b_overall_input,
                 feedback_box,
             ],
         )
         index_slider.release(
             fn=on_pair_navigate,
-            inputs=[index_slider, pair_state],
             outputs=[
                 index_slider,
                 pair_header,
@@ -641,6 +834,7 @@ with gr.Blocks(title="VisArena Human Evaluation") as demo:
                 b_optical_input,
                 b_semantic_input,
                 b_overall_input,
             ],
         )
@@ -658,6 +852,10 @@ with gr.Blocks(title="VisArena Human Evaluation") as demo:
                 b_optical_input,
                 b_semantic_input,
                 b_overall_input,
             ],
             outputs=[
                 pair_state,
@@ -675,13 +873,17 @@ with gr.Blocks(title="VisArena Human Evaluation") as demo:
                 b_semantic_input,
                 b_overall_input,
                 feedback_box,
             ],
         )
         # Auto-load default task on startup
         demo.load(
             fn=on_task_change,
-            inputs=[task_selector, pair_state],
             outputs=[
                 pair_state,
                 index_slider,
@@ -698,6 +900,10 @@ with gr.Blocks(title="VisArena Human Evaluation") as demo:
                 b_semantic_input,
                 b_overall_input,
                 feedback_box,
             ],
         )

 import json
 import os
 import uuid
+from datetime import datetime, timedelta
 from io import BytesIO
 from typing import Dict, List, Tuple, Optional
 PERSIST_DIR = os.environ.get("PERSIST_DIR", "/data")
 # Persistent local storage inside HF Spaces
 PERSIST_DIR = os.environ.get("PERSIST_DIR", "/data")
+# Evaluation knobs (can be overridden via env vars)
+MIN_RATERS_PER_PAIR = int(os.environ.get("MIN_RATERS_PER_PAIR", 20))  # load_task only schedules pairs whose global eval count is below this
+BATCH_SIZE = int(os.environ.get("BATCH_SIZE", 20))  # max pairs in the main batch; also the initial per-session quota
+RELOAD_EVERY = int(os.environ.get("RELOAD_EVERY", 5))  # on_submit re-runs load_task once this many submissions have accumulated
+REPEAT_RATE = float(os.environ.get("REPEAT_RATE", 0.05))  # fraction of BATCH_SIZE added as repeat (test-retest) pairs
+REPEAT_MIN_HOURS = float(os.environ.get("REPEAT_MIN_HOURS", 24))  # a pair is a repeat candidate only if the user's last rating is older than this many hours
+FAST_MIN_SEC = float(os.environ.get("FAST_MIN_SEC", 2.0))  # submissions with a shorter view-to-submit duration are logged with is_fast=True
 TASK_CONFIG = {
     "Scene Composition & Object Insertion": {
         "folder": "scene_composition_and_object_insertion",
     return keys
+def _read_user_last_times(task_name: str, annotator_id: str) -> Dict[Tuple[str, frozenset, str], datetime]:
+    """Return the annotator's most recent evaluation time for each pair.
+
+    Scans the task's persisted CSV and, for every row whose ``annotator_id``
+    matches, keeps the latest timestamp per pair key
+    ``(test_id, frozenset({model1_name, model2_name}), org_img)``.
+    The timestamp comes from ``eval_date``, falling back to ``submit_ts``
+    when ``eval_date`` is empty or missing.
+
+    Timezone-aware timestamps are converted to naive UTC so that every
+    returned value can be compared with the others and with
+    ``datetime.utcnow()``; mixing naive and aware values raises ``TypeError``.
+
+    Args:
+        task_name: Task whose persisted CSV is read.
+        annotator_id: Annotator to filter rows by (compared after stripping).
+
+    Returns:
+        Mapping from pair key to the latest naive datetime. Empty when
+        ``annotator_id`` is empty or the CSV does not exist. Reading is
+        best-effort: on an I/O, decoding or CSV error the entries collected
+        so far are returned.
+    """
+    # Local import: the module-level import only brings in datetime/timedelta.
+    from datetime import timezone
+
+    last: Dict[Tuple[str, frozenset, str], datetime] = {}
+    if not annotator_id:
+        return last
+    csv_path = _persist_csv_path_for_task(task_name)
+    if not os.path.exists(csv_path):
+        return last
+
+    wanted = str(annotator_id).strip()  # hoisted: invariant across rows
+
+    def _field(row, name: str) -> str:
+        # DictReader fills columns missing from a short row with None;
+        # treat that as empty instead of the literal string "None".
+        return str(row.get(name) or "").strip()
+
+    try:
+        with open(csv_path, newline="", encoding="utf-8") as f:
+            for row in csv.DictReader(f):
+                if _field(row, "annotator_id") != wanted:
+                    continue
+                tid = _field(row, "test_id")
+                m1 = _field(row, "model1_name")
+                m2 = _field(row, "model2_name")
+                org = _field(row, "org_img")
+                stamp = _field(row, "eval_date") or _field(row, "submit_ts")
+                if not (tid and m1 and m2 and org and stamp):
+                    continue
+                try:
+                    t = datetime.fromisoformat(stamp)
+                except ValueError:
+                    # Unparseable timestamp: skip this row only.
+                    continue
+                if t.utcoffset() is not None:
+                    # Normalize aware values to naive UTC so comparisons never
+                    # mix aware and naive datetimes.
+                    t = t.astimezone(timezone.utc).replace(tzinfo=None)
+                key = (tid, frozenset({m1, m2}), org)
+                if key not in last or t > last[key]:
+                    last[key] = t
+    except (OSError, csv.Error, UnicodeError):
+        # Deliberately best-effort: an unreadable or corrupt CSV must not
+        # block scheduling, so return whatever was collected.
+        pass
+    return last
 def _schedule_round_robin_by_test_id(pairs: List[Dict[str, str]], seed: Optional[int] = None) -> List[Dict[str, str]]:
     """Interleave pairs across test_ids for balanced coverage; shuffle within each group.
     """
     def key_of(p: Dict[str, str]):
         return (p["test_id"], frozenset({p["model1_name"], p["model2_name"]}), p["org_img"])
     user_done_keys = _read_user_done_keys(task_name, annotator_id)
+    user_last_times = _read_user_last_times(task_name, annotator_id)
     global_counts = _read_eval_counts(task_name)
+    # Main eligible set: not done by this user and below min raters threshold
+    pairs = [
+        p for p in pairs_all
+        if key_of(p) not in user_done_keys and global_counts.get(key_of(p), 0) < MIN_RATERS_PER_PAIR
+    ]
     # Balanced schedule: prioritize low-count pairs, and within same count do round-robin by test_id
     seed_env = os.environ.get("SCHEDULE_SEED")
         ordered.extend(_schedule_round_robin_by_test_id(buckets[c], seed=seed))
     pairs = ordered
+    # Deterministic rotation by user's progress to avoid always starting from the same pairs
+    try:
+        elig_keys = [key_of(p) for p in pairs]
+        progress = len([k for k in user_done_keys if k in elig_keys])
+        if pairs:
+            rot = progress % len(pairs)
+            pairs = pairs[rot:] + pairs[:rot]
+    except Exception:
+        pass
+    # Limit batch size
+    main_batch = pairs[: max(0, BATCH_SIZE)]
+    # Small proportion of spaced repeats for test-retest
+    repeats: List[Dict[str, str]] = []
+    try:
+        repeat_target = int(max(0, round(BATCH_SIZE * REPEAT_RATE)))
+        if repeat_target > 0 and user_last_times:
+            min_time = datetime.utcnow() - timedelta(hours=REPEAT_MIN_HOURS)
+            candidates = [k for k, t in user_last_times.items() if t < min_time]
+            def find_pair_from_key(k):
+                tid, names, org = k
+                for p in pairs_all:
+                    if p["test_id"] == tid and p["org_img"] == org and frozenset({p["model1_name"], p["model2_name"]}) == names:
+                        return p
+                return None
+            picked = 0
+            used_keys = {key_of(p) for p in main_batch}
+            for k in candidates:
+                if picked >= repeat_target:
+                    break
+                p = find_pair_from_key(k)
+                if not p:
+                    continue
+                if key_of(p) in used_keys:
+                    continue
+                repeats.append(p)
+                used_keys.add(key_of(p))
+                picked += 1
+    except Exception:
+        pass
+    pairs = main_batch + repeats
     # Assign A/B order to counteract position bias: alternate after scheduling
     for idx, p in enumerate(pairs):
         p["swap"] = bool(idx % 2)  # True -> A=B's image; False -> A=A's image
     fieldnames = [
         "eval_date",
         "annotator_id",
+        "session_id",
+        "view_start_ts",
+        "submit_ts",
+        "duration_sec",
+        "is_fast",
+        "is_flat_a",
+        "is_flat_b",
         "test_id",
         "model1_name",
         "model2_name",
     return ""
+def on_task_change(task_name: str, _state_pairs: List[Dict[str, str]], request: gr.Request,
+                   view_started_at: float, session_quota: int, reload_count: int, session_id: str):
     annotator_id = _extract_annotator_id(request)
+    if not annotator_id:
+        default_scores = [3, 3, 3, 3, 3, 3, 3, 3]
+        return (
+            [],
+            gr.update(value=0, minimum=0, maximum=0, visible=False),
+            gr.update(value=""),
+            gr.update(value=None),
+            gr.update(value=None),
+            gr.update(value=None),
+            *default_scores,
+            gr.update(value="请先登录你的 Hugging Face 账户后再开始评测。"),
+            float(datetime.utcnow().timestamp()),
+            BATCH_SIZE,
+            0,
+            session_id or str(uuid.uuid4()),
+        )
     pairs = load_task(task_name, annotator_id)
     # Defaults for A and B (8 sliders total)
     default_scores = [3, 3, 3, 3, 3, 3, 3, 3]
             gr.update(value=None),
             gr.update(value=None),
             *default_scores,
+            gr.update(value="当前没有待评对（或已达到最小标注阈值）。"),
+            float(datetime.utcnow().timestamp()),
+            BATCH_SIZE,
+            0,
+            session_id or str(uuid.uuid4()),
         )
     pair = pairs[0]
         _resolve_image_path(a_path),
         _resolve_image_path(b_path),
         *default_scores,
+        gr.update(value=f"本批次分配 {len(pairs)} 组；目标每对 {MIN_RATERS_PER_PAIR} 人。"),
+        float(datetime.utcnow().timestamp()),
+        BATCH_SIZE,
+        0,
+        session_id or str(uuid.uuid4()),
     )
+def on_pair_navigate(index: int, pairs: List[Dict[str, str]], view_started_at: float):
     if not pairs:
         # Gracefully no-op when no pairs
         return (
             gr.update(value=None),
             3, 3, 3, 3,  # A
             3, 3, 3, 3,  # B
+            float(datetime.utcnow().timestamp()),
         )
     index = int(index)
     index = max(0, min(index, len(pairs) - 1))
         _resolve_image_path(b_path),
         3, 3, 3, 3,  # A
         3, 3, 3, 3,  # B
+        float(datetime.utcnow().timestamp()),
     )
     b_semantic_score: int,
     b_overall_score: int,
     request: gr.Request,
+    view_started_at: float,
+    session_quota: int,
+    reload_count: int,
+    session_id: str,
 ):
     if not task_name:
         return (
             3, 3, 3, 3,
             3, 3, 3, 3,
             gr.update(value="Please select a task first."),
+            float(datetime.utcnow().timestamp()),
+            session_quota,
+            reload_count,
+            session_id,
         )
     if not pairs:
             3, 3, 3, 3,
             3, 3, 3, 3,
             gr.update(value="No pending pairs to submit."),
+            float(datetime.utcnow().timestamp()),
+            session_quota,
+            reload_count,
+            session_id,
         )
     # Resolve annotator id from request
     # Build record
     row = _build_eval_row(pair, score_map)
     row["annotator_id"] = annotator_id
+    # timing + heuristics
+    submit_ts = datetime.utcnow()
+    try:
+        started = datetime.utcfromtimestamp(float(view_started_at)) if view_started_at else submit_ts
+    except Exception:
+        started = submit_ts
+    duration = max(0.0, (submit_ts - started).total_seconds())
+    row["view_start_ts"] = started.isoformat()
+    row["submit_ts"] = submit_ts.isoformat()
+    row["duration_sec"] = round(duration, 3)
+    row["is_fast"] = bool(duration < FAST_MIN_SEC)
+    row["is_flat_a"] = bool(len({int(a_physical_score), int(a_optical_score), int(a_semantic_score), int(a_overall_score)}) == 1)
+    row["is_flat_b"] = bool(len({int(b_physical_score), int(b_optical_score), int(b_semantic_score), int(b_overall_score)}) == 1)
+    row["session_id"] = session_id or str(uuid.uuid4())
     # Idempotency: check if this pair already evaluated; if so, skip writing
     done_keys = _read_user_done_keys(task_name, annotator_id)
     info = f"{info_prefix} Local persistence " + ("succeeded" if ok_local else "skipped/failed") + "."
     info += " Dataset upload " + ("succeeded" if ok_hub else "failed") + (f" ({hub_msg})" if hub_msg else "") + "."
+    # Quota + reload
+    session_quota = max(0, int(session_quota) - 1)
+    reload_count = int(reload_count) + 1
+    # Periodic reload to absorb new results.csv / re-balance
+    if reload_count >= RELOAD_EVERY:
+        fresh_pairs = load_task(task_name, annotator_id)
+        remaining_pairs = fresh_pairs
+        reload_count = 0
+    if session_quota <= 0:
+        return (
+            [],
+            gr.update(value=0, minimum=0, maximum=0, visible=False),
+            gr.update(value=""),
+            gr.update(value=None),
+            gr.update(value=None),
+            gr.update(value=None),
+            3, 3, 3, 3,
+            3, 3, 3, 3,
+            gr.update(value=info + " 本批次已完成 20 组，请刷新页面获取下一批次。"),
+            float(datetime.utcnow().timestamp()),
+            session_quota,
+            reload_count,
+            row["session_id"],
+        )
     if remaining_pairs:
         next_index = min(index, len(remaining_pairs) - 1)
         pair = remaining_pairs[next_index]
             3, 3, 3, 3,
             3, 3, 3, 3,
             gr.update(value=info + f" Next pair ({next_index + 1}/{len(remaining_pairs)})."),
+            float(datetime.utcnow().timestamp()),
+            session_quota,
+            reload_count,
+            row["session_id"],
         )
     # No remaining pairs: clear UI, hide slider, and return updated empty state
         3, 3, 3, 3,
         3, 3, 3, 3,
         gr.update(value=info + " All pairs completed."),
+        float(datetime.utcnow().timestamp()),
+        session_quota,
+        reload_count,
+        row["session_id"],
     )
         )
     pair_state = gr.State([])
+    # Hidden states for session control and metrics
+    view_started_at_state = gr.State(0.0)
+    session_quota_state = gr.State(BATCH_SIZE)
+    reload_count_state = gr.State(0)
+    session_id_state = gr.State("")
     pair_header = gr.Markdown("")
         # Event bindings
         task_selector.change(
             fn=on_task_change,
+            inputs=[task_selector, pair_state, gr.Request(), view_started_at_state, session_quota_state, reload_count_state, session_id_state],
             outputs=[
                 pair_state,
                 index_slider,
                 b_semantic_input,
                 b_overall_input,
                 feedback_box,
+                view_started_at_state,
+                session_quota_state,
+                reload_count_state,
+                session_id_state,
             ],
         )
         index_slider.release(
             fn=on_pair_navigate,
+            inputs=[index_slider, pair_state, view_started_at_state],
             outputs=[
                 index_slider,
                 pair_header,
                 b_optical_input,
                 b_semantic_input,
                 b_overall_input,
+                view_started_at_state,
             ],
         )
                 b_optical_input,
                 b_semantic_input,
                 b_overall_input,
+                view_started_at_state,
+                session_quota_state,
+                reload_count_state,
+                session_id_state,
             ],
             outputs=[
                 pair_state,
                 b_semantic_input,
                 b_overall_input,
                 feedback_box,
+                view_started_at_state,
+                session_quota_state,
+                reload_count_state,
+                session_id_state,
             ],
         )
         # Auto-load default task on startup
         demo.load(
             fn=on_task_change,
+            inputs=[task_selector, pair_state, gr.Request(), view_started_at_state, session_quota_state, reload_count_state, session_id_state],
             outputs=[
                 pair_state,
                 index_slider,
                 b_semantic_input,
                 b_overall_input,
                 feedback_box,
+                view_started_at_state,
+                session_quota_state,
+                reload_count_state,
+                session_id_state,
             ],
         )