Spaces:

RedRocket
/

e6-visual-ratings

Running

App Files Files Community

rht

by RedHotTensors - opened 28 days ago

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+373

-468

Files changed (5) hide show

app.py +241 -304
compact_logs.py +41 -28
explorer.py +0 -6
stats_from_logs.py +32 -30
storage.py +59 -100

app.py CHANGED Viewed

@@ -5,9 +5,6 @@ import time
 import uuid
 import os
 import html
-import sys
-from typing import Callable
 import pandas as pd
 from huggingface_hub import hf_hub_download
@@ -32,161 +29,110 @@ _pool_path = hf_hub_download(
     token=RATINGS_APP_TOKEN
 )
 _pool_df = pd.read_parquet(_pool_path)
-_pool_df[["wins", "losses", "ties", "votes", "winrate"]] = (0, 0, 0, 0, 0.0)
-WINS_LOC = _pool_df.columns.get_loc("wins")
-LOSSES_LOC = _pool_df.columns.get_loc("losses")
-TIES_LOC = _pool_df.columns.get_loc("ties")
-VOTES_LOC = _pool_df.columns.get_loc("votes")
-WINRATE_LOC = _pool_df.columns.get_loc("winrate")
-_md5_to_idx = { md5: idx for idx, md5 in enumerate(_pool_df["md5"]) }
-_pool_lock = threading.Lock()
 _stats_last_loaded_at = 0.0
 _explorer_df = pd.DataFrame(columns=["group", "id", "md5", "rating", "sample_url", "image_url", "classifier", "classifier_score", "percentile"])
-def _load_stats() -> None:
-    VOTE_STORAGE.sync()
-    load_stats_by_md5(repo_id=POOL_REPO_ID, token=RATINGS_APP_TOKEN)
-    n_missing = 0
-    with _pool_lock:
-        VOTE_STORAGE.sync()
-        stats_by_key = load_stats_by_md5(repo_id=POOL_REPO_ID, token=RATINGS_APP_TOKEN)
-        for md5, stats in stats_by_key.items():
-            if (idx := _md5_to_idx.get(md5)) is not None:
-                _pool_df.iloc[idx, [WINS_LOC, LOSSES_LOC, TIES_LOC, VOTES_LOC, WINRATE_LOC]] = (
-                    stats.wins, stats.losses, stats.ties, stats.votes, stats.winrate
-                )
-            else:
-                n_missing += 1
-    if n_missing:
-        print(f"{n_missing} md5s have stats but are not in the pool!", file=sys.stderr)
-    classifier_scores_path = hf_hub_download(
-        repo_id=POOL_REPO_ID,
-        filename="classifier_scores.parquet",
-        repo_type="dataset",
-        token=RATINGS_APP_TOKEN,
-    )
-    validation_set_path = hf_hub_download(
-        repo_id=POOL_REPO_ID,
-        filename="validation_set.parquet",
-        repo_type="dataset",
-        token=RATINGS_APP_TOKEN,
-    )
-    validation_df = pd.read_parquet(
-        validation_set_path,
-        columns=["group", "id", "md5", "rating", "sample_url", "image_url"],
-    )
-    classifier_scores_df = pd.read_parquet(classifier_scores_path)
-    assert {"classifier", "md5", "classifier_score", "percentile"}.issubset(classifier_scores_df.columns), "classifier_scores.parquet missing expected columns"
-    classifier_scores_df = classifier_scores_df[["classifier", "md5", "classifier_score", "percentile"]]
-    classifier_scores_df["classifier"] = classifier_scores_df["classifier"].astype(str)
-    classifier_scores_df["md5"] = classifier_scores_df["md5"].astype(str)
-    validation_df["md5"] = validation_df["md5"].astype(str)
-    global _explorer_df
-    _explorer_df = validation_df.merge(classifier_scores_df, on="md5", how="left", validate="one_to_many")
-def _stats_reloader() -> None:
-    while True:
-        time.sleep(STATS_RELOAD_S)
-        _load_stats()
-_load_stats()
-threading.Thread(target=_stats_reloader, daemon=True).start()
-def _pick_from(df: pd.DataFrame, *, weights: pd.Series | None = None) -> tuple[pd.Series, pd.Series, int] | None:
-    if len(df) < 2:
-        return None
-    sample = df.sample(2, weights=weights, replace=False)
-    return sample.iloc[0], sample.iloc[1], len(df)
-def _pick_similar(
-    df: pd.DataFrame,
-    distance: Callable[[pd.DataFrame, pd.Series], pd.Series],
-    *,
-    weights: Callable[[pd.DataFrame], pd.Series] | None = None,
-    other_df: pd.DataFrame | None = None,
-) -> tuple[pd.Series, pd.Series, int] | None:
-    if len(df) < 2:
-        return None
-    if other_df is None:
-        other_df = df
-    elif len(other_df) < 2:
-        return None
-    weight_vals: pd.Series | None = None
-    if weights is not None:
-        weight_vals = weights(df)
-    first = df.sample(weights=weight_vals).iloc[0]
-    weight_vals = 1.0 / (1.0 + distance(other_df, first))
-    while True:
-        other = other_df.sample(weights=weight_vals).iloc[0]
-        if other["md5"] != first["md5"]:
-            return first, other, len(df)
-def _pool_fetch_pair(group: str) -> tuple[pd.Series, pd.Series, int, str]:
-    gdf = _pool_df[_pool_df["group"] == group]
-    voted = gdf[gdf["votes"] > 0]
-    votes = voted["votes"]
-    # Pair first-time winners.
-    picked = _pick_from(voted[(votes == 1) & (voted["wins"] == 1)])
-    if picked is not None:
-        return *picked, "new-winners"
-    # Pair first-time losers.
-    picked = _pick_from(voted[(votes == 1) & (voted["losses"] == 1)])
-    if picked is not None:
-        return *picked, "new-losers"
-    def record_distance(df: pd.DataFrame, pivot: pd.Series) -> pd.Series:
-        return (
-            (df["wins"] - pivot["wins"])**2 +
-            (df["losses"] - pivot["losses"])**2
-        )**0.75 # L2 is a bit too loose
-    # Link cliques to main network and break ties.
-    nonties = votes - voted["ties"]
-    picked = _pick_similar(
-        voted[(nonties == 0) | (votes == 2)],
-        record_distance,
-        other_df=voted[nonties > 3],
-    )
-    if picked is not None:
-        return *picked, "sparse"
-    # Introduce new images.
-    if len(voted) < 8 or random.random() < 0.33:
-        unvoted = gdf[gdf["votes"] == 0]
-        match len(unvoted):
-            case 0:
-                pass
-            case 1:
-                return unvoted.iloc[0], voted.iloc[0], 1, "new"
-            case _:
-                picked = _pick_from(unvoted)
-                assert picked is not None
-                return *picked, "new"
-    # Vote-weighted random sampling between similar winrates, slightly biased against picking losers.
-    picked = _pick_similar(
-        voted, record_distance,
-        weights=lambda df: 1.0 / (df["votes"]**1.25 + 0.1 * df["losses"]),
-    )
-    assert picked is not None
-    return *picked, "fair-probe"
 def _row_image_url(row) -> str:
     sample_url = row.get("sample_url")
@@ -202,47 +148,89 @@ DATASETS: dict[str, dict] = {
         "fetch_pair": _pool_fetch_pair,
         "get_id": lambda row: row["md5"],
         "get_image": _row_image_url,
-        "groups": sorted(_pool_df["group"].unique()),
     },
 }
 DEFAULT_DATASET = list(DATASETS.keys())[0]
-def _format_rating_post_title(post_id: int, votes: int, label: str) -> str:
-    return f"<strong>{label}</strong>: <a href=\"https://e621.net/posts/{post_id}\" target=\"_blank\" rel=\"noreferrer\">Post #{post_id}</a> | {votes} {'Vote' if votes == 1 else 'Votes'}"
 def _render_current(state: dict, submit_status: str = "") -> tuple:
-    votes_a = _pool_df.iloc[_md5_to_idx[state["key_a"]], VOTES_LOC]
-    votes_b = _pool_df.iloc[_md5_to_idx[state["key_b"]], VOTES_LOC]
-    title_a = _format_rating_post_title(state["id_a"], votes_a, "Image A")
-    title_b = _format_rating_post_title(state["id_b"], votes_b, "Image B")
-    img_a_html = f"<div class=\"rating-card\"><div class=\"rating-card-title\">{title_a}</div><div class=\"rating-image-frame\"><img src=\"{html.escape(state['url_a'])}\" class=\"rating-image\" loading=\"eager\" referrerpolicy=\"no-referrer\"></div></div>"
-    img_b_html = f"<div class=\"rating-card\"><div class=\"rating-card-title\">{title_b}</div><div class=\"rating-image-frame\"><img src=\"{html.escape(state['url_b'])}\" class=\"rating-image\" loading=\"eager\" referrerpolicy=\"no-referrer\"></div></div>"
-    can_go_back = bool(state.get("pending", ()))
-    pair_details = f"/ {state['group']} / {state.get('pair_reason', 'unknown')}"
-    return img_a_html, img_b_html, gr.Button(interactive=can_go_back), html.escape(pair_details), html.escape(submit_status), state
 def _normalize_rating_pref(pref: str | None) -> str:
     return pref if pref in ("safe", "all") else "safe"
-def _initial_load(state: dict, rating_pref: str | None, submit_key: str | None, image_height: str, groups: list[str]):
-    rating_pref = _normalize_rating_pref(rating_pref)
     submit_key = _normalize_submit_key(submit_key)
-    return rating_pref, submit_key, image_height, image_height, groups, *new_round(DEFAULT_DATASET, groups, state)
-def _on_groups_change(groups: list[str], state: dict):
-    return *new_round(DEFAULT_DATASET, groups, state), groups
-def _on_image_height_change(image_height: str) -> tuple[str, str]:
-    return image_height, image_height
 def _normalize_submit_key(submit_key: str | None) -> str:
-    return (submit_key or "").strip()
 def _filtered_explorer_df(rating_pref: str) -> pd.DataFrame:
     return _filtered_explorer_df_by_classifier(rating_pref, ALLOWED_CLASSIFIER_FILTERS[0])
 def _filtered_explorer_df_by_classifier(rating_pref: str, classifier_name: str) -> pd.DataFrame:
     if rating_pref == "all":
         rating_filtered = _explorer_df
@@ -252,10 +240,12 @@ def _filtered_explorer_df_by_classifier(rating_pref: str, classifier_name: str)
     assert classifier_name in ALLOWED_CLASSIFIER_FILTERS, f"Unsupported classifier filter: {classifier_name}"
     return rating_filtered[rating_filtered["classifier"] == classifier_name]
-def _load_results(rating_pref: str, sort_mode: str, classifier_filter: str):
-    rating_pref = _normalize_rating_pref(rating_pref)
-    sort_mode = _normalize_sort_mode(sort_mode)
-    classifier_name = _normalize_classifier_filter(classifier_filter)
     filtered_explorer_df = _filtered_explorer_df_by_classifier(rating_pref, classifier_name)
     summary, score_distribution_plot, distribution_data, gallery_items, page_meta, next_offset, btn_update = build_results_data(
         filtered_explorer_df,
@@ -266,11 +256,13 @@ def _load_results(rating_pref: str, sort_mode: str, classifier_filter: str):
     )
     return summary, score_distribution_plot, distribution_data, gallery_items, btn_update, "Click an image to reveal its ID and link.", page_meta, next_offset
 def _normalize_sort_mode(sort_mode: str | None) -> str:
     if sort_mode in ("Default", "Rating: Low to High", "Rating: High to Low"):
         return sort_mode
     return "Default"
 def _normalize_classifier_filter(classifier_name: str | None) -> str:
     if classifier_name in ALLOWED_CLASSIFIER_FILTERS:
         return str(classifier_name)
@@ -278,22 +270,23 @@ def _normalize_classifier_filter(classifier_name: str | None) -> str:
 # -- Gradio callbacks -------------------------------------------------------
-def new_round(dataset_name: str, groups: list[str], state: dict) -> tuple:
-    if not groups:
-        return "", "", gr.skip(), "", "Please select at least one group.", state
     cfg = DATASETS[dataset_name]
     group = random.choice(groups)
-    row_a, row_b, reason_remaining, pair_reason = cfg["fetch_pair"](group)
-    pair_reason = f"{pair_reason} ({reason_remaining})"
     state.setdefault("session_id", uuid.uuid4().hex)
     key_a = cfg["get_id"](row_a)
     key_b = cfg["get_id"](row_b)
     id_a = int(row_a["id"])
     id_b = int(row_b["id"])
-    state.update(dataset=dataset_name, key_a=key_a, key_b=key_b, id_a=id_a, id_b=id_b, group=group, pair_reason=pair_reason)
     url_a = cfg["get_image"](row_a)
     url_b = cfg["get_image"](row_b)
     state["url_a"] = url_a
@@ -302,9 +295,8 @@ def new_round(dataset_name: str, groups: list[str], state: dict) -> tuple:
 def _queue_decision(winner: str | None, state: dict):
     assert state.get("session_id"), "Missing session_id: refusing to record vote"
-    pending = state.setdefault("pending", [])
-    pending.append({
         "winner": winner,
         "key_a": state["key_a"],
         "key_b": state["key_b"],
@@ -313,86 +305,53 @@ def _queue_decision(winner: str | None, state: dict):
         "url_a": state["url_a"],
         "url_b": state["url_b"],
         "dataset": state["dataset"],
         "group": state["group"],
         "pair_reason": state.get("pair_reason", ""),
         "session_id": state["session_id"],
-    })
-    if len(pending) > 1:
-        VOTE_STORAGE.queue_row(pending.pop(0))
-def _add_vote(idx: int, col_loc: int, delta: int = 1) -> None:
-    _pool_df.iloc[idx, [col_loc, VOTES_LOC]] += delta
-    wins, ties, votes = _pool_df.iloc[idx, [WINS_LOC, TIES_LOC, VOTES_LOC]]
-    _pool_df.iloc[idx, WINRATE_LOC] = (wins + 0.5 * ties) / max(votes, 1)
-def vote(winner: str | None, state: dict, groups: list[str], submit_key: str | None) -> tuple:
     if _normalize_submit_key(submit_key) != SUBMIT_KEY:
         return _render_current(state, "Wrong submission key.")
-    if not groups:
-        return "", "", gr.skip(), "", "Please select at least one group.", state
     _queue_decision(winner, state)
-    a_idx = _md5_to_idx[state["key_a"]]
-    b_idx = _md5_to_idx[state["key_b"]]
-    with _pool_lock:
-        match winner:
-            case "A":
-                _add_vote(a_idx, WINS_LOC)
-                _add_vote(b_idx, LOSSES_LOC)
-            case "B":
-                _add_vote(a_idx, LOSSES_LOC)
-                _add_vote(b_idx, WINS_LOC)
-            case None:
-                _add_vote(a_idx, TIES_LOC)
-                _add_vote(b_idx, TIES_LOC)
-            case _:
-                raise AssertionError
-    return new_round(state["dataset"], groups, state)
 def go_back(state: dict) -> tuple:
     pending = state.setdefault("pending", [])
-    if pending:
-        last = pending.pop()
-        state.update(
-            dataset=last["dataset"],
-            key_a=last["key_a"],
-            key_b=last["key_b"],
-            id_a=last["id_a"],
-            id_b=last["id_b"],
-            url_a=last["url_a"],
-            url_b=last["url_b"],
-            group=last["group"],
-            pair_reason=last.get("pair_reason", ""),
-        )
-        a_idx = _md5_to_idx[state["key_a"]]
-        b_idx = _md5_to_idx[state["key_b"]]
-        with _pool_lock:
-            match last["winner"]:
-                case "A":
-                    _add_vote(a_idx, WINS_LOC, -1)
-                    _add_vote(b_idx, LOSSES_LOC, -1)
-                case "B":
-                    _add_vote(a_idx, LOSSES_LOC, -1)
-                    _add_vote(b_idx, WINS_LOC, -1)
-                case None:
-                    _add_vote(a_idx, TIES_LOC, -1)
-                    _add_vote(b_idx, TIES_LOC, -1)
-                case _:
-                    raise AssertionError
     return _render_current(state)
 # -- UI ---------------------------------------------------------------------
 with gr.Blocks(
-    title="e621 Visual Ratings",
     head="""
     <script>
     const VOTE_COOLDOWN_MS = 1500;
@@ -475,15 +434,9 @@ with gr.Blocks(
         } else if ((e.key === 'z' || e.key === 'Z') && (e.ctrlKey || e.metaKey) && ratingTabActive) {
             e.preventDefault();
             backButton?.click();
-        } else if (e.key === 'ArrowDown') {
-            if (ratingTabActive) {
-                e.preventDefault();
-                backButton?.click();
-            }
-            if (resultsTabActive) {
-                e.preventDefault();
-                resultsLoadMoreButton?.click();
-            }
         }
     });
     document.addEventListener('click', function (e) {
@@ -543,6 +496,7 @@ with gr.Blocks(
     }
     .rating-image-frame {
         width: 100%;
         border: 1px solid #e6e6e6;
         border-radius: 8px;
         background: #333;
@@ -604,23 +558,13 @@ with gr.Blocks(
         object-fit: contain !important;
         background: #1f2937;
     }
-    a {
-        padding: 0 !important;
-    }
     """,
-    fill_width=True,
 ) as demo:
     state = gr.State({})
     rating_pref_store = gr.BrowserState(default_value="safe", storage_key="rating_pref")
     submit_key_store = gr.BrowserState(default_value="", storage_key="submit_key")
     results_sort_store = gr.BrowserState(default_value="Default", storage_key="results_sort_mode")
     results_classifier_store = gr.BrowserState(default_value=ALLOWED_CLASSIFIER_FILTERS[0], storage_key="results_classifier")
-    image_height_store = gr.BrowserState(default_value=768, storage_key="image_height")
-    groups_store = gr.BrowserState(default_value=[
-        group
-        for group in DATASETS[DEFAULT_DATASET]["groups"]
-        if group.endswith("_safe")
-    ], storage_key="groups")
     with gr.Tabs():
         with gr.Tab("Image Quality Rater"):
@@ -630,41 +574,38 @@ with gr.Blocks(
                 img_a = gr.HTML(elem_id="img-a")
                 img_b = gr.HTML(elem_id="img-b")
-            with gr.Row(equal_height=True):
-                btn_a = gr.Button("⬅️ Prefer A", variant="primary", elem_id="btn-vote-a")
-                with gr.Column(scale=0), gr.Group():
-                    btn_skip = gr.Button("⬆️ Same Quality", elem_id="btn-skip")
-                    btn_back_action = gr.Button("⬇️ Undo", elem_id="btn-back-action")
-                btn_b = gr.Button("➡️ Prefer B", variant="primary", elem_id="btn-vote-b")
             with gr.Accordion("Settings", open=False):
-                groups_select = gr.CheckboxGroup(
-                    choices=DATASETS[DEFAULT_DATASET]["groups"],
-                    label="Categories",
-                    show_label=True,
-                    show_select_all=True
-                )
-                image_height_slider = gr.Slider(
-                    minimum=512, maximum=2048, step=16, precision=0,
-                    label="Image Size",
                 )
                 submit_key_tb = gr.Textbox(
                     value="",
                     type="password",
-                    label="Submit Key",
                     elem_id="submit-key",
                 )
-            pair_details = gr.HTML(html_template="Dataset: <a href='https://huggingface.co/datasets/taigasan/e6-visual-ratings' target='_blank' rel='noopener noreferrer'>taigasan/e6-visual-ratings</a> ${value}")
-            submit_status = gr.HTML(html_template="<span class='submit-status-msg'>${value}</span>")
-            gr.HTML("<span class='subtle-note'>Keyboard Shortcuts: ⬅️ Vote A, ⬆️ Same Quality, ➡️ Vote B, ⬇️ or Ctrl+Z Undo</span>")
-            image_height = gr.HTML(html_template="<style>.rating-image-frame { height:${value}px; }</style>", apply_default_css=False)
         (
             results_summary_md,
-            results_rating_dd,
             results_sort_dd,
             results_classifier_dd,
             results_score_distribution_plot,
@@ -676,7 +617,7 @@ with gr.Blocks(
             results_page_offset_state,
         ) = add_results_tab(_pool_df)
-    outputs = [img_a, img_b, btn_back_action, pair_details, submit_status, state]
     results_outputs = [
         results_summary_md,
         results_score_distribution_plot,
@@ -688,26 +629,22 @@ with gr.Blocks(
         results_page_offset_state,
     ]
-    btn_a.click(fn=lambda s, g, k: vote("A", s, g, k), inputs=[state, groups_store, submit_key_store], outputs=outputs, queue=False, show_progress="hidden")
-    btn_b.click(fn=lambda s, g, k: vote("B", s, g, k), inputs=[state, groups_store, submit_key_store], outputs=outputs, queue=False, show_progress="hidden")
-    btn_skip.click(fn=lambda s, g, k: vote(None, s, g, k), inputs=[state, groups_store, submit_key_store], outputs=outputs, queue=False, show_progress="hidden")
     btn_back_action.click(fn=go_back, inputs=[state], outputs=outputs, queue=False, show_progress="hidden")
     submit_key_tb.change(fn=_normalize_submit_key, inputs=[submit_key_tb], outputs=[submit_key_store], queue=False, show_progress="hidden")
-    groups_select.change(fn=_on_groups_change, inputs=[groups_select, state], outputs=[*outputs, groups_store], queue=False, show_progress="hidden")
-    image_height_slider.change(fn=_on_image_height_change, inputs=[image_height_slider], outputs=[image_height_store, image_height], queue=False, show_progress="hidden")
-    results_rating_dd.change(fn=_normalize_rating_pref, inputs=[results_rating_dd], outputs=[rating_pref_store], queue=False, show_progress="hidden")
-    results_rating_dd.change(fn=_load_results, inputs=[results_rating_dd, results_sort_store, results_classifier_store], outputs=results_outputs, queue=False, show_progress="hidden")
     results_sort_dd.change(fn=_normalize_sort_mode, inputs=[results_sort_dd], outputs=[results_sort_store], queue=False, show_progress="hidden")
     results_sort_dd.change(fn=_load_results, inputs=[rating_pref_store, results_sort_dd, results_classifier_store], outputs=results_outputs, queue=False, show_progress="hidden")
     results_classifier_dd.change(fn=_normalize_classifier_filter, inputs=[results_classifier_dd], outputs=[results_classifier_store], queue=False, show_progress="hidden")
     results_classifier_dd.change(fn=_load_results, inputs=[rating_pref_store, results_sort_store, results_classifier_dd], outputs=results_outputs, queue=False, show_progress="hidden")
-    demo.load(fn=_initial_load, inputs=[state, rating_pref_store, submit_key_store, image_height_store, groups_store], outputs=[results_rating_dd, submit_key_tb, image_height_slider, image_height, groups_select, *outputs], queue=False, show_progress="hidden")
     demo.load(fn=_load_results, inputs=[rating_pref_store, results_sort_store, results_classifier_store], outputs=results_outputs, queue=False, show_progress="hidden")
     demo.load(fn=_normalize_sort_mode, inputs=[results_sort_store], outputs=[results_sort_dd], queue=False, show_progress="hidden")
     demo.load(fn=_normalize_classifier_filter, inputs=[results_classifier_store], outputs=[results_classifier_dd], queue=False, show_progress="hidden")
     results_load_more_btn.click(
         fn=lambda r, s, c, o: load_more_results(_filtered_explorer_df_by_classifier(_normalize_rating_pref(r), _normalize_classifier_filter(c)), _explorer_df, s, o),
         inputs=[rating_pref_store, results_sort_store, results_classifier_store, results_page_offset_state],

 import uuid
 import os
 import html
 import pandas as pd
 from huggingface_hub import hf_hub_download
     token=RATINGS_APP_TOKEN
 )
 _pool_df = pd.read_parquet(_pool_path)
+_pool_group_dfs = {g: gdf for g, gdf in _pool_df.groupby("group")}
+_stats_lock = threading.Lock()
 _stats_last_loaded_at = 0.0
+_stats_by_key: dict[str, tuple[int, int]] = {}
 _explorer_df = pd.DataFrame(columns=["group", "id", "md5", "rating", "sample_url", "image_url", "classifier", "classifier_score", "percentile"])
+def _reload_stats_if_due(force: bool = False):
     # Refresh the module-level caches `_stats_by_key` and `_explorer_df` from POOL_REPO_ID.
     # Returns without doing anything when the last refresh started less than STATS_RELOAD_S
     # seconds ago, unless `force` is true.
     # NOTE(review): `_stats_by_key` is replaced wholesale, so increments made by
     # `_apply_local_stats_update` survive only if the reloaded stats already contain them.
     # NOTE(review): the downloads below run while `_stats_lock` is held, and
     # `_apply_local_stats_update` acquires the same lock.
+    global _stats_last_loaded_at, _stats_by_key, _explorer_df
+    now = time.time()
     # First check runs without the lock so callers that are not due return immediately.
+    if not force and (now - _stats_last_loaded_at) < STATS_RELOAD_S:
+        return
+    with _stats_lock:
         # Check again under the lock: another thread may have completed a reload while this one waited.
+        now = time.time()
+        if not force and (now - _stats_last_loaded_at) < STATS_RELOAD_S:
+            return
+        _stats_by_key = load_stats_by_md5(
+            repo_id=POOL_REPO_ID,
+            token=RATINGS_APP_TOKEN,
+        )
         # Fetch the two parquet files that back `_explorer_df`.
+        classifier_scores_path = hf_hub_download(
+            repo_id=POOL_REPO_ID,
+            filename="classifier_scores.parquet",
+            repo_type="dataset",
+            token=RATINGS_APP_TOKEN,
+        )
+        validation_set_path = hf_hub_download(
+            repo_id=POOL_REPO_ID,
+            filename="validation_set.parquet",
+            repo_type="dataset",
+            token=RATINGS_APP_TOKEN,
+        )
         # Only these six columns of the validation set are read.
+        validation_df = pd.read_parquet(
+            validation_set_path,
+            columns=["group", "id", "md5", "rating", "sample_url", "image_url"],
+        )
+        classifier_scores_df = pd.read_parquet(classifier_scores_path)
+        assert {"classifier", "md5", "classifier_score", "percentile"}.issubset(classifier_scores_df.columns), "classifier_scores.parquet missing expected columns"
+        classifier_scores_df = classifier_scores_df[["classifier", "md5", "classifier_score", "percentile"]]
         # Cast the key columns to str on both sides before merging on "md5".
+        classifier_scores_df["classifier"] = classifier_scores_df["classifier"].astype(str)
+        classifier_scores_df["md5"] = classifier_scores_df["md5"].astype(str)
+        validation_df["md5"] = validation_df["md5"].astype(str)
         # Left join keeps every validation row; validate="one_to_many" asks pandas to
         # reject duplicate md5 values on the validation side.
+        _explorer_df = validation_df.merge(classifier_scores_df, on="md5", how="left", validate="one_to_many")
         # Stores the timestamp taken right after the lock was acquired, not the completion time.
+        _stats_last_loaded_at = now
 # Populate the caches once at import time.
+_reload_stats_if_due(force=True)
+def _pool_fetch_pair(group_name: str) -> tuple:
     # Pick two rows of the given group to compare.
     # Returns (row_a, row_b, reason), where `reason` is a short label naming the rule that chose the pair.
     # Raises KeyError for an unknown group and AssertionError when the group has fewer than two rows.
     # NOTE(review): `_stats_by_key` is read here without taking `_stats_lock`.
+    gdf = _pool_group_dfs[group_name]
+    assert len(gdf) >= 2, f"Not enough rows for group: {group_name}"
+    md5_keys = gdf["md5"].astype(str)
     # Per-row win and loss counts; md5s with no recorded stats count as (0, 0).
+    wins = md5_keys.map(lambda k: _stats_by_key.get(k, (0, 0))[0])
+    losses = md5_keys.map(lambda k: _stats_by_key.get(k, (0, 0))[1])
+    def _pick_from_mask(mask: pd.Series):
         # Sample two distinct rows from the rows selected by `mask`, or return None if fewer than two match.
+        candidate_df = gdf[mask]
+        if len(candidate_df) < 2:
+            return None
+        sample = candidate_df.sample(2, replace=False)
+        return sample.iloc[0], sample.iloc[1]
     # Planned selection order. Only steps 2, 3, 4 and 7 are implemented below,
     # followed by a fallback labelled 8 that is not part of this list.
+    # 1) Repeat the lowest-margin edge participating in a cycle. (To prevent deadlock, stop if all margins are 4+.)
+    #   a) If deadlocked on a cycle with 4+ images and no inner cycles, sample a random missing edge inside the cycle.
+    # 2) Pair images that both have wins only. (One of them will lose/tie. Stop when there is only one left.)
+    # 3) Pair images that both have losses only. (One of them will win/tie. Stop when there is only one left.)
+    # 4) Pair images with only 2 edges.
+    # 5) X% chance, re-sample an existing edge, inversely proportional to existing number of samples.
+    # 6) Y% chance, sample a random missing edge between images already sampled.
+    # 7) Pair an unsampled image with a random sampled image.
+    # 2) Pair images that currently have wins-only records.
+    picked = _pick_from_mask((wins > 0) & (losses == 0))
+    if picked is not None:
+        return picked[0], picked[1], "wins-only"
+    # 3) Pair images that currently have losses-only records.
+    picked = _pick_from_mask((wins == 0) & (losses > 0))
+    if picked is not None:
+        return picked[0], picked[1], "losses-only"
+    # 4) Pair images that currently have exactly 2 total edges.
     # "Total" here is wins + losses only; nothing else from the stats tuple is read.
+    vote_totals = wins + losses
+    picked = _pick_from_mask(vote_totals == 2)
+    if picked is not None:
+        return picked[0], picked[1], "total_votes=2"
+    # 7) Prefer pairing an unsampled image with a random previously sampled image.
+    unsampled_mask = vote_totals == 0
+    if unsampled_mask.any():
+        unsampled_row = gdf[unsampled_mask].sample(1).iloc[0]
+        sampled_df = gdf[~unsampled_mask]
+        if len(sampled_df) >= 1:
+            sampled_row = sampled_df.sample(1).iloc[0]
+        else:
             # Nothing in the group has votes yet: pair with any other row, excluding the one just picked.
+            sampled_row = gdf.drop(index=unsampled_row.name).sample(1).iloc[0]
+        return unsampled_row, sampled_row, "unsampled+sampled"
+    # 8) Safety fall back to low-vote weighted sampling.
     # Weight is 1 / (votes + 1), so rows with fewer votes are more likely to be drawn.
+    sample_weights = 1.0 / (vote_totals + 1.0)
+    sample = gdf.sample(2, weights=sample_weights, replace=False)
+    return sample.iloc[0], sample.iloc[1], "low-vote"
 def _row_image_url(row) -> str:
     sample_url = row.get("sample_url")
         "fetch_pair": _pool_fetch_pair,
         "get_id": lambda row: row["md5"],
         "get_image": _row_image_url,
+        "groups": {g: g for g in sorted(_pool_df["group"].unique())},
     },
 }
 DEFAULT_DATASET = list(DATASETS.keys())[0]
+def _select_groups(cfg: dict, rating_pref: str) -> list[str]:
     # Return the group names from cfg["groups"] that match the rating preference:
     # every group for "all", otherwise only names ending in "_<rating_pref>".
+    groups = list(cfg["groups"].keys())
+    if rating_pref == "all":
+        return groups
+    return [g for g in groups if g.endswith(f"_{rating_pref}")]
+def _commit_oldest_pending(state: dict):
     # When state["pending"] holds more than one decision, remove the oldest one and commit it:
     # update the in-memory stats for an "A"/"B" winner and hand the row to VOTE_STORAGE.
     # With zero or one pending decision nothing is committed.
     # NOTE(review): presumably the entry left in the list is kept so it can still be undone — confirm against go_back.
+    pending = state.setdefault("pending", [])
+    if len(pending) <= 1:
+        return
+    oldest = pending.pop(0)
     # Any other winner value (e.g. None) leaves the local win/loss counts untouched.
+    if oldest.get("winner") in ("A", "B"):
+        _apply_local_stats_update(oldest["winner"], oldest["key_a"], oldest["key_b"])
     # The storage call runs on a daemon thread and receives a copy of the row.
+    threading.Thread(target=VOTE_STORAGE.append_vote_row, args=(oldest.copy(), oldest.get("winner")), daemon=True).start()
+def _apply_local_stats_update(winner: str, key_a: str, key_b: str):
     # Record one decided comparison in the in-memory `_stats_by_key` cache:
     # the winner's entry gains a win and the other entry gains a loss.
     # Entries are (wins, losses) tuples keyed by str(key); missing keys start at (0, 0).
     # Raises AssertionError when `winner` is neither "A" nor "B".
+    assert winner in ("A", "B")
     # Both entries are read and written under `_stats_lock`.
+    with _stats_lock:
+        wins_a, losses_a = _stats_by_key.get(str(key_a), (0, 0))
+        wins_b, losses_b = _stats_by_key.get(str(key_b), (0, 0))
+        if winner == "A":
+            _stats_by_key[str(key_a)] = (wins_a + 1, losses_a)
+            _stats_by_key[str(key_b)] = (wins_b, losses_b + 1)
+        else:
+            _stats_by_key[str(key_a)] = (wins_a, losses_a + 1)
+            _stats_by_key[str(key_b)] = (wins_b + 1, losses_b)
+def _format_rating_post_row(post_id: int, wins: int, losses: int, label: str | None = None) -> str:
     # Build the text "<post url> | Times rated: <wins + losses>" for one post,
     # prefixed with "<label>: " when a non-empty label is given.
+    total_votes = wins + losses
+    url = f"https://e621.net/posts/{post_id}"
+    row = f"{url} | Times rated: {total_votes}"
+    return f"{label}: {row}" if label else row
 def _render_current(state: dict, submit_status: str = "") -> tuple:
     # Build the display values for the pair currently stored in `state`.
     # Returns a 9-tuple: (img_a_html, img_b_html, link_a, link_b, back_md, group_md,
     # pair_reason_md, status_md, state).
     # Reads state["key_a"], ["key_b"], ["url_a"], ["url_b"], ["id_a"], ["id_b"] and ["group"];
     # a missing key raises KeyError.
+    _reload_stats_if_due()
     # Keys without recorded stats are shown as (0, 0).
+    wins_a, losses_a = _stats_by_key.get(str(state["key_a"]), (0, 0))
+    wins_b, losses_b = _stats_by_key.get(str(state["key_b"]), (0, 0))
+    title_a = "Image A"
+    title_b = "Image B"
+    img_a_html = f"<div class=\"rating-card\"><div class=\"rating-card-title\"><strong>{html.escape(title_a)}</strong></div><div class=\"rating-image-frame\"><img src=\"{html.escape(state['url_a'])}\" class=\"rating-image\" loading=\"eager\" referrerpolicy=\"no-referrer\"></div></div>"
+    img_b_html = f"<div class=\"rating-card\"><div class=\"rating-card-title\"><strong>{html.escape(title_b)}</strong></div><div class=\"rating-image-frame\"><img src=\"{html.escape(state['url_b'])}\" class=\"rating-image\" loading=\"eager\" referrerpolicy=\"no-referrer\"></div></div>"
+    link_a = _format_rating_post_row(state["id_a"], wins_a, losses_a, label="Image A")
+    link_b = _format_rating_post_row(state["id_b"], wins_b, losses_b, label="Image B")
+    can_go_back = bool(state.get("can_go_back"))
     # The undo control is a "#back" link when undo is available and a plain disabled span otherwise.
+    back_md = "[Undo Rating (Ctrl+z)](#back)" if can_go_back else "<span class='subtle-back-link-disabled'>Undo Rating (Ctrl+z)</span>"
     # NOTE(review): state['group'] is interpolated without html.escape, unlike pair_reason and submit_status below.
+    group_md = f"<span class='subtle-note'>Group: {state['group']}</span>"
+    pair_reason = state.get("pair_reason", "")
     # The pair-reason and status spans are emitted only when there is text to show.
+    pair_reason_md = f"<span class='subtle-note'>Pair: {html.escape(pair_reason)}</span>" if pair_reason else ""
+    status_md = f"<span class='submit-status-msg'>{html.escape(submit_status)}</span>" if submit_status else ""
+    return img_a_html, img_b_html, link_a, link_b, back_md, group_md, pair_reason_md, status_md, state
 def _normalize_rating_pref(pref: str | None) -> str:
     # Accept only "safe" or "all"; any other value, including None, becomes "safe".
     return pref if pref in ("safe", "all") else "safe"
+def _initial_load(state: dict, pref: str | None, submit_key: str | None):
     # Normalize the stored rating preference and submit key, then start a round on DEFAULT_DATASET.
     # Returns (rating_pref, submit_key, *values returned by new_round).
+    rating_pref = _normalize_rating_pref(pref)
     submit_key = _normalize_submit_key(submit_key)
+    return rating_pref, submit_key, *new_round(DEFAULT_DATASET, rating_pref, state)
+def _on_rating_change(rating_pref: str, state: dict):
     # Start a new round on DEFAULT_DATASET with the normalized rating preference.
     # Returns the values from new_round followed by the normalized preference as the last element.
+    rating_pref = _normalize_rating_pref(rating_pref)
+    return *new_round(DEFAULT_DATASET, rating_pref, state), rating_pref
 def _normalize_submit_key(submit_key: str | None) -> str:
     # Map None (or any other falsy value) to ""; every other value is returned unchanged.
     # NOTE(review): the version this line replaces (earlier in this diff) also stripped
     # surrounding whitespace; confirm that dropping .strip() is intended.
+    return submit_key or ""
 def _filtered_explorer_df(rating_pref: str) -> pd.DataFrame:
     # Shorthand for _filtered_explorer_df_by_classifier using the first entry of ALLOWED_CLASSIFIER_FILTERS.
     return _filtered_explorer_df_by_classifier(rating_pref, ALLOWED_CLASSIFIER_FILTERS[0])
 def _filtered_explorer_df_by_classifier(rating_pref: str, classifier_name: str) -> pd.DataFrame:
     if rating_pref == "all":
         rating_filtered = _explorer_df
     assert classifier_name in ALLOWED_CLASSIFIER_FILTERS, f"Unsupported classifier filter: {classifier_name}"
     return rating_filtered[rating_filtered["classifier"] == classifier_name]
+def _load_results(rating_pref_value: str, sort_mode_value: str, classifier_filter_value: str):
+    rating_pref = _normalize_rating_pref(rating_pref_value)
+    sort_mode = _normalize_sort_mode(sort_mode_value)
+    classifier_name = _normalize_classifier_filter(classifier_filter_value)
+    _reload_stats_if_due()
     filtered_explorer_df = _filtered_explorer_df_by_classifier(rating_pref, classifier_name)
     summary, score_distribution_plot, distribution_data, gallery_items, page_meta, next_offset, btn_update = build_results_data(
         filtered_explorer_df,
     )
     return summary, score_distribution_plot, distribution_data, gallery_items, btn_update, "Click an image to reveal its ID and link.", page_meta, next_offset
 def _normalize_sort_mode(sort_mode: str | None) -> str:
+    """Return ``sort_mode`` when it is one of the three known modes; otherwise return "Default"."""
     if sort_mode in ("Default", "Rating: Low to High", "Rating: High to Low"):
         return sort_mode
+    # None and any unrecognized value fall back to the default ordering.
     return "Default"
 def _normalize_classifier_filter(classifier_name: str | None) -> str:
     if classifier_name in ALLOWED_CLASSIFIER_FILTERS:
         return str(classifier_name)
 # -- Gradio callbacks -------------------------------------------------------
+def new_round(dataset_name: str, rating_pref: str, state: dict) -> tuple:
     cfg = DATASETS[dataset_name]
+    groups = _select_groups(cfg, rating_pref)
+    assert groups, f"No groups for rating preference: {rating_pref}"
     group = random.choice(groups)
+    pair_data = cfg["fetch_pair"](cfg["groups"][group])
+    if len(pair_data) == 3:
+        row_a, row_b, pair_reason = pair_data
+    else:
+        row_a, row_b = pair_data
+        pair_reason = ""
     state.setdefault("session_id", uuid.uuid4().hex)
     key_a = cfg["get_id"](row_a)
     key_b = cfg["get_id"](row_b)
     id_a = int(row_a["id"])
     id_b = int(row_b["id"])
+    state.update(dataset=dataset_name, rating_pref=rating_pref, key_a=key_a, key_b=key_b, id_a=id_a, id_b=id_b, group=group, pair_reason=pair_reason)
     url_a = cfg["get_image"](row_a)
     url_b = cfg["get_image"](row_b)
     state["url_a"] = url_a
 def _queue_decision(winner: str | None, state: dict):
     assert state.get("session_id"), "Missing session_id: refusing to record vote"
+    state.setdefault("pending", [])
+    decision = {
         "winner": winner,
         "key_a": state["key_a"],
         "key_b": state["key_b"],
         "url_a": state["url_a"],
         "url_b": state["url_b"],
         "dataset": state["dataset"],
+        "rating_pref": state["rating_pref"],
         "group": state["group"],
         "pair_reason": state.get("pair_reason", ""),
         "session_id": state["session_id"],
+    }
+    state["pending"].append(decision)
+    state["last_decision"] = decision
+    state["can_go_back"] = True
+    _commit_oldest_pending(state)
+def vote(winner: str | None, state: dict, submit_key: str | None) -> tuple:
+    """Queue a decision for the current pair and return the next round.
+
+    ``winner`` must be "A", "B" or None (asserted). When the normalized
+    ``submit_key`` differs from SUBMIT_KEY, nothing is queued and the current
+    pair is re-rendered with the message "Wrong submission key.". Otherwise
+    the decision is queued and the result of ``new_round`` for the dataset and
+    rating preference stored in ``state`` is returned.
+    """
+    assert winner in ("A", "B", None)
     if _normalize_submit_key(submit_key) != SUBMIT_KEY:
         return _render_current(state, "Wrong submission key.")
     _queue_decision(winner, state)
+    return new_round(state["dataset"], state["rating_pref"], state)
 def go_back(state: dict) -> tuple:
+    """Restore the pair from the last recorded decision and re-render it.
+
+    Does nothing except re-render when ``state["can_go_back"]`` is falsy or no
+    ``last_decision`` is stored. Returns whatever ``_render_current`` returns.
+    """
     pending = state.setdefault("pending", [])
+    if not state.get("can_go_back"):
+        return _render_current(state)
+    last = state.get("last_decision")
+    if not last:
+        state["can_go_back"] = False
+        return _render_current(state)
+    # Remove the decision from the pending list only when it is still the newest entry there.
+    # NOTE(review): when it is not (presumably already committed), nothing is removed -- confirm that is intended.
+    if pending and pending[-1] == last:
+        pending.pop()
+    # Clear both markers so the same decision cannot be undone twice.
+    state["can_go_back"] = False
+    state["last_decision"] = None
+    # Put the previous pair back into the session state so it is rendered again.
+    # NOTE(review): this reads last["id_a"] / last["id_b"]; confirm the decision dict stores those keys.
+    state.update(
+        dataset=last["dataset"],
+        rating_pref=last["rating_pref"],
+        key_a=last["key_a"],
+        key_b=last["key_b"],
+        id_a=last["id_a"],
+        id_b=last["id_b"],
+        url_a=last["url_a"],
+        url_b=last["url_b"],
+        group=last["group"],
+        pair_reason=last.get("pair_reason", ""),
+    )
     return _render_current(state)
 # -- UI ---------------------------------------------------------------------
 with gr.Blocks(
+    title="Image Rater",
     head="""
     <script>
     const VOTE_COOLDOWN_MS = 1500;
         } else if ((e.key === 'z' || e.key === 'Z') && (e.ctrlKey || e.metaKey) && ratingTabActive) {
             e.preventDefault();
             backButton?.click();
+        } else if (e.key === 'ArrowDown' && resultsTabActive) {
+            e.preventDefault();
+            resultsLoadMoreButton?.click();
         }
     });
     document.addEventListener('click', function (e) {
     }
     .rating-image-frame {
         width: 100%;
+        height: 512px;
         border: 1px solid #e6e6e6;
         border-radius: 8px;
         background: #333;
         object-fit: contain !important;
         background: #1f2937;
     }
     """,
 ) as demo:
     state = gr.State({})
     rating_pref_store = gr.BrowserState(default_value="safe", storage_key="rating_pref")
     submit_key_store = gr.BrowserState(default_value="", storage_key="submit_key")
     results_sort_store = gr.BrowserState(default_value="Default", storage_key="results_sort_mode")
     results_classifier_store = gr.BrowserState(default_value=ALLOWED_CLASSIFIER_FILTERS[0], storage_key="results_classifier")
     with gr.Tabs():
         with gr.Tab("Image Quality Rater"):
                 img_a = gr.HTML(elem_id="img-a")
                 img_b = gr.HTML(elem_id="img-b")
+            with gr.Row():
+                btn_a = gr.Button("👍 Prefer A", variant="primary", elem_id="btn-vote-a")
+                btn_skip = gr.Button("Same quality", elem_id="btn-skip")
+                btn_b = gr.Button("👍 Prefer B", variant="primary", elem_id="btn-vote-b")
             with gr.Accordion("Settings", open=False):
+                gr.Markdown("<span style='color:#888;font-size:0.9em;'>Advanced options</span>")
+                rating_dd = gr.Dropdown(
+                    choices=["safe", "all"],
+                    value="safe",
+                    label="Rating",
+                    elem_id="rating-pref",
                 )
                 submit_key_tb = gr.Textbox(
                     value="",
                     type="password",
+                    label="Submit key",
                     elem_id="submit-key",
                 )
+            link_a = gr.Markdown(label="Image A link")
+            link_b = gr.Markdown(label="Image B link")
+            back_link = gr.Markdown(elem_classes=["subtle-back-link-wrap"])
+            btn_back_action = gr.Button("Undo Rating (Ctrl+z)", elem_id="btn-back-action", elem_classes=["hidden-action-btn"])
+            details_md = gr.Markdown()
+            pair_reason_md = gr.Markdown()
+            submit_status_md = gr.Markdown(elem_id="submit-status")
+            gr.Markdown("<span class='subtle-note'>Dataset: <a href='https://huggingface.co/datasets/taigasan/e6-visual-ratings' target='_blank' rel='noopener noreferrer'>taigasan/e6-visual-ratings</a></span>")
+            gr.Markdown("<span class='subtle-note'>Keyboard Shortcuts: ⬅️ vote A, ⬆️ same quality, ➡️ vote B, Ctrl+z undo rating</span>")
         (
             results_summary_md,
             results_sort_dd,
             results_classifier_dd,
             results_score_distribution_plot,
             results_page_offset_state,
         ) = add_results_tab(_pool_df)
+    outputs = [img_a, img_b, link_a, link_b, back_link, details_md, pair_reason_md, submit_status_md, state]
     results_outputs = [
         results_summary_md,
         results_score_distribution_plot,
         results_page_offset_state,
     ]
+    btn_a.click(fn=lambda s, k: vote("A", s, k), inputs=[state, submit_key_store], outputs=outputs, queue=False, show_progress="hidden")
+    btn_b.click(fn=lambda s, k: vote("B", s, k), inputs=[state, submit_key_store], outputs=outputs, queue=False, show_progress="hidden")
+    btn_skip.click(fn=lambda s, k: vote(None, s, k), inputs=[state, submit_key_store], outputs=outputs, queue=False, show_progress="hidden")
     btn_back_action.click(fn=go_back, inputs=[state], outputs=outputs, queue=False, show_progress="hidden")
+    rating_dd.change(fn=_on_rating_change, inputs=[rating_dd, state], outputs=[*outputs, rating_pref_store], queue=False, show_progress="hidden")
+    submit_key_tb.input(fn=_normalize_submit_key, inputs=[submit_key_tb], outputs=[submit_key_store], queue=False, show_progress="hidden")
     submit_key_tb.change(fn=_normalize_submit_key, inputs=[submit_key_tb], outputs=[submit_key_store], queue=False, show_progress="hidden")
+    rating_dd.change(fn=_load_results, inputs=[rating_dd, results_sort_store, results_classifier_store], outputs=results_outputs, queue=False, show_progress="hidden")
     results_sort_dd.change(fn=_normalize_sort_mode, inputs=[results_sort_dd], outputs=[results_sort_store], queue=False, show_progress="hidden")
     results_sort_dd.change(fn=_load_results, inputs=[rating_pref_store, results_sort_dd, results_classifier_store], outputs=results_outputs, queue=False, show_progress="hidden")
     results_classifier_dd.change(fn=_normalize_classifier_filter, inputs=[results_classifier_dd], outputs=[results_classifier_store], queue=False, show_progress="hidden")
     results_classifier_dd.change(fn=_load_results, inputs=[rating_pref_store, results_sort_store, results_classifier_dd], outputs=results_outputs, queue=False, show_progress="hidden")
+    demo.load(fn=_initial_load, inputs=[state, rating_pref_store, submit_key_store], outputs=[rating_dd, submit_key_tb, *outputs], queue=False, show_progress="hidden")
     demo.load(fn=_load_results, inputs=[rating_pref_store, results_sort_store, results_classifier_store], outputs=results_outputs, queue=False, show_progress="hidden")
     demo.load(fn=_normalize_sort_mode, inputs=[results_sort_store], outputs=[results_sort_dd], queue=False, show_progress="hidden")
     demo.load(fn=_normalize_classifier_filter, inputs=[results_classifier_store], outputs=[results_classifier_dd], queue=False, show_progress="hidden")
     results_load_more_btn.click(
         fn=lambda r, s, c, o: load_more_results(_filtered_explorer_df_by_classifier(_normalize_rating_pref(r), _normalize_classifier_filter(c)), _explorer_df, s, o),
         inputs=[rating_pref_store, results_sort_store, results_classifier_store, results_page_offset_state],

compact_logs.py CHANGED Viewed

@@ -4,6 +4,8 @@ from __future__ import annotations
 import os
 import time
 import uuid
 import pandas as pd
 from huggingface_hub import CommitOperationAdd, CommitOperationDelete, HfApi, hf_hub_download
@@ -13,6 +15,7 @@ VOTES_REPO_TYPE = "dataset"
 VOTES_LOG_SUBDIR = "ratings_log"
 RATINGS_APP_TOKEN_ENV = "RATINGS_APP_TOKEN"
 def _list_vote_shards(api: HfApi) -> list[str]:
     files = api.list_repo_files(repo_id=VOTES_REPO_ID, repo_type=VOTES_REPO_TYPE)
     shard_prefix = f"{VOTES_LOG_SUBDIR}/votes_"
@@ -23,19 +26,21 @@ def _list_vote_shards(api: HfApi) -> list[str]:
         and f.endswith(".parquet")
     )
 def _new_compacted_shard_path() -> str:
     ts = int(time.time())
     return f"{VOTES_LOG_SUBDIR}/votes_{ts}_{uuid.uuid4().hex}.parquet"
-def compact_votes() -> tuple[int, int, str] | None:
     token = os.getenv(RATINGS_APP_TOKEN_ENV)
     api = HfApi(token=token)
     shards = _list_vote_shards(api)
-    if len(shards) < 2:
-        return None
-    frames: list[pd.DataFrame] = []
     for shard in shards:
         shard_local = hf_hub_download(
             repo_id=VOTES_REPO_ID,
@@ -43,39 +48,47 @@ def compact_votes() -> tuple[int, int, str] | None:
             repo_type=VOTES_REPO_TYPE,
             token=token,
         )
-        frames.append(pd.read_parquet(shard_local))
     combined = pd.concat(frames, ignore_index=True, sort=False)
-    output_row_count = len(combined)
-    assert len(combined) == sum(len(frame) for frame in frames)
-    compacted_data = combined.to_parquet(index=False)
-    compacted_path = _new_compacted_shard_path()
-    api.create_commit(
-        repo_id=VOTES_REPO_ID,
-        repo_type=VOTES_REPO_TYPE,
-        commit_message=f"compact {len(shards)} vote shards",
-        operations=[
-            CommitOperationAdd(path_or_fileobj=compacted_data, path_in_repo=compacted_path),
-            *(CommitOperationDelete(path_in_repo=shard) for shard in shards),
-        ],
-    )
-    return len(shards), len(combined), compacted_path
-def _main() -> None:
-    result = compact_votes()
-    if result is None:
-        print(f"Nothing to compact.")
-        return
-    shard_count, row_count, compacted_path = result
     print(
-        f"Compacted {shard_count} shards into "
         f"{VOTES_REPO_ID}/{compacted_path} "
         f"with {row_count} rows."
     )
 if __name__ == "__main__":
-    _main()

 import os
 import time
 import uuid
+from pathlib import Path
+from tempfile import NamedTemporaryFile
 import pandas as pd
 from huggingface_hub import CommitOperationAdd, CommitOperationDelete, HfApi, hf_hub_download
 VOTES_LOG_SUBDIR = "ratings_log"
 RATINGS_APP_TOKEN_ENV = "RATINGS_APP_TOKEN"
 def _list_vote_shards(api: HfApi) -> list[str]:
     files = api.list_repo_files(repo_id=VOTES_REPO_ID, repo_type=VOTES_REPO_TYPE)
     shard_prefix = f"{VOTES_LOG_SUBDIR}/votes_"
         and f.endswith(".parquet")
     )
 def _new_compacted_shard_path() -> str:
+    """Build a fresh in-repo shard path: ``<VOTES_LOG_SUBDIR>/votes_<unix seconds>_<uuid4 hex>.parquet``."""
     ts = int(time.time())
     return f"{VOTES_LOG_SUBDIR}/votes_{ts}_{uuid.uuid4().hex}.parquet"
+def compact_votes() -> tuple[int, int, str]:
     token = os.getenv(RATINGS_APP_TOKEN_ENV)
     api = HfApi(token=token)
     shards = _list_vote_shards(api)
+    if not shards:
+        raise FileNotFoundError(f"No vote shards found in {VOTES_REPO_ID}/{VOTES_LOG_SUBDIR}")
+    frames = []
     for shard in shards:
         shard_local = hf_hub_download(
             repo_id=VOTES_REPO_ID,
             repo_type=VOTES_REPO_TYPE,
             token=token,
         )
+        frame = pd.read_parquet(shard_local)
+        frames.append(frame)
+    input_row_count = sum(len(frame) for frame in frames)
     combined = pd.concat(frames, ignore_index=True, sort=False)
+    output_row_count = int(len(combined))
+    if output_row_count != input_row_count:
+        raise RuntimeError(
+            f"Refusing to commit: row mismatch during compaction "
+            f"({input_row_count} -> {output_row_count})."
+        )
+    with NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
+        tmp_path = Path(tmp.name)
+    try:
+        combined.to_parquet(tmp_path, index=False)
+        compacted_path = _new_compacted_shard_path()
+        operations = [
+            CommitOperationAdd(path_or_fileobj=str(tmp_path), path_in_repo=compacted_path),
+            *[CommitOperationDelete(path_in_repo=shard) for shard in shards],
+        ]
+        api.create_commit(
+            repo_id=VOTES_REPO_ID,
+            repo_type=VOTES_REPO_TYPE,
+            commit_message=f"compact {len(shards)} vote shard(s)",
+            operations=operations,
+        )
+    finally:
+        tmp_path.unlink(missing_ok=True)
+    return len(shards), output_row_count, compacted_path
+def main() -> None:
+    """Run ``compact_votes`` and print the shard count, destination path and row count."""
+    # No error handling here: any exception raised by compact_votes() propagates to the caller.
+    shard_count, row_count, compacted_path = compact_votes()
     print(
+        f"Compacted {shard_count} shard(s) into "
         f"{VOTES_REPO_ID}/{compacted_path} "
         f"with {row_count} rows."
     )
 if __name__ == "__main__":
+    main()

explorer.py CHANGED Viewed

@@ -208,11 +208,6 @@ def add_results_tab(pool_df: pd.DataFrame):
         results_load_more_btn = gr.Button("Load more (ArrowDown)", elem_id="btn-results-load-more")
         selected_image_md = gr.Markdown("Click an image to reveal its ID and link.")
         results_score_distribution_plot = gr.Plot(label="Classifier score distribution")
-        results_rating_dd = gr.Dropdown(
-            choices=["safe", "all"],
-            value="safe",
-            label="Rating",
-        )
         results_sort_dd = gr.Dropdown(
             choices=SORT_MODES,
             value="Default",
@@ -230,7 +225,6 @@ def add_results_tab(pool_df: pd.DataFrame):
         results_page_offset_state = gr.State(0)
     return (
         results_summary_md,
-        results_rating_dd,
         results_sort_dd,
         results_classifier_dd,
         results_score_distribution_plot,

         results_load_more_btn = gr.Button("Load more (ArrowDown)", elem_id="btn-results-load-more")
         selected_image_md = gr.Markdown("Click an image to reveal its ID and link.")
         results_score_distribution_plot = gr.Plot(label="Classifier score distribution")
         results_sort_dd = gr.Dropdown(
             choices=SORT_MODES,
             value="Default",
         results_page_offset_state = gr.State(0)
     return (
         results_summary_md,
         results_sort_dd,
         results_classifier_dd,
         results_score_distribution_plot,

stats_from_logs.py CHANGED Viewed

@@ -1,24 +1,11 @@
-from collections import defaultdict
-from dataclasses import dataclass
 import pandas as pd
 from huggingface_hub import HfApi, hf_hub_download
-VOTES_LOG_SUBDIR = "ratings_log"
-@dataclass(slots=True)
-class Stats:
-    wins: int = 0
-    losses: int = 0
-    ties: int = 0
-    @property
-    def votes(self) -> int:
-        return self.wins + self.losses + self.ties
-    @property
-    def winrate(self) -> float:
-        return (self.wins + self.ties * 0.5) / max(self.votes, 1)
 def _list_remote_log_files(repo_id: str, token: str | None) -> list[str]:
     api = HfApi(token=token)
@@ -29,8 +16,32 @@ def _list_remote_log_files(repo_id: str, token: str | None) -> list[str]:
         if f.startswith(f"{VOTES_LOG_SUBDIR}/") and f.endswith(".parquet")
     )
-def load_stats_by_md5(*, repo_id: str, token: str | None) -> dict[str, Stats]:
-    stats: dict[str, Stats] = defaultdict(Stats)
     for path_in_repo in _list_remote_log_files(repo_id, token):
         local_path = hf_hub_download(
@@ -39,17 +50,8 @@ def load_stats_by_md5(*, repo_id: str, token: str | None) -> dict[str, Stats]:
             repo_type="dataset",
             token=token,
         )
         df = pd.read_parquet(local_path, columns=["md5a", "md5b", "winner_md5"])
-        for md5a, md5b, winner_md5 in df.itertuples(index=False):
-            if winner_md5 == md5a:
-                stats[md5a].wins += 1
-                stats[md5b].losses += 1
-            elif winner_md5 == md5b:
-                stats[md5b].wins += 1
-                stats[md5a].losses += 1
-            else:
-                stats[md5a].ties += 1
-                stats[md5b].ties += 1
-    return stats

+from collections import Counter
 import pandas as pd
 from huggingface_hub import HfApi, hf_hub_download
+VOTES_LOG_SUBDIR = "ratings_log"
 def _list_remote_log_files(repo_id: str, token: str | None) -> list[str]:
     api = HfApi(token=token)
         if f.startswith(f"{VOTES_LOG_SUBDIR}/") and f.endswith(".parquet")
     )
+def _accumulate_stats_from_df(df: pd.DataFrame, wins_counter: Counter, losses_counter: Counter):
+    """Add per-md5 win and loss counts from one vote frame into the two counters.
+
+    ``df`` must provide the columns "md5a", "md5b" and "winner_md5". Rows whose
+    ``winner_md5`` is null are skipped, so they add to neither counter. Both
+    counters are updated in place; nothing is returned.
+    """
+    if df.empty:
+        return
+    # Keep only rows that name a winner; copy so the casts below do not touch the caller's frame.
+    valid = df[df["winner_md5"].notna()].copy()
+    if valid.empty:
+        return
+    # Compare everything as strings so the equality masks below use a single type.
+    valid["md5a"] = valid["md5a"].astype(str)
+    valid["md5b"] = valid["md5b"].astype(str)
+    valid["winner_md5"] = valid["winner_md5"].astype(str)
+    a_won_mask = valid["winner_md5"] == valid["md5a"]
+    b_won_mask = valid["winner_md5"] == valid["md5b"]
+    # Rows where the winner matches neither side fall under no mask and are not counted.
+    winner_keys = pd.concat([valid.loc[a_won_mask, "md5a"], valid.loc[b_won_mask, "md5b"]], ignore_index=True)
+    loser_keys = pd.concat([valid.loc[a_won_mask, "md5b"], valid.loc[b_won_mask, "md5a"]], ignore_index=True)
+    for key, count in winner_keys.value_counts().items():
+        wins_counter[str(key)] += int(count)
+    for key, count in loser_keys.value_counts().items():
+        losses_counter[str(key)] += int(count)
+def load_stats_by_md5(*, repo_id: str, token: str | None) -> dict[str, tuple[int, int]]:
+    wins_counter: Counter[str] = Counter()
+    losses_counter: Counter[str] = Counter()
     for path_in_repo in _list_remote_log_files(repo_id, token):
         local_path = hf_hub_download(
             repo_type="dataset",
             token=token,
         )
         df = pd.read_parquet(local_path, columns=["md5a", "md5b", "winner_md5"])
+        _accumulate_stats_from_df(df, wins_counter, losses_counter)
+    all_keys = set(wins_counter) | set(losses_counter)
+    return {k: (int(wins_counter.get(k, 0)), int(losses_counter.get(k, 0))) for k in all_keys}

storage.py CHANGED Viewed

@@ -27,120 +27,82 @@ VOTE_COLUMNS = [
 class VoteStorage:
-    def __init__(self, mode: str, token: str | None = None) -> None:
         assert mode in ("hf", "void"), f"Unsupported storage mode: {mode}"
         self.mode = mode
         is_debug_mode = self.mode == "void"
         self._flush_every = 3 if is_debug_mode else 50
         self._flush_interval_sec = 15.0 if is_debug_mode else 300.0
-        self._hf_api = HfApi(token=token)
-        self._flush_condition = threading.Condition(threading.Lock())
-        self._sync_event = threading.Event()
-        self._sync_lock = threading.Lock()
         self._votes_buffer: list[dict] = []
-        self._shutdown = False
         self._flush_thread = threading.Thread(target=self._flush_loop, daemon=True)
         self._flush_thread.start()
         atexit.register(self.close)
-    def _upload_votes_batch(self, batch: list[dict]) -> None:
-        assert batch
-        if self.mode == "void":
-            return
-        df = pd.DataFrame(batch)
-        for col in VOTE_COLUMNS:
-            if col not in df.columns:
-                df[col] = None
-        df = df[VOTE_COLUMNS]
         ts = int(time.time())
         shard = f"votes_{ts}_{uuid.uuid4().hex}.parquet"
-        self._hf_api.upload_file(
-            path_or_fileobj=df.to_parquet(index=False),
-            path_in_repo=f"{VOTES_LOG_SUBDIR}/{shard}",
-            repo_id=VOTES_REPO_ID,
-            repo_type=VOTES_REPO_TYPE,
-            commit_message=f"upload {len(df)} vote rows",
-        )
-    def _flush_loop(self) -> None:
-        while True:
-            with self._flush_condition:
-                while True:
-                    # Forced sync.
-                    if not self._sync_event.is_set():
-                        if self._votes_buffer:
-                            break
-                        self._sync_event.set()
-                    # Shutdown wanted.
-                    if self._shutdown:
-                        if self._votes_buffer:
-                            break
-                        return
-                    # Have enough votes to flush now.
-                    if len(self._votes_buffer) >= self._flush_every:
-                        break
-                    # Wait for a notify to flush early or shutdown.
-                    if not self._flush_condition.wait(self._flush_interval_sec):
-                        # Interval elapsed. Flush if there is at least one vote.
-                        if self._votes_buffer:
-                            break
-                # Atomically take the batch of votes.
-                batch = self._votes_buffer
-                self._votes_buffer = []
-            self._upload_votes_batch(batch)
-    def sync(self) -> None:
-        with self._sync_lock:
-            with self._flush_condition:
-                is_shutdown = self._shutdown
-                if not is_shutdown:
-                    self._sync_event.clear()
-                    self._flush_condition.notify()
-            if not is_shutdown:
-                self._sync_event.wait()
-        if is_shutdown:
-            self._flush_thread.join()
-    def close(self) -> None:
-        with self._flush_condition:
-            self._shutdown = True
-            self._flush_condition.notify()
-        self._flush_thread.join()
-    def queue_row(self, state: dict) -> None:
         id_a = int(state["id_a"])
         id_b = int(state["id_b"])
-        winner_md5: str | None
-        match state["winner"]:
-            case "A":
-                winner_md5 = state["key_a"]
-            case "B":
-                winner_md5 = state["key_b"]
-            case None:
-                winner_md5 = None
-            case _:
-                raise AssertionError
         vote_row = {
             "vote_id": uuid.uuid4().hex,
             "timestamp": datetime.now(timezone.utc).isoformat(timespec="seconds"),
@@ -153,9 +115,6 @@ class VoteStorage:
             "group": state["group"],
             "session_id": state["session_id"],
         }
-        with self._flush_condition:
             self._votes_buffer.append(vote_row)
-            if len(self._votes_buffer) == self._flush_every:
-                self._flush_condition.notify()

 class VoteStorage:
+    def __init__(self, mode: str, token: str | None = None):
+        """Set up the vote buffer, start the background flush thread and register ``close`` at exit.
+
+        ``mode`` must be "hf" or "void" (asserted). ``token`` is stored and later
+        returned by ``_hf_token``.
+        """
         assert mode in ("hf", "void"), f"Unsupported storage mode: {mode}"
         self.mode = mode
+        self._token = token
+        # "void" is treated as debug mode: a smaller flush threshold and a shorter flush interval.
         is_debug_mode = self.mode == "void"
         self._flush_every = 3 if is_debug_mode else 50
         self._flush_interval_sec = 15.0 if is_debug_mode else 300.0
+        # Guards every access to _votes_buffer.
+        self._votes_lock = threading.Lock()
         self._votes_buffer: list[dict] = []
+        # Set by close() to make the flush loop exit.
+        self._stop_event = threading.Event()
         self._flush_thread = threading.Thread(target=self._flush_loop, daemon=True)
         self._flush_thread.start()
         atexit.register(self.close)
+    def _hf_token(self) -> str | None:
+        """Return the token passed to the constructor (may be None)."""
+        return self._token
+    def _empty_votes_df(self) -> pd.DataFrame:
+        """Return an empty DataFrame whose columns are VOTE_COLUMNS."""
+        return pd.DataFrame(columns=VOTE_COLUMNS)
+    def _upload_votes_batch(self, df: pd.DataFrame, commit_message: str):
+        """Write ``df`` to a new parquet shard and upload it to the votes repo.
+
+        ``df`` must contain every column in VOTE_COLUMNS (asserted); only those
+        columns are written. In "void" mode nothing is written or uploaded.
+        The shard is uploaded as ``<VOTES_LOG_SUBDIR>/votes_<unix seconds>_<uuid4 hex>.parquet``.
+        """
+        assert set(VOTE_COLUMNS).issubset(df.columns), "Missing vote columns in upload batch"
+        if self.mode == "void":
+            # Discarded on purpose -- presumably to mark the argument as intentionally unused in this mode.
+            _ = commit_message
+            return
         ts = int(time.time())
         shard = f"votes_{ts}_{uuid.uuid4().hex}.parquet"
+        api = HfApi(token=self._hf_token())
+        # The with-block only obtains a file name; delete=False keeps the file after it closes
+        # so that to_parquet() and upload_file() below can use the path.
+        with NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
+            tmp_path = tmp.name
+        try:
+            df[VOTE_COLUMNS].to_parquet(tmp_path, index=False)
+            api.upload_file(
+                path_or_fileobj=tmp_path,
+                path_in_repo=f"{VOTES_LOG_SUBDIR}/{shard}",
+                repo_id=VOTES_REPO_ID,
+                repo_type=VOTES_REPO_TYPE,
+                commit_message=commit_message,
+            )
+        finally:
+            # Remove the temporary file whether or not the write/upload succeeded.
+            if os.path.exists(tmp_path):
+                os.remove(tmp_path)
+    def _flush_votes(self, force: bool = False):
+        """Upload the buffered vote rows as one batch when a flush is due.
+
+        Nothing happens when the buffer is empty, or when ``force`` is false and
+        fewer than ``self._flush_every`` rows are buffered.
+        """
+        with self._votes_lock:
+            if not self._votes_buffer:
+                return
+            if not force and len(self._votes_buffer) < self._flush_every:
+                return
+            # Copy the rows and empty the buffer under the lock; the upload itself runs after the lock is released.
+            batch = list(self._votes_buffer)
+            self._votes_buffer.clear()
+        # NOTE(review): the buffer is already cleared at this point, so if the upload below raises,
+        # these rows are not put back -- confirm that losing the batch is acceptable.
+        incoming = pd.DataFrame(batch)
+        # Add any VOTE_COLUMNS absent from the rows as None-filled columns so the column selection below succeeds.
+        for col in VOTE_COLUMNS:
+            if col not in incoming.columns:
+                incoming[col] = None
+        self._upload_votes_batch(incoming[VOTE_COLUMNS], commit_message=f"append {len(batch)} vote rows")
+    def _flush_loop(self):
+        """Thread target: force a flush every ``_flush_interval_sec`` seconds until ``_stop_event`` is set."""
+        # Event.wait() returns False on timeout (time to flush) and True once the event is set (exit the loop).
+        # NOTE(review): there is no exception handling here, so an error raised by _flush_votes() ends this loop.
+        while not self._stop_event.wait(self._flush_interval_sec):
+            self._flush_votes(force=True)
+    def close(self):
+        """Stop the flush thread and force one final flush; later calls return immediately."""
+        # The stop event doubles as the "already closed" marker.
+        if self._stop_event.is_set():
+            return
+        self._stop_event.set()
+        # Wait at most one second for the flush thread before doing the final flush from this thread.
+        self._flush_thread.join(timeout=1.0)
+        self._flush_votes(force=True)
+    def append_vote_row(self, state: dict, winner: str | None):
         id_a = int(state["id_a"])
         id_b = int(state["id_b"])
+        winner_md5 = None
+        if winner == "A":
+            winner_md5 = state["key_a"]
+        elif winner == "B":
+            winner_md5 = state["key_b"]
         vote_row = {
             "vote_id": uuid.uuid4().hex,
             "timestamp": datetime.now(timezone.utc).isoformat(timespec="seconds"),
             "group": state["group"],
             "session_id": state["session_id"],
         }
+        with self._votes_lock:
             self._votes_buffer.append(vote_row)
+        self._flush_votes()