Spaces:

RedRocket
/

e6-visual-ratings

Running

App Files Community

RedHotTensors committed on Apr 23

Commit

29de1ae

1 Parent(s): 2c95aa5

Synchronously flush votes before reloading stats to prevent out-of-date reloading.

Browse files

Files changed (2) hide show

app.py +47 -55
storage.py +39 -20

app.py CHANGED Viewed

@@ -40,70 +40,65 @@ WINRATE_LOC = _pool_df.columns.get_loc("winrate")
 _md5_to_idx = { md5: idx for idx, md5 in enumerate(_pool_df["md5"]) }
-_stats_lock = threading.Lock()
 _pool_lock = threading.Lock()
 _stats_last_loaded_at = 0.0
 _explorer_df = pd.DataFrame(columns=["group", "id", "md5", "rating", "sample_url", "image_url", "classifier", "classifier_score", "percentile"])
-def _reload_stats_if_due(force: bool = False):
-    global _stats_last_loaded_at,_explorer_df
-    now = time.time()
-    if not force and (now - _stats_last_loaded_at) < STATS_RELOAD_S:
-        return
-    with _stats_lock:
-        now = time.time()
-        if not force and (now - _stats_last_loaded_at) < STATS_RELOAD_S:
-            return
-        stats_by_key = load_stats_by_md5(
-            repo_id=POOL_REPO_ID,
-            token=RATINGS_APP_TOKEN,
-        )
-        n_missing = 0
-        with _pool_lock:
-            for md5, stats in stats_by_key.items():
-                if (idx := _md5_to_idx.get(md5)) is not None:
-                    _pool_df.iloc[idx, [WINS_LOC, LOSSES_LOC, TIES_LOC, VOTES_LOC, WINRATE_LOC]] = (
-                        stats.wins, stats.losses, stats.ties, stats.votes, stats.winrate
-                    )
-                else:
-                    n_missing += 1
-        if n_missing:
-            print(f"{n_missing} md5s have stats but are not in the pool!", file=sys.stderr)
-        classifier_scores_path = hf_hub_download(
-            repo_id=POOL_REPO_ID,
-            filename="classifier_scores.parquet",
-            repo_type="dataset",
-            token=RATINGS_APP_TOKEN,
-        )
-        validation_set_path = hf_hub_download(
-            repo_id=POOL_REPO_ID,
-            filename="validation_set.parquet",
-            repo_type="dataset",
-            token=RATINGS_APP_TOKEN,
-        )
-        validation_df = pd.read_parquet(
-            validation_set_path,
-            columns=["group", "id", "md5", "rating", "sample_url", "image_url"],
-        )
-        classifier_scores_df = pd.read_parquet(classifier_scores_path)
-        assert {"classifier", "md5", "classifier_score", "percentile"}.issubset(classifier_scores_df.columns), "classifier_scores.parquet missing expected columns"
-        classifier_scores_df = classifier_scores_df[["classifier", "md5", "classifier_score", "percentile"]]
-        classifier_scores_df["classifier"] = classifier_scores_df["classifier"].astype(str)
-        classifier_scores_df["md5"] = classifier_scores_df["md5"].astype(str)
-        validation_df["md5"] = validation_df["md5"].astype(str)
-        _explorer_df = validation_df.merge(classifier_scores_df, on="md5", how="left", validate="one_to_many")
-        _stats_last_loaded_at = now
-_reload_stats_if_due(force=True)
 def _pick_from(df: pd.DataFrame, weights: pd.Series | None = None) -> tuple[pd.Series, pd.Series, int] | None:
     if len(df) < 2:
@@ -247,8 +242,6 @@ def _format_rating_post_title(post_id: int, votes: int, label: str) -> str:
     return f"<strong>{label}</strong>: <a href=\"https://e621.net/posts/{post_id}\" target=\"_blank\" rel=\"noreferrer\">Post #{post_id}</a> | {votes} {'Vote' if votes == 1 else 'Votes'}"
 def _render_current(state: dict, submit_status: str = "") -> tuple:
-    _reload_stats_if_due()
     votes_a = _pool_df.iloc[_md5_to_idx[state["key_a"]], VOTES_LOC]
     votes_b = _pool_df.iloc[_md5_to_idx[state["key_b"]], VOTES_LOC]
     title_a = _format_rating_post_title(state["id_a"], votes_a, "Image A")
@@ -298,7 +291,6 @@ def _load_results(rating_pref_value: str, sort_mode_value: str, classifier_filte
     rating_pref = _normalize_rating_pref(rating_pref_value)
     sort_mode = _normalize_sort_mode(sort_mode_value)
     classifier_name = _normalize_classifier_filter(classifier_filter_value)
-    _reload_stats_if_due()
     filtered_explorer_df = _filtered_explorer_df_by_classifier(rating_pref, classifier_name)
     summary, score_distribution_plot, distribution_data, gallery_items, page_meta, next_offset, btn_update = build_results_data(
         filtered_explorer_df,

 _md5_to_idx = { md5: idx for idx, md5 in enumerate(_pool_df["md5"]) }
 _pool_lock = threading.Lock()
 _stats_last_loaded_at = 0.0
 _explorer_df = pd.DataFrame(columns=["group", "id", "md5", "rating", "sample_url", "image_url", "classifier", "classifier_score", "percentile"])
+def _load_stats() -> None:
+    VOTE_STORAGE.sync()
+    load_stats_by_md5(repo_id=POOL_REPO_ID, token=RATINGS_APP_TOKEN)
+    n_missing = 0
+    with _pool_lock:
+        VOTE_STORAGE.sync()
+        stats_by_key = load_stats_by_md5(repo_id=POOL_REPO_ID, token=RATINGS_APP_TOKEN)
+        for md5, stats in stats_by_key.items():
+            if (idx := _md5_to_idx.get(md5)) is not None:
+                _pool_df.iloc[idx, [WINS_LOC, LOSSES_LOC, TIES_LOC, VOTES_LOC, WINRATE_LOC]] = (
+                    stats.wins, stats.losses, stats.ties, stats.votes, stats.winrate
+                )
+            else:
+                n_missing += 1
+    if n_missing:
+        print(f"{n_missing} md5s have stats but are not in the pool!", file=sys.stderr)
+    classifier_scores_path = hf_hub_download(
+        repo_id=POOL_REPO_ID,
+        filename="classifier_scores.parquet",
+        repo_type="dataset",
+        token=RATINGS_APP_TOKEN,
+    )
+    validation_set_path = hf_hub_download(
+        repo_id=POOL_REPO_ID,
+        filename="validation_set.parquet",
+        repo_type="dataset",
+        token=RATINGS_APP_TOKEN,
+    )
+    validation_df = pd.read_parquet(
+        validation_set_path,
+        columns=["group", "id", "md5", "rating", "sample_url", "image_url"],
+    )
+    classifier_scores_df = pd.read_parquet(classifier_scores_path)
+    assert {"classifier", "md5", "classifier_score", "percentile"}.issubset(classifier_scores_df.columns), "classifier_scores.parquet missing expected columns"
+    classifier_scores_df = classifier_scores_df[["classifier", "md5", "classifier_score", "percentile"]]
+    classifier_scores_df["classifier"] = classifier_scores_df["classifier"].astype(str)
+    classifier_scores_df["md5"] = classifier_scores_df["md5"].astype(str)
+    validation_df["md5"] = validation_df["md5"].astype(str)
+    global _explorer_df
+    _explorer_df = validation_df.merge(classifier_scores_df, on="md5", how="left", validate="one_to_many")
+def _stats_reloader() -> None:
+    while True:
+        time.sleep(STATS_RELOAD_S)
+        _load_stats()
+_load_stats()
+threading.Thread(target=_stats_reloader, daemon=True).start()
 def _pick_from(df: pd.DataFrame, weights: pd.Series | None = None) -> tuple[pd.Series, pd.Series, int] | None:
     if len(df) < 2:
     return f"<strong>{label}</strong>: <a href=\"https://e621.net/posts/{post_id}\" target=\"_blank\" rel=\"noreferrer\">Post #{post_id}</a> | {votes} {'Vote' if votes == 1 else 'Votes'}"
 def _render_current(state: dict, submit_status: str = "") -> tuple:
     votes_a = _pool_df.iloc[_md5_to_idx[state["key_a"]], VOTES_LOC]
     votes_b = _pool_df.iloc[_md5_to_idx[state["key_b"]], VOTES_LOC]
     title_a = _format_rating_post_title(state["id_a"], votes_a, "Image A")
     rating_pref = _normalize_rating_pref(rating_pref_value)
     sort_mode = _normalize_sort_mode(sort_mode_value)
     classifier_name = _normalize_classifier_filter(classifier_filter_value)
     filtered_explorer_df = _filtered_explorer_df_by_classifier(rating_pref, classifier_name)
     summary, score_distribution_plot, distribution_data, gallery_items, page_meta, next_offset, btn_update = build_results_data(
         filtered_explorer_df,

storage.py CHANGED Viewed

@@ -27,49 +27,63 @@ VOTE_COLUMNS = [
 class VoteStorage:
-    def __init__(self, mode: str, token: str | None = None):
         assert mode in ("hf", "void"), f"Unsupported storage mode: {mode}"
         self.mode = mode
         is_debug_mode = self.mode == "void"
         self._flush_every = 3 if is_debug_mode else 50
         self._flush_interval_sec = 15.0 if is_debug_mode else 300.0
-        self._shutdown = False
         self._votes_buffer: list[dict] = []
-        self._flush_condition = threading.Condition(threading.Lock())
         self._flush_thread = threading.Thread(target=self._flush_loop, daemon=True)
         self._flush_thread.start()
-        self.hf_api = HfApi(token=token)
         atexit.register(self.close)
-    def _empty_votes_df(self) -> pd.DataFrame:
-        return pd.DataFrame(columns=VOTE_COLUMNS)
-    def _upload_votes_batch(self, df: pd.DataFrame, commit_message: str):
         if self.mode == "void":
             return
         ts = int(time.time())
         shard = f"votes_{ts}_{uuid.uuid4().hex}.parquet"
-        self.hf_api.upload_file(
             path_or_fileobj=df.to_parquet(index=False),
             path_in_repo=f"{VOTES_LOG_SUBDIR}/{shard}",
             repo_id=VOTES_REPO_ID,
             repo_type=VOTES_REPO_TYPE,
-            commit_message=commit_message,
         )
     def _flush_loop(self) -> None:
         while True:
             with self._flush_condition:
                 while True:
                     if self._shutdown:
-                        # Flush last batch of votes.
                         if self._votes_buffer:
                             break
@@ -89,16 +103,21 @@ class VoteStorage:
                 batch = self._votes_buffer
                 self._votes_buffer = []
-            assert batch
-            batch_df = pd.DataFrame(batch)
-            del batch
-            for col in VOTE_COLUMNS:
-                if col not in batch_df.columns:
-                    batch_df[col] = None
-            batch_df = batch_df[VOTE_COLUMNS]
-            self._upload_votes_batch(batch_df, commit_message=f"upload {len(batch_df)} vote rows")
     def close(self) -> None:
         with self._flush_condition:

 class VoteStorage:
+    def __init__(self, mode: str, token: str | None = None) -> None:
         assert mode in ("hf", "void"), f"Unsupported storage mode: {mode}"
         self.mode = mode
         is_debug_mode = self.mode == "void"
         self._flush_every = 3 if is_debug_mode else 50
         self._flush_interval_sec = 15.0 if is_debug_mode else 300.0
+        self._hf_api = HfApi(token=token)
+        self._flush_condition = threading.Condition(threading.Lock())
+        self._sync_event = threading.Event()
+        self._sync_lock = threading.Lock()
         self._votes_buffer: list[dict] = []
+        self._shutdown = False
         self._flush_thread = threading.Thread(target=self._flush_loop, daemon=True)
         self._flush_thread.start()
         atexit.register(self.close)
+    def _upload_votes_batch(self, batch: list[dict]) -> None:
+        assert batch
         if self.mode == "void":
             return
+        df = pd.DataFrame(batch)
+        for col in VOTE_COLUMNS:
+            if col not in df.columns:
+                df[col] = None
+        df = df[VOTE_COLUMNS]
         ts = int(time.time())
         shard = f"votes_{ts}_{uuid.uuid4().hex}.parquet"
+        self._hf_api.upload_file(
             path_or_fileobj=df.to_parquet(index=False),
             path_in_repo=f"{VOTES_LOG_SUBDIR}/{shard}",
             repo_id=VOTES_REPO_ID,
             repo_type=VOTES_REPO_TYPE,
+            commit_message=f"upload {len(df)} vote rows",
         )
     def _flush_loop(self) -> None:
         while True:
             with self._flush_condition:
                 while True:
+                    # Forced sync.
+                    if not self._sync_event.is_set():
+                        if self._votes_buffer:
+                            break
+                        self._sync_event.set()
+                    # Shutdown wanted.
                     if self._shutdown:
                         if self._votes_buffer:
                             break
                 batch = self._votes_buffer
                 self._votes_buffer = []
+            self._upload_votes_batch(batch)
+    def sync(self) -> None:
+        with self._sync_lock:
+            with self._flush_condition:
+                is_shutdown = self._shutdown
+                if not is_shutdown:
+                    self._sync_event.clear()
+                    self._flush_condition.notify()
+            if not is_shutdown:
+                self._sync_event.wait()
+        if is_shutdown:
+            self._flush_thread.join()
     def close(self) -> None:
         with self._flush_condition: