Spaces:

RedRocket
/

e6-visual-ratings

Sleeping

App Files Files Community

RedHotTensors committed on Apr 24

Commit

d1ad299

1 Parent(s): b14aefa

Overhaul pairing algorithm to avoid degenerate cases.

Browse files

Files changed (1) hide show

app.py +73 -57

app.py CHANGED Viewed

@@ -7,6 +7,8 @@ import os
 import html
 import sys
 import pandas as pd
 from huggingface_hub import hf_hub_download
@@ -100,57 +102,64 @@ def _stats_reloader() -> None:
 _load_stats()
 threading.Thread(target=_stats_reloader, daemon=True).start()
-def _pick_from(df: pd.DataFrame, weights: pd.Series | None = None) -> tuple[pd.Series, pd.Series, int] | None:
     if len(df) < 2:
         return None
-    remaining = len(df) - 2
     sample = df.sample(2, weights=weights, replace=False)
-    return sample.iloc[0], sample.iloc[1], remaining
 def _pick_from_bins(df: pd.DataFrame, field: str, max_bin: int) -> tuple[pd.Series, pd.Series, int] | None:
     if len(df) < 2:
         return None
-    least = df[field].min()
     if least >= max_bin:
         return None
-    remaining = (df[field] < max_bin).sum() - 1
-    candidates = df[df[field] == least]
     if len(candidates) > 1:
         sample = candidates.sample(2, replace=False)
-        return sample.iloc[0], sample.iloc[1], remaining
     first = candidates.iloc[0]
     while True:
         least += 1
-        candidates = df[df[field] == least]
         if not candidates.empty:
-            return first, candidates.sample().iloc[0], remaining
 def _pick_from_similar(
     df: pd.DataFrame,
     field: str,
     *,
     other_df: pd.DataFrame | None = None,
-    weights: pd.Series | None = None,
     start: int | float | None = None,
     step: int | float = 1,
 ) -> tuple[pd.Series, pd.Series, int] | None:
     if len(df) < 2:
         return None
-    other_weights: pd.Series | None = None
     if other_df is None:
         other_df = df
-        other_weights = weights
-    remaining = len(df) - 1
-    first = df.sample(weights=weights).iloc[0]
     if start is None:
         start = first[field]
@@ -167,57 +176,64 @@ def _pick_from_similar(
             break
     while True:
-        other = candidates.sample(weights=other_weights).iloc[0]
         if other["md5"] != first["md5"]:
-            return first, other, remaining
 def _pool_fetch_pair(group: str) -> tuple[pd.Series, pd.Series, int, str]:
     gdf = _pool_df[_pool_df["group"] == group]
     voted = gdf[gdf["votes"] > 0]
-    nontied = voted[voted["ties"] == 0]
-    nonlosers = voted[voted["losses"] < 2]
-    initializing = len(nontied) < 20
-    if not initializing:
-        # 1) Pair images with wins-only records.
-        picked = _pick_from_bins(nontied[nontied["losses"] == 0], "wins", 4)
-        if picked is not None:
-            return *picked, "wins-only"
-        # 2) Pair images with loss-only records.
-        picked = _pick_from_bins(nontied[nontied["wins"] == 0], "losses", 3)
-        if picked is not None:
-            return *picked, "losses-only"
-        # 3) Ensure a minimum density of 3 among non-losers.
-        picked = _pick_from(nonlosers[nonlosers["votes"] < 3])
         if picked is not None:
-            return *picked, "sparse"
-    # 4) Introduce a new image.
-    if initializing or random.random() < 0.67:
-        unvoted = gdf[gdf["votes"] == 0]
-        if nontied.empty: # Initial ranking.
-            picked = _pick_from(unvoted)
-            if picked is not None:
-                return *picked, "init"
-        elif not unvoted.empty:
-            picked = _pick_from_similar(unvoted, "winrate", other_df=voted, start=0.5, step=0.1)
-            if picked is not None:
-                return *picked, "new"
-    #) Occasionally sample higher-quality
-    if random.random() < 0.1:
-        picked = _pick_from_similar(nonlosers[nonlosers["wins"] >= 5], "wins")
         if picked is not None:
-            return *picked, "probe-best"
-    # 5) Vote-weighted random sampling between similar win-rates.
-    picked = _pick_from_similar(voted, "winrate", weights=(1.0 / voted["votes"]), step=0.1)
     assert picked is not None
-    return *picked, "random"
 def _row_image_url(row) -> str:
     sample_url = row.get("sample_url")

 import html
 import sys
+from typing import Callable
 import pandas as pd
 from huggingface_hub import hf_hub_download
 _load_stats()
 threading.Thread(target=_stats_reloader, daemon=True).start()
+def _pick_from(df: pd.DataFrame, *, weights: pd.Series | None = None) -> tuple[pd.Series, pd.Series, int] | None:
     if len(df) < 2:
         return None
     sample = df.sample(2, weights=weights, replace=False)
+    return sample.iloc[0], sample.iloc[1], len(df)
 def _pick_from_bins(df: pd.DataFrame, field: str, max_bin: int) -> tuple[pd.Series, pd.Series, int] | None:
     if len(df) < 2:
         return None
+    values = df[field]
+    least = values.min()
     if least >= max_bin:
         return None
+    eligible = (values < max_bin).sum()
+    candidates = df[values == least]
     if len(candidates) > 1:
         sample = candidates.sample(2, replace=False)
+        return sample.iloc[0], sample.iloc[1], eligible
     first = candidates.iloc[0]
     while True:
         least += 1
+        candidates = df[values == least]
         if not candidates.empty:
+            return first, candidates.sample().iloc[0], eligible
 def _pick_from_similar(
     df: pd.DataFrame,
     field: str,
     *,
+    weights: Callable[[pd.DataFrame], pd.Series] | None = None,
     other_df: pd.DataFrame | None = None,
+    other_weights: Callable[[pd.DataFrame], pd.Series] | None = None,
     start: int | float | None = None,
     step: int | float = 1,
 ) -> tuple[pd.Series, pd.Series, int] | None:
     if len(df) < 2:
         return None
     if other_df is None:
         other_df = df
+    elif len(other_df) < 2:
+        return None
+    weight_vals: pd.Series | None = None
+    if weights is not None:
+        weight_vals = weights(df)
+    other_weight_vals: pd.Series | None = None
+    if other_weights is not None:
+        other_weight_vals = other_weights(other_df)
+    first = df.sample(weights=weight_vals).iloc[0]
     if start is None:
         start = first[field]
             break
     while True:
+        other = candidates.sample(weights=other_weight_vals).iloc[0]
         if other["md5"] != first["md5"]:
+            return first, other, len(df)
 def _pool_fetch_pair(group: str) -> tuple[pd.Series, pd.Series, int, str]:
     gdf = _pool_df[_pool_df["group"] == group]
     voted = gdf[gdf["votes"] > 0]
+    unvoted = gdf[gdf["votes"] == 0]
+    tamo = voted[voted["ties"] < 2]
+    # Initialize with 8 random non-tied pairs.
+    if len(tamo) < 8:
+        picked = _pick_from(unvoted)
         if picked is not None:
+            return *picked, "init"
+    # Pair first-time winners.
+    picked = _pick_from(tamo[(tamo["wins"] == 1) & (tamo["losses"] == 0)])
+    if picked is not None:
+        breakpoint()
+        return *picked, "new-winners"
+    # Pair first-time losers.
+    picked = _pick_from(tamo[(tamo["wins"] == 0) & (tamo["losses"] == 1)])
+    if picked is not None:
+        return *picked, "new-losers"
+    # Link cliques to main network and break ties.
+    nonties = voted["votes"] - voted["ties"]
+    picked = _pick_from_similar(
+        voted[nonties < 3], "winrate",
+        other_df=voted[nonties > 3],
+        other_weights=lambda df: 1.0 / df["votes"],
+        step=0.1
+    )
+    if picked is not None:
+        return *picked, "sparse"
+    # Introduce a new image.
+    if random.random() < 0.5:
+        picked = _pick_from_similar(
+            unvoted, "winrate",
+            other_df=voted,
+            other_weights=lambda df: 1.0 / df["votes"],
+            start=0.5, step=0.1
+        )
         if picked is not None:
+            return *picked, "new"
+    # Vote-weighted random sampling between similar winrates, slighlty biased against picking losers.
+    picked = _pick_from_similar(
+        voted, "winrate",
+        weights=lambda df: 1.0 / (df["votes"] + 0.2 * df["losses"]),
+        other_weights=lambda df: 1.0 / df["votes"],
+        step=0.1
+    )
     assert picked is not None
+    return *picked, "fair-probe"
 def _row_image_url(row) -> str:
     sample_url = row.get("sample_url")