Spaces:

RedRocket
/

e6-visual-ratings

Running

App Files Files Community

RedHotTensors committed 14 days ago

Commit

e2b56f8

1 Parent(s): 2e679fc

Improve diversity by using weighted sampling instead of discrete bands.

Browse files

Files changed (1) hide show

app.py +26 -69

app.py CHANGED Viewed

@@ -109,39 +109,12 @@ def _pick_from(df: pd.DataFrame, *, weights: pd.Series | None = None) -> tuple[p
     sample = df.sample(2, weights=weights, replace=False)
     return sample.iloc[0], sample.iloc[1], len(df)
-def _pick_from_bins(df: pd.DataFrame, field: str, max_bin: int) -> tuple[pd.Series, pd.Series, int] | None:
-    if len(df) < 2:
-        return None
-    values = df[field]
-    least = values.min()
-    if least >= max_bin:
-        return None
-    eligible = (values < max_bin).sum()
-    candidates = df[values == least]
-    if len(candidates) > 1:
-        sample = candidates.sample(2, replace=False)
-        return sample.iloc[0], sample.iloc[1], eligible
-    first = candidates.iloc[0]
-    while True:
-        least += 1
-        candidates = df[values == least]
-        if not candidates.empty:
-            return first, candidates.sample().iloc[0], eligible
-def _pick_from_similar(
     df: pd.DataFrame,
-    field: str,
     *,
     weights: Callable[[pd.DataFrame], pd.Series] | None = None,
     other_df: pd.DataFrame | None = None,
-    other_weights: Callable[[pd.DataFrame], pd.Series] | None = None,
-    start: int | float | None = None,
-    step: int | float = 1,
 ) -> tuple[pd.Series, pd.Series, int] | None:
     if len(df) < 2:
         return None
@@ -155,28 +128,11 @@ def _pick_from_similar(
     if weights is not None:
         weight_vals = weights(df)
-    other_weight_vals: pd.Series | None = None
-    if other_weights is not None:
-        other_weight_vals = other_weights(other_df)
     first = df.sample(weights=weight_vals).iloc[0]
-    if start is None:
-        start = first[field]
-    upper = start
-    lower = start
-    values = other_df[field]
-    while True:
-        upper += step
-        lower -= step
-        candidates = other_df[(values >= lower) & (values <= upper)]
-        if len(candidates) > 1:
-            break
     while True:
-        other = candidates.sample(weights=other_weight_vals).iloc[0]
         if other["md5"] != first["md5"]:
             return first, other, len(df)
@@ -195,38 +151,39 @@ def _pool_fetch_pair(group: str) -> tuple[pd.Series, pd.Series, int, str]:
     if picked is not None:
         return *picked, "new-losers"
     # Link cliques to main network and break ties.
     nonties = votes - voted["ties"]
-    picked = _pick_from_similar(
-        voted[(nonties == 0) | (votes == 2)], "winrate",
         other_df=voted[nonties > 3],
-        other_weights=lambda df: 1.0 / df["votes"],
-        step=0.1
     )
     if picked is not None:
         return *picked, "sparse"
     # Introduce new images.
-    if len(voted) < 8 or random.random() < 0.5:
         unvoted = gdf[gdf["votes"] == 0]
-        picked = _pick_from(unvoted)
-        if picked is None:
-            picked = _pick_from_similar(
-                unvoted, "winrate",
-                other_df=voted,
-                other_weights=lambda df: 1.0 / df["votes"],
-                start=0.5, step=0.1
-            )
-        if picked is not None:
-            return *picked, "new"
     # Vote-weighted random sampling between similar winrates, slightly biased against picking losers.
-    picked = _pick_from_similar(
-        voted, "winrate",
-        weights=lambda df: 1.0 / (df["votes"] + 0.2 * df["losses"]),
-        other_weights=lambda df: 1.0 / df["votes"],
-        step=0.1
     )
     assert picked is not None
     return *picked, "fair-probe"

     sample = df.sample(2, weights=weights, replace=False)
     return sample.iloc[0], sample.iloc[1], len(df)
+def _pick_similar(
     df: pd.DataFrame,
+    distance: Callable[[pd.DataFrame, pd.Series], pd.Series],
     *,
     weights: Callable[[pd.DataFrame], pd.Series] | None = None,
     other_df: pd.DataFrame | None = None,
 ) -> tuple[pd.Series, pd.Series, int] | None:
     if len(df) < 2:
         return None
     if weights is not None:
         weight_vals = weights(df)
     first = df.sample(weights=weight_vals).iloc[0]
+    weight_vals = 1.0 / (1.0 + distance(other_df, first))
     while True:
+        other = other_df.sample(weights=weight_vals).iloc[0]
         if other["md5"] != first["md5"]:
             return first, other, len(df)
     if picked is not None:
         return *picked, "new-losers"
+    def record_distance(df: pd.DataFrame, pivot: pd.Series) -> pd.Series:
+        return (
+            (df["wins"] - pivot["wins"])**2 +
+            (df["losses"] - pivot["losses"])**2
+        )**0.75 # L2 is a bit too loose
     # Link cliques to main network and break ties.
     nonties = votes - voted["ties"]
+    picked = _pick_similar(
+        voted[(nonties == 0) | (votes == 2)],
+        record_distance,
         other_df=voted[nonties > 3],
     )
     if picked is not None:
         return *picked, "sparse"
     # Introduce new images.
+    if len(voted) < 8 or random.random() < 0.33:
         unvoted = gdf[gdf["votes"] == 0]
+        match len(unvoted):
+            case 0:
+                pass
+            case 1:
+                return unvoted.iloc[0], voted.iloc[0], 1, "new"
+            case _:
+                picked = _pick_from(unvoted)
+                assert picked is not None
+                return *picked, "new"
     # Vote-weighted random sampling between similar winrates, slightly biased against picking losers.
+    picked = _pick_similar(
+        voted, record_distance,
+        weights=lambda df: 1.0 / (df["votes"]**1.25 + 0.1 * df["losses"]),
     )
     assert picked is not None
     return *picked, "fair-probe"