Spaces:
Sleeping
Sleeping
Commit ·
d1ad299
1
Parent(s): b14aefa
Overhaul pairing algorithm to avoid degenerate cases.
Browse files
app.py
CHANGED
|
@@ -7,6 +7,8 @@ import os
|
|
| 7 |
import html
|
| 8 |
import sys
|
| 9 |
|
|
|
|
|
|
|
| 10 |
import pandas as pd
|
| 11 |
from huggingface_hub import hf_hub_download
|
| 12 |
|
|
@@ -100,57 +102,64 @@ def _stats_reloader() -> None:
|
|
| 100 |
_load_stats()
|
| 101 |
threading.Thread(target=_stats_reloader, daemon=True).start()
|
| 102 |
|
| 103 |
-
def _pick_from(df: pd.DataFrame, weights: pd.Series | None = None) -> tuple[pd.Series, pd.Series, int] | None:
|
| 104 |
if len(df) < 2:
|
| 105 |
return None
|
| 106 |
|
| 107 |
-
remaining = len(df) - 2
|
| 108 |
-
|
| 109 |
sample = df.sample(2, weights=weights, replace=False)
|
| 110 |
-
return sample.iloc[0], sample.iloc[1],
|
| 111 |
|
| 112 |
def _pick_from_bins(df: pd.DataFrame, field: str, max_bin: int) -> tuple[pd.Series, pd.Series, int] | None:
|
| 113 |
if len(df) < 2:
|
| 114 |
return None
|
| 115 |
|
| 116 |
-
|
|
|
|
|
|
|
| 117 |
if least >= max_bin:
|
| 118 |
return None
|
| 119 |
|
| 120 |
-
|
| 121 |
|
| 122 |
-
candidates = df[
|
| 123 |
if len(candidates) > 1:
|
| 124 |
sample = candidates.sample(2, replace=False)
|
| 125 |
-
return sample.iloc[0], sample.iloc[1],
|
| 126 |
|
| 127 |
first = candidates.iloc[0]
|
| 128 |
while True:
|
| 129 |
least += 1
|
| 130 |
-
candidates = df[
|
| 131 |
if not candidates.empty:
|
| 132 |
-
return first, candidates.sample().iloc[0],
|
| 133 |
|
| 134 |
def _pick_from_similar(
|
| 135 |
df: pd.DataFrame,
|
| 136 |
field: str,
|
| 137 |
*,
|
|
|
|
| 138 |
other_df: pd.DataFrame | None = None,
|
| 139 |
-
|
| 140 |
start: int | float | None = None,
|
| 141 |
step: int | float = 1,
|
| 142 |
) -> tuple[pd.Series, pd.Series, int] | None:
|
| 143 |
if len(df) < 2:
|
| 144 |
return None
|
| 145 |
|
| 146 |
-
other_weights: pd.Series | None = None
|
| 147 |
if other_df is None:
|
| 148 |
other_df = df
|
| 149 |
-
|
|
|
|
| 150 |
|
| 151 |
-
|
|
|
|
|
|
|
| 152 |
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
if start is None:
|
| 155 |
start = first[field]
|
| 156 |
|
|
@@ -167,57 +176,64 @@ def _pick_from_similar(
|
|
| 167 |
break
|
| 168 |
|
| 169 |
while True:
|
| 170 |
-
other = candidates.sample(weights=
|
| 171 |
if other["md5"] != first["md5"]:
|
| 172 |
-
return first, other,
|
| 173 |
|
| 174 |
def _pool_fetch_pair(group: str) -> tuple[pd.Series, pd.Series, int, str]:
|
| 175 |
gdf = _pool_df[_pool_df["group"] == group]
|
| 176 |
-
|
| 177 |
voted = gdf[gdf["votes"] > 0]
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
initializing = len(nontied) < 20
|
| 181 |
-
|
| 182 |
-
if not initializing:
|
| 183 |
-
# 1) Pair images with wins-only records.
|
| 184 |
-
picked = _pick_from_bins(nontied[nontied["losses"] == 0], "wins", 4)
|
| 185 |
-
if picked is not None:
|
| 186 |
-
return *picked, "wins-only"
|
| 187 |
-
|
| 188 |
-
# 2) Pair images with loss-only records.
|
| 189 |
-
picked = _pick_from_bins(nontied[nontied["wins"] == 0], "losses", 3)
|
| 190 |
-
if picked is not None:
|
| 191 |
-
return *picked, "losses-only"
|
| 192 |
|
| 193 |
-
|
| 194 |
-
|
|
|
|
| 195 |
if picked is not None:
|
| 196 |
-
return *picked, "
|
| 197 |
-
|
| 198 |
-
#
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
if picked is not None:
|
| 215 |
-
return *picked, "
|
| 216 |
-
|
| 217 |
-
#
|
| 218 |
-
picked = _pick_from_similar(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
assert picked is not None
|
| 220 |
-
return *picked, "
|
| 221 |
|
| 222 |
def _row_image_url(row) -> str:
|
| 223 |
sample_url = row.get("sample_url")
|
|
|
|
| 7 |
import html
|
| 8 |
import sys
|
| 9 |
|
| 10 |
+
from typing import Callable
|
| 11 |
+
|
| 12 |
import pandas as pd
|
| 13 |
from huggingface_hub import hf_hub_download
|
| 14 |
|
|
|
|
| 102 |
_load_stats()
|
| 103 |
threading.Thread(target=_stats_reloader, daemon=True).start()
|
| 104 |
|
| 105 |
+
def _pick_from(df: pd.DataFrame, *, weights: pd.Series | None = None) -> tuple[pd.Series, pd.Series, int] | None:
|
| 106 |
if len(df) < 2:
|
| 107 |
return None
|
| 108 |
|
|
|
|
|
|
|
| 109 |
sample = df.sample(2, weights=weights, replace=False)
|
| 110 |
+
return sample.iloc[0], sample.iloc[1], len(df)
|
| 111 |
|
| 112 |
def _pick_from_bins(df: pd.DataFrame, field: str, max_bin: int) -> tuple[pd.Series, pd.Series, int] | None:
|
| 113 |
if len(df) < 2:
|
| 114 |
return None
|
| 115 |
|
| 116 |
+
values = df[field]
|
| 117 |
+
|
| 118 |
+
least = values.min()
|
| 119 |
if least >= max_bin:
|
| 120 |
return None
|
| 121 |
|
| 122 |
+
eligible = (values < max_bin).sum()
|
| 123 |
|
| 124 |
+
candidates = df[values == least]
|
| 125 |
if len(candidates) > 1:
|
| 126 |
sample = candidates.sample(2, replace=False)
|
| 127 |
+
return sample.iloc[0], sample.iloc[1], eligible
|
| 128 |
|
| 129 |
first = candidates.iloc[0]
|
| 130 |
while True:
|
| 131 |
least += 1
|
| 132 |
+
candidates = df[values == least]
|
| 133 |
if not candidates.empty:
|
| 134 |
+
return first, candidates.sample().iloc[0], eligible
|
| 135 |
|
| 136 |
def _pick_from_similar(
|
| 137 |
df: pd.DataFrame,
|
| 138 |
field: str,
|
| 139 |
*,
|
| 140 |
+
weights: Callable[[pd.DataFrame], pd.Series] | None = None,
|
| 141 |
other_df: pd.DataFrame | None = None,
|
| 142 |
+
other_weights: Callable[[pd.DataFrame], pd.Series] | None = None,
|
| 143 |
start: int | float | None = None,
|
| 144 |
step: int | float = 1,
|
| 145 |
) -> tuple[pd.Series, pd.Series, int] | None:
|
| 146 |
if len(df) < 2:
|
| 147 |
return None
|
| 148 |
|
|
|
|
| 149 |
if other_df is None:
|
| 150 |
other_df = df
|
| 151 |
+
elif len(other_df) < 2:
|
| 152 |
+
return None
|
| 153 |
|
| 154 |
+
weight_vals: pd.Series | None = None
|
| 155 |
+
if weights is not None:
|
| 156 |
+
weight_vals = weights(df)
|
| 157 |
|
| 158 |
+
other_weight_vals: pd.Series | None = None
|
| 159 |
+
if other_weights is not None:
|
| 160 |
+
other_weight_vals = other_weights(other_df)
|
| 161 |
+
|
| 162 |
+
first = df.sample(weights=weight_vals).iloc[0]
|
| 163 |
if start is None:
|
| 164 |
start = first[field]
|
| 165 |
|
|
|
|
| 176 |
break
|
| 177 |
|
| 178 |
while True:
|
| 179 |
+
other = candidates.sample(weights=other_weight_vals).iloc[0]
|
| 180 |
if other["md5"] != first["md5"]:
|
| 181 |
+
return first, other, len(df)
|
| 182 |
|
| 183 |
def _pool_fetch_pair(group: str) -> tuple[pd.Series, pd.Series, int, str]:
    """Choose the next pair of rows to compare within ``group``.

    Strategies are tried in order and the first one that yields a pair wins:

    1. ``"init"``        - while fewer than 8 voted rows have ``ties < 2``,
       pair two unvoted rows at random.
    2. ``"new-winners"`` - pair two rows with exactly one win and no losses.
    3. ``"new-losers"``  - pair two rows with exactly one loss and no wins.
    4. ``"sparse"``      - pair a row with fewer than 3 non-tied votes with a
       row that has more than 3, matched on ``winrate``.
    5. ``"new"``         - with probability 0.5, pair an unvoted row with a
       voted one, searching from a winrate of 0.5.
    6. ``"fair-probe"``  - weighted pick among voted rows, matched on
       ``winrate``.

    Args:
        group: Value of the ``group`` column selecting the rows to consider.

    Returns:
        ``(first, second, count, strategy)``: the two chosen rows, the
        integer reported by the helper that produced them, and the label of
        the strategy that succeeded.

    Raises:
        AssertionError: If even the final fallback cannot produce a pair.
    """
    gdf = _pool_df[_pool_df["group"] == group]
    voted = gdf[gdf["votes"] > 0]
    unvoted = gdf[gdf["votes"] == 0]
    tamo = voted[voted["ties"] < 2]  # voted rows with at most one tie

    def inverse_votes(frame: pd.DataFrame) -> pd.Series:
        # Favour rows with fewer votes.  Only ever applied to subsets of
        # ``voted`` (votes > 0), so the division is safe.
        return 1.0 / frame["votes"]

    # Initialize with 8 random non-tied pairs.
    if len(tamo) < 8:
        picked = _pick_from(unvoted)
        if picked is not None:
            return *picked, "init"

    # Pair first-time winners.
    picked = _pick_from(tamo[(tamo["wins"] == 1) & (tamo["losses"] == 0)])
    if picked is not None:
        return *picked, "new-winners"

    # Pair first-time losers.
    picked = _pick_from(tamo[(tamo["wins"] == 0) & (tamo["losses"] == 1)])
    if picked is not None:
        return *picked, "new-losers"

    # Link cliques to main network and break ties.
    # NOTE(review): rows with exactly 3 non-tied votes fall into neither the
    # sparse set (< 3) nor the anchor set (> 3) - confirm this is intended.
    nonties = voted["votes"] - voted["ties"]
    picked = _pick_from_similar(
        voted[nonties < 3], "winrate",
        other_df=voted[nonties > 3],
        other_weights=inverse_votes,
        step=0.1,
    )
    if picked is not None:
        return *picked, "sparse"

    # Introduce a new image.
    if random.random() < 0.5:
        picked = _pick_from_similar(
            unvoted, "winrate",
            other_df=voted,
            other_weights=inverse_votes,
            start=0.5, step=0.1,
        )
        if picked is not None:
            return *picked, "new"

    # Vote-weighted random sampling between similar winrates, slightly biased
    # against picking losers.
    picked = _pick_from_similar(
        voted, "winrate",
        weights=lambda df: 1.0 / (df["votes"] + 0.2 * df["losses"]),
        other_weights=inverse_votes,
        step=0.1,
    )
    if picked is None:
        # Raised explicitly so the check survives ``python -O``; the exception
        # type matches the ``assert`` it replaces.
        raise AssertionError(f"no pair could be selected for group {group!r}")
    return *picked, "fair-probe"
|
| 237 |
|
| 238 |
def _row_image_url(row) -> str:
|
| 239 |
sample_url = row.get("sample_url")
|