Spaces:
Running
Running
Commit ·
e2b56f8
1
Parent(s): 2e679fc
Improve diversity by using weighted sampling instead of discrete bands.
Browse files
app.py
CHANGED
|
@@ -109,39 +109,12 @@ def _pick_from(df: pd.DataFrame, *, weights: pd.Series | None = None) -> tuple[p
|
|
| 109 |
sample = df.sample(2, weights=weights, replace=False)
|
| 110 |
return sample.iloc[0], sample.iloc[1], len(df)
|
| 111 |
|
| 112 |
-
def
|
| 113 |
-
if len(df) < 2:
|
| 114 |
-
return None
|
| 115 |
-
|
| 116 |
-
values = df[field]
|
| 117 |
-
|
| 118 |
-
least = values.min()
|
| 119 |
-
if least >= max_bin:
|
| 120 |
-
return None
|
| 121 |
-
|
| 122 |
-
eligible = (values < max_bin).sum()
|
| 123 |
-
|
| 124 |
-
candidates = df[values == least]
|
| 125 |
-
if len(candidates) > 1:
|
| 126 |
-
sample = candidates.sample(2, replace=False)
|
| 127 |
-
return sample.iloc[0], sample.iloc[1], eligible
|
| 128 |
-
|
| 129 |
-
first = candidates.iloc[0]
|
| 130 |
-
while True:
|
| 131 |
-
least += 1
|
| 132 |
-
candidates = df[values == least]
|
| 133 |
-
if not candidates.empty:
|
| 134 |
-
return first, candidates.sample().iloc[0], eligible
|
| 135 |
-
|
| 136 |
-
def _pick_from_similar(
|
| 137 |
df: pd.DataFrame,
|
| 138 |
-
|
| 139 |
*,
|
| 140 |
weights: Callable[[pd.DataFrame], pd.Series] | None = None,
|
| 141 |
other_df: pd.DataFrame | None = None,
|
| 142 |
-
other_weights: Callable[[pd.DataFrame], pd.Series] | None = None,
|
| 143 |
-
start: int | float | None = None,
|
| 144 |
-
step: int | float = 1,
|
| 145 |
) -> tuple[pd.Series, pd.Series, int] | None:
|
| 146 |
if len(df) < 2:
|
| 147 |
return None
|
|
@@ -155,28 +128,11 @@ def _pick_from_similar(
|
|
| 155 |
if weights is not None:
|
| 156 |
weight_vals = weights(df)
|
| 157 |
|
| 158 |
-
other_weight_vals: pd.Series | None = None
|
| 159 |
-
if other_weights is not None:
|
| 160 |
-
other_weight_vals = other_weights(other_df)
|
| 161 |
-
|
| 162 |
first = df.sample(weights=weight_vals).iloc[0]
|
| 163 |
-
|
| 164 |
-
start = first[field]
|
| 165 |
-
|
| 166 |
-
upper = start
|
| 167 |
-
lower = start
|
| 168 |
-
|
| 169 |
-
values = other_df[field]
|
| 170 |
-
while True:
|
| 171 |
-
upper += step
|
| 172 |
-
lower -= step
|
| 173 |
-
|
| 174 |
-
candidates = other_df[(values >= lower) & (values <= upper)]
|
| 175 |
-
if len(candidates) > 1:
|
| 176 |
-
break
|
| 177 |
|
| 178 |
while True:
|
| 179 |
-
other =
|
| 180 |
if other["md5"] != first["md5"]:
|
| 181 |
return first, other, len(df)
|
| 182 |
|
|
@@ -195,38 +151,39 @@ def _pool_fetch_pair(group: str) -> tuple[pd.Series, pd.Series, int, str]:
|
|
| 195 |
if picked is not None:
|
| 196 |
return *picked, "new-losers"
|
| 197 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
# Link cliques to main network and break ties.
|
| 199 |
nonties = votes - voted["ties"]
|
| 200 |
-
picked =
|
| 201 |
-
voted[(nonties == 0) | (votes == 2)],
|
|
|
|
| 202 |
other_df=voted[nonties > 3],
|
| 203 |
-
other_weights=lambda df: 1.0 / df["votes"],
|
| 204 |
-
step=0.1
|
| 205 |
)
|
| 206 |
if picked is not None:
|
| 207 |
return *picked, "sparse"
|
| 208 |
|
| 209 |
# Introduce new images.
|
| 210 |
-
if len(voted) < 8 or random.random() < 0.
|
| 211 |
unvoted = gdf[gdf["votes"] == 0]
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
if picked is not None:
|
| 222 |
-
return *picked, "new"
|
| 223 |
|
| 224 |
# Vote-weighted random sampling between similar winrates, slightly biased against picking losers.
|
| 225 |
-
picked =
|
| 226 |
-
voted,
|
| 227 |
-
weights=lambda df: 1.0 / (df["votes"] + 0.
|
| 228 |
-
other_weights=lambda df: 1.0 / df["votes"],
|
| 229 |
-
step=0.1
|
| 230 |
)
|
| 231 |
assert picked is not None
|
| 232 |
return *picked, "fair-probe"
|
|
|
|
| 109 |
sample = df.sample(2, weights=weights, replace=False)
|
| 110 |
return sample.iloc[0], sample.iloc[1], len(df)
|
| 111 |
|
| 112 |
+
def _pick_similar(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
df: pd.DataFrame,
|
| 114 |
+
distance: Callable[[pd.DataFrame, pd.Series], pd.Series],
|
| 115 |
*,
|
| 116 |
weights: Callable[[pd.DataFrame], pd.Series] | None = None,
|
| 117 |
other_df: pd.DataFrame | None = None,
|
|
|
|
|
|
|
|
|
|
| 118 |
) -> tuple[pd.Series, pd.Series, int] | None:
|
| 119 |
if len(df) < 2:
|
| 120 |
return None
|
|
|
|
| 128 |
if weights is not None:
|
| 129 |
weight_vals = weights(df)
|
| 130 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
first = df.sample(weights=weight_vals).iloc[0]
|
| 132 |
+
weight_vals = 1.0 / (1.0 + distance(other_df, first))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
|
| 134 |
while True:
|
| 135 |
+
other = other_df.sample(weights=weight_vals).iloc[0]
|
| 136 |
if other["md5"] != first["md5"]:
|
| 137 |
return first, other, len(df)
|
| 138 |
|
|
|
|
| 151 |
if picked is not None:
|
| 152 |
return *picked, "new-losers"
|
| 153 |
|
| 154 |
+
def record_distance(df: pd.DataFrame, pivot: pd.Series) -> pd.Series:
|
| 155 |
+
return (
|
| 156 |
+
(df["wins"] - pivot["wins"])**2 +
|
| 157 |
+
(df["losses"] - pivot["losses"])**2
|
| 158 |
+
)**0.75 # L2 is a bit too loose
|
| 159 |
+
|
| 160 |
# Link cliques to main network and break ties.
|
| 161 |
nonties = votes - voted["ties"]
|
| 162 |
+
picked = _pick_similar(
|
| 163 |
+
voted[(nonties == 0) | (votes == 2)],
|
| 164 |
+
record_distance,
|
| 165 |
other_df=voted[nonties > 3],
|
|
|
|
|
|
|
| 166 |
)
|
| 167 |
if picked is not None:
|
| 168 |
return *picked, "sparse"
|
| 169 |
|
| 170 |
# Introduce new images.
|
| 171 |
+
if len(voted) < 8 or random.random() < 0.33:
|
| 172 |
unvoted = gdf[gdf["votes"] == 0]
|
| 173 |
+
match len(unvoted):
|
| 174 |
+
case 0:
|
| 175 |
+
pass
|
| 176 |
+
case 1:
|
| 177 |
+
return unvoted.iloc[0], voted.iloc[0], 1, "new"
|
| 178 |
+
case _:
|
| 179 |
+
picked = _pick_from(unvoted)
|
| 180 |
+
assert picked is not None
|
| 181 |
+
return *picked, "new"
|
|
|
|
|
|
|
| 182 |
|
| 183 |
# Vote-weighted random sampling between similar winrates, slightly biased against picking losers.
|
| 184 |
+
picked = _pick_similar(
|
| 185 |
+
voted, record_distance,
|
| 186 |
+
weights=lambda df: 1.0 / (df["votes"]**1.25 + 0.1 * df["losses"]),
|
|
|
|
|
|
|
| 187 |
)
|
| 188 |
assert picked is not None
|
| 189 |
return *picked, "fair-probe"
|