RedHotTensors committed on
Commit
e2b56f8
·
1 Parent(s): 2e679fc

Improve diversity by using weighted sampling instead of discrete bands.

Browse files
Files changed (1) hide show
  1. app.py +26 -69
app.py CHANGED
@@ -109,39 +109,12 @@ def _pick_from(df: pd.DataFrame, *, weights: pd.Series | None = None) -> tuple[p
109
  sample = df.sample(2, weights=weights, replace=False)
110
  return sample.iloc[0], sample.iloc[1], len(df)
111
 
112
- def _pick_from_bins(df: pd.DataFrame, field: str, max_bin: int) -> tuple[pd.Series, pd.Series, int] | None:
113
- if len(df) < 2:
114
- return None
115
-
116
- values = df[field]
117
-
118
- least = values.min()
119
- if least >= max_bin:
120
- return None
121
-
122
- eligible = (values < max_bin).sum()
123
-
124
- candidates = df[values == least]
125
- if len(candidates) > 1:
126
- sample = candidates.sample(2, replace=False)
127
- return sample.iloc[0], sample.iloc[1], eligible
128
-
129
- first = candidates.iloc[0]
130
- while True:
131
- least += 1
132
- candidates = df[values == least]
133
- if not candidates.empty:
134
- return first, candidates.sample().iloc[0], eligible
135
-
136
- def _pick_from_similar(
137
  df: pd.DataFrame,
138
- field: str,
139
  *,
140
  weights: Callable[[pd.DataFrame], pd.Series] | None = None,
141
  other_df: pd.DataFrame | None = None,
142
- other_weights: Callable[[pd.DataFrame], pd.Series] | None = None,
143
- start: int | float | None = None,
144
- step: int | float = 1,
145
  ) -> tuple[pd.Series, pd.Series, int] | None:
146
  if len(df) < 2:
147
  return None
@@ -155,28 +128,11 @@ def _pick_from_similar(
155
  if weights is not None:
156
  weight_vals = weights(df)
157
 
158
- other_weight_vals: pd.Series | None = None
159
- if other_weights is not None:
160
- other_weight_vals = other_weights(other_df)
161
-
162
  first = df.sample(weights=weight_vals).iloc[0]
163
- if start is None:
164
- start = first[field]
165
-
166
- upper = start
167
- lower = start
168
-
169
- values = other_df[field]
170
- while True:
171
- upper += step
172
- lower -= step
173
-
174
- candidates = other_df[(values >= lower) & (values <= upper)]
175
- if len(candidates) > 1:
176
- break
177
 
178
  while True:
179
- other = candidates.sample(weights=other_weight_vals).iloc[0]
180
  if other["md5"] != first["md5"]:
181
  return first, other, len(df)
182
 
@@ -195,38 +151,39 @@ def _pool_fetch_pair(group: str) -> tuple[pd.Series, pd.Series, int, str]:
195
  if picked is not None:
196
  return *picked, "new-losers"
197
 
 
 
 
 
 
 
198
  # Link cliques to main network and break ties.
199
  nonties = votes - voted["ties"]
200
- picked = _pick_from_similar(
201
- voted[(nonties == 0) | (votes == 2)], "winrate",
 
202
  other_df=voted[nonties > 3],
203
- other_weights=lambda df: 1.0 / df["votes"],
204
- step=0.1
205
  )
206
  if picked is not None:
207
  return *picked, "sparse"
208
 
209
  # Introduce new images.
210
- if len(voted) < 8 or random.random() < 0.5:
211
  unvoted = gdf[gdf["votes"] == 0]
212
- picked = _pick_from(unvoted)
213
- if picked is None:
214
- picked = _pick_from_similar(
215
- unvoted, "winrate",
216
- other_df=voted,
217
- other_weights=lambda df: 1.0 / df["votes"],
218
- start=0.5, step=0.1
219
- )
220
-
221
- if picked is not None:
222
- return *picked, "new"
223
 
224
  # Vote-weighted random sampling between similar winrates, slightly biased against picking losers.
225
- picked = _pick_from_similar(
226
- voted, "winrate",
227
- weights=lambda df: 1.0 / (df["votes"] + 0.2 * df["losses"]),
228
- other_weights=lambda df: 1.0 / df["votes"],
229
- step=0.1
230
  )
231
  assert picked is not None
232
  return *picked, "fair-probe"
 
109
  sample = df.sample(2, weights=weights, replace=False)
110
  return sample.iloc[0], sample.iloc[1], len(df)
111
 
112
+ def _pick_similar(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  df: pd.DataFrame,
114
+ distance: Callable[[pd.DataFrame, pd.Series], pd.Series],
115
  *,
116
  weights: Callable[[pd.DataFrame], pd.Series] | None = None,
117
  other_df: pd.DataFrame | None = None,
 
 
 
118
  ) -> tuple[pd.Series, pd.Series, int] | None:
119
  if len(df) < 2:
120
  return None
 
128
  if weights is not None:
129
  weight_vals = weights(df)
130
 
 
 
 
 
131
  first = df.sample(weights=weight_vals).iloc[0]
132
+ weight_vals = 1.0 / (1.0 + distance(other_df, first))
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
  while True:
135
+ other = other_df.sample(weights=weight_vals).iloc[0]
136
  if other["md5"] != first["md5"]:
137
  return first, other, len(df)
138
 
 
151
  if picked is not None:
152
  return *picked, "new-losers"
153
 
154
+ def record_distance(df: pd.DataFrame, pivot: pd.Series) -> pd.Series:
155
+ return (
156
+ (df["wins"] - pivot["wins"])**2 +
157
+ (df["losses"] - pivot["losses"])**2
158
+ )**0.75 # L2 is a bit too loose
159
+
160
  # Link cliques to main network and break ties.
161
  nonties = votes - voted["ties"]
162
+ picked = _pick_similar(
163
+ voted[(nonties == 0) | (votes == 2)],
164
+ record_distance,
165
  other_df=voted[nonties > 3],
 
 
166
  )
167
  if picked is not None:
168
  return *picked, "sparse"
169
 
170
  # Introduce new images.
171
+ if len(voted) < 8 or random.random() < 0.33:
172
  unvoted = gdf[gdf["votes"] == 0]
173
+ match len(unvoted):
174
+ case 0:
175
+ pass
176
+ case 1:
177
+ return unvoted.iloc[0], voted.iloc[0], 1, "new"
178
+ case _:
179
+ picked = _pick_from(unvoted)
180
+ assert picked is not None
181
+ return *picked, "new"
 
 
182
 
183
  # Vote-weighted random sampling between similar winrates, slightly biased against picking losers.
184
+ picked = _pick_similar(
185
+ voted, record_distance,
186
+ weights=lambda df: 1.0 / (df["votes"]**1.25 + 0.1 * df["losses"]),
 
 
187
  )
188
  assert picked is not None
189
  return *picked, "fair-probe"