RedHotTensors committed on
Commit
d1ad299
·
1 Parent(s): b14aefa

Overhaul pairing algorithm to avoid degenerate cases.

Browse files
Files changed (1) hide show
  1. app.py +73 -57
app.py CHANGED
@@ -7,6 +7,8 @@ import os
7
  import html
8
  import sys
9
 
 
 
10
  import pandas as pd
11
  from huggingface_hub import hf_hub_download
12
 
@@ -100,57 +102,64 @@ def _stats_reloader() -> None:
100
  _load_stats()
101
  threading.Thread(target=_stats_reloader, daemon=True).start()
102
 
103
- def _pick_from(df: pd.DataFrame, weights: pd.Series | None = None) -> tuple[pd.Series, pd.Series, int] | None:
104
  if len(df) < 2:
105
  return None
106
 
107
- remaining = len(df) - 2
108
-
109
  sample = df.sample(2, weights=weights, replace=False)
110
- return sample.iloc[0], sample.iloc[1], remaining
111
 
112
  def _pick_from_bins(df: pd.DataFrame, field: str, max_bin: int) -> tuple[pd.Series, pd.Series, int] | None:
113
  if len(df) < 2:
114
  return None
115
 
116
- least = df[field].min()
 
 
117
  if least >= max_bin:
118
  return None
119
 
120
- remaining = (df[field] < max_bin).sum() - 1
121
 
122
- candidates = df[df[field] == least]
123
  if len(candidates) > 1:
124
  sample = candidates.sample(2, replace=False)
125
- return sample.iloc[0], sample.iloc[1], remaining
126
 
127
  first = candidates.iloc[0]
128
  while True:
129
  least += 1
130
- candidates = df[df[field] == least]
131
  if not candidates.empty:
132
- return first, candidates.sample().iloc[0], remaining
133
 
134
  def _pick_from_similar(
135
  df: pd.DataFrame,
136
  field: str,
137
  *,
 
138
  other_df: pd.DataFrame | None = None,
139
- weights: pd.Series | None = None,
140
  start: int | float | None = None,
141
  step: int | float = 1,
142
  ) -> tuple[pd.Series, pd.Series, int] | None:
143
  if len(df) < 2:
144
  return None
145
 
146
- other_weights: pd.Series | None = None
147
  if other_df is None:
148
  other_df = df
149
- other_weights = weights
 
150
 
151
- remaining = len(df) - 1
 
 
152
 
153
- first = df.sample(weights=weights).iloc[0]
 
 
 
 
154
  if start is None:
155
  start = first[field]
156
 
@@ -167,57 +176,64 @@ def _pick_from_similar(
167
  break
168
 
169
  while True:
170
- other = candidates.sample(weights=other_weights).iloc[0]
171
  if other["md5"] != first["md5"]:
172
- return first, other, remaining
173
 
174
  def _pool_fetch_pair(group: str) -> tuple[pd.Series, pd.Series, int, str]:
175
  gdf = _pool_df[_pool_df["group"] == group]
176
-
177
  voted = gdf[gdf["votes"] > 0]
178
- nontied = voted[voted["ties"] == 0]
179
- nonlosers = voted[voted["losses"] < 2]
180
- initializing = len(nontied) < 20
181
-
182
- if not initializing:
183
- # 1) Pair images with wins-only records.
184
- picked = _pick_from_bins(nontied[nontied["losses"] == 0], "wins", 4)
185
- if picked is not None:
186
- return *picked, "wins-only"
187
-
188
- # 2) Pair images with loss-only records.
189
- picked = _pick_from_bins(nontied[nontied["wins"] == 0], "losses", 3)
190
- if picked is not None:
191
- return *picked, "losses-only"
192
 
193
- # 3) Ensure a minimum density of 3 among non-losers.
194
- picked = _pick_from(nonlosers[nonlosers["votes"] < 3])
 
195
  if picked is not None:
196
- return *picked, "sparse"
197
-
198
- # 4) Introduce a new image.
199
- if initializing or random.random() < 0.67:
200
- unvoted = gdf[gdf["votes"] == 0]
201
-
202
- if nontied.empty: # Initial ranking.
203
- picked = _pick_from(unvoted)
204
- if picked is not None:
205
- return *picked, "init"
206
- elif not unvoted.empty:
207
- picked = _pick_from_similar(unvoted, "winrate", other_df=voted, start=0.5, step=0.1)
208
- if picked is not None:
209
- return *picked, "new"
210
-
211
- # Occasionally sample higher-quality images.
212
- if random.random() < 0.1:
213
- picked = _pick_from_similar(nonlosers[nonlosers["wins"] >= 5], "wins")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
  if picked is not None:
215
- return *picked, "probe-best"
216
-
217
- # 5) Vote-weighted random sampling between similar win-rates.
218
- picked = _pick_from_similar(voted, "winrate", weights=(1.0 / voted["votes"]), step=0.1)
 
 
 
 
 
219
  assert picked is not None
220
- return *picked, "random"
221
 
222
  def _row_image_url(row) -> str:
223
  sample_url = row.get("sample_url")
 
7
  import html
8
  import sys
9
 
10
+ from typing import Callable
11
+
12
  import pandas as pd
13
  from huggingface_hub import hf_hub_download
14
 
 
102
  _load_stats()
103
  threading.Thread(target=_stats_reloader, daemon=True).start()
104
 
105
+ def _pick_from(df: pd.DataFrame, *, weights: pd.Series | None = None) -> tuple[pd.Series, pd.Series, int] | None:
106
  if len(df) < 2:
107
  return None
108
 
 
 
109
  sample = df.sample(2, weights=weights, replace=False)
110
+ return sample.iloc[0], sample.iloc[1], len(df)
111
 
112
  def _pick_from_bins(df: pd.DataFrame, field: str, max_bin: int) -> tuple[pd.Series, pd.Series, int] | None:
113
  if len(df) < 2:
114
  return None
115
 
116
+ values = df[field]
117
+
118
+ least = values.min()
119
  if least >= max_bin:
120
  return None
121
 
122
+ eligible = (values < max_bin).sum()
123
 
124
+ candidates = df[values == least]
125
  if len(candidates) > 1:
126
  sample = candidates.sample(2, replace=False)
127
+ return sample.iloc[0], sample.iloc[1], eligible
128
 
129
  first = candidates.iloc[0]
130
  while True:
131
  least += 1
132
+ candidates = df[values == least]
133
  if not candidates.empty:
134
+ return first, candidates.sample().iloc[0], eligible
135
 
136
  def _pick_from_similar(
137
  df: pd.DataFrame,
138
  field: str,
139
  *,
140
+ weights: Callable[[pd.DataFrame], pd.Series] | None = None,
141
  other_df: pd.DataFrame | None = None,
142
+ other_weights: Callable[[pd.DataFrame], pd.Series] | None = None,
143
  start: int | float | None = None,
144
  step: int | float = 1,
145
  ) -> tuple[pd.Series, pd.Series, int] | None:
146
  if len(df) < 2:
147
  return None
148
 
 
149
  if other_df is None:
150
  other_df = df
151
+ elif len(other_df) < 2:
152
+ return None
153
 
154
+ weight_vals: pd.Series | None = None
155
+ if weights is not None:
156
+ weight_vals = weights(df)
157
 
158
+ other_weight_vals: pd.Series | None = None
159
+ if other_weights is not None:
160
+ other_weight_vals = other_weights(other_df)
161
+
162
+ first = df.sample(weights=weight_vals).iloc[0]
163
  if start is None:
164
  start = first[field]
165
 
 
176
  break
177
 
178
  while True:
179
+ other = candidates.sample(weights=other_weight_vals).iloc[0]
180
  if other["md5"] != first["md5"]:
181
+ return first, other, len(df)
182
 
183
  def _pool_fetch_pair(group: str) -> tuple[pd.Series, pd.Series, int, str]:
184
  gdf = _pool_df[_pool_df["group"] == group]
 
185
  voted = gdf[gdf["votes"] > 0]
186
+ unvoted = gdf[gdf["votes"] == 0]
187
+ tamo = voted[voted["ties"] < 2]
 
 
 
 
 
 
 
 
 
 
 
 
188
 
189
+ # Initialize with 8 random non-tied pairs.
190
+ if len(tamo) < 8:
191
+ picked = _pick_from(unvoted)
192
  if picked is not None:
193
+ return *picked, "init"
194
+
195
+ # Pair first-time winners.
196
+ picked = _pick_from(tamo[(tamo["wins"] == 1) & (tamo["losses"] == 0)])
197
+ if picked is not None:
198
+ breakpoint()
199
+ return *picked, "new-winners"
200
+
201
+ # Pair first-time losers.
202
+ picked = _pick_from(tamo[(tamo["wins"] == 0) & (tamo["losses"] == 1)])
203
+ if picked is not None:
204
+ return *picked, "new-losers"
205
+
206
+ # Link cliques to main network and break ties.
207
+ nonties = voted["votes"] - voted["ties"]
208
+ picked = _pick_from_similar(
209
+ voted[nonties < 3], "winrate",
210
+ other_df=voted[nonties > 3],
211
+ other_weights=lambda df: 1.0 / df["votes"],
212
+ step=0.1
213
+ )
214
+ if picked is not None:
215
+ return *picked, "sparse"
216
+
217
+ # Introduce a new image.
218
+ if random.random() < 0.5:
219
+ picked = _pick_from_similar(
220
+ unvoted, "winrate",
221
+ other_df=voted,
222
+ other_weights=lambda df: 1.0 / df["votes"],
223
+ start=0.5, step=0.1
224
+ )
225
  if picked is not None:
226
+ return *picked, "new"
227
+
228
+ # Vote-weighted random sampling between similar winrates, slightly biased against picking losers.
229
+ picked = _pick_from_similar(
230
+ voted, "winrate",
231
+ weights=lambda df: 1.0 / (df["votes"] + 0.2 * df["losses"]),
232
+ other_weights=lambda df: 1.0 / df["votes"],
233
+ step=0.1
234
+ )
235
  assert picked is not None
236
+ return *picked, "fair-probe"
237
 
238
  def _row_image_url(row) -> str:
239
  sample_url = row.get("sample_url")