davanstrien HF Staff Claude Opus 4.6 committed on
Commit
0aea3fd
·
1 Parent(s): dab5e76

Use dynamic percentile-based confidence filter options

Browse files

Replaces hardcoded confidence thresholds (50/90/99%) with percentile-based
options (Top 75%/50%/25%) computed from actual data distribution. This
adapts automatically when the underlying model changes (e.g., v2 → v3).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (2) hide show
  1. app.py +20 -9
  2. templates/index.html +5 -4
app.py CHANGED
@@ -138,18 +138,27 @@ def get_categories() -> list[str]:
138
 
139
 
140
  @lru_cache(maxsize=1)
141
def get_confidence_counts() -> dict[str, int]:
    """Tally new-dataset papers at or above each fixed confidence cutoff.

    The result feeds the Tufte-style confidence filter. Cutoffs were picked
    for the v2 model's score distribution (average confidence ~97%).

    Returns:
        A dict mapping each cutoff (as a string, e.g. ``"0.9"``) to the number
        of rows flagged ``is_new_dataset`` whose ``confidence_score`` is at
        least that cutoff.
    """
    flagged = get_dataframe().filter(pl.col("is_new_dataset"))
    score = pl.col("confidence_score")

    counts: dict[str, int] = {}
    for cutoff in (0.5, 0.8, 0.9, 0.95, 0.99):
        counts[str(cutoff)] = flagged.filter(score >= cutoff).height
    return counts
 
 
 
 
 
 
 
 
 
153
 
154
 
155
  @lru_cache(maxsize=1)
@@ -371,6 +380,7 @@ async def home(
371
  df = get_dataframe()
372
  categories = get_categories()
373
  histogram_data = get_histogram_data()
 
374
 
375
  # Get stats
376
  total_papers = len(df)
@@ -384,6 +394,7 @@ async def home(
384
  "total_papers": total_papers,
385
  "new_dataset_count": new_dataset_count,
386
  "histogram_data": histogram_data,
 
387
  # Pass filter state for URL persistence
388
  "search": search or "",
389
  "search_type": search_type,
 
138
 
139
 
140
  @lru_cache(maxsize=1)
141
def get_confidence_options() -> list[dict]:
    """Compute confidence filter options from the actual data distribution.

    Percentiles of ``confidence_score`` (restricted to rows flagged
    ``is_new_dataset``) are used so the UI adapts to any model's score range.

    Returns:
        A list of option dicts with keys ``value`` (the threshold string used
        as the ``<option>`` value), ``label`` (display text) and ``count``
        (number of rows matching the option). The first entry is always
        "All new datasets" and the last is always "All papers"; percentile
        entries sit in between and are omitted when they cannot be computed
        or would duplicate an earlier percentile entry.
    """
    import math  # local import: only needed by this function

    df = get_dataframe()
    scores = df.filter(pl.col("is_new_dataset"))["confidence_score"]

    options = [{"value": "0.5", "label": "All new datasets", "count": len(scores)}]
    seen_values: set[str] = set()

    for pct_label, quantile in [("Top 75%", 0.25), ("Top 50%", 0.50), ("Top 25%", 0.75)]:
        threshold = scores.quantile(quantile)
        # quantile() yields None when there are no (non-null) scores; skip the
        # percentile options rather than crashing on float(None).
        if threshold is None or math.isnan(threshold):
            continue

        # Floor (not round) to two decimals: a rounded-up value would exclude
        # rows sitting exactly at the percentile once the string is used as
        # the filter threshold.
        value = f"{math.floor(float(threshold) * 100) / 100:.2f}"

        # Tightly clustered scores can collapse several percentiles onto the
        # same two-decimal value; duplicate <option> values would all be
        # marked selected, so keep only the first.
        if value in seen_values:
            continue
        seen_values.add(value)

        # Count against the emitted value so the count matches what the
        # filter will actually return.
        count = scores.filter(scores >= float(value)).len()
        options.append({
            "value": value,
            "label": pct_label,
            "count": int(count),
        })

    options.append({"value": "0", "label": "All papers", "count": len(df)})
    return options
162
 
163
 
164
  @lru_cache(maxsize=1)
 
380
  df = get_dataframe()
381
  categories = get_categories()
382
  histogram_data = get_histogram_data()
383
+ confidence_options = get_confidence_options()
384
 
385
  # Get stats
386
  total_papers = len(df)
 
394
  "total_papers": total_papers,
395
  "new_dataset_count": new_dataset_count,
396
  "histogram_data": histogram_data,
397
+ "confidence_options": confidence_options,
398
  # Pass filter state for URL persistence
399
  "search": search or "",
400
  "search_type": search_type,
templates/index.html CHANGED
@@ -84,10 +84,11 @@
84
  hx-include="#filter-form, #search-input, #category-select, #since-filter, #sort-select, #search-type-toggle"
85
  hx-indicator="#loading-indicator"
86
  hx-push-url="true">
87
- <option value="0.5" {% if min_confidence == '0.5' %}selected{% endif %}>All classified (&ge;50%)</option>
88
- <option value="0.9" {% if min_confidence == '0.9' %}selected{% endif %}>High confidence (&ge;90%)</option>
89
- <option value="0.99" {% if min_confidence == '0.99' %}selected{% endif %}>Very high (&ge;99%)</option>
90
- <option value="0" {% if min_confidence == '0' %}selected{% endif %}>All papers</option>
 
91
  </select>
92
 
93
  <!-- Since filter dropdown -->
 
84
  hx-include="#filter-form, #search-input, #category-select, #since-filter, #sort-select, #search-type-toggle"
85
  hx-indicator="#loading-indicator"
86
  hx-push-url="true">
87
+ {% for opt in confidence_options %}
88
+ <option value="{{ opt.value }}" {% if min_confidence == opt.value %}selected{% endif %}>
89
+ {{ opt.label }}
90
+ </option>
91
+ {% endfor %}
92
  </select>
93
 
94
  <!-- Since filter dropdown -->