Commit ·
0aea3fd
1
Parent(s): dab5e76
Use dynamic percentile-based confidence filter options
Browse files
Replaces hardcoded confidence thresholds (50/90/99%) with percentile-based
options (Top 75%/50%/25%) computed from actual data distribution. This
adapts automatically when the underlying model changes (e.g., v2 → v3).
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- app.py +20 -9
- templates/index.html +5 -4
app.py
CHANGED
|
@@ -138,18 +138,27 @@ def get_categories() -> list[str]:
|
|
| 138 |
|
| 139 |
|
| 140 |
@lru_cache(maxsize=1)
|
| 141 |
-
def
|
| 142 |
-
"""
|
| 143 |
|
| 144 |
-
|
| 145 |
"""
|
| 146 |
df = get_dataframe()
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
|
| 155 |
@lru_cache(maxsize=1)
|
|
@@ -371,6 +380,7 @@ async def home(
|
|
| 371 |
df = get_dataframe()
|
| 372 |
categories = get_categories()
|
| 373 |
histogram_data = get_histogram_data()
|
|
|
|
| 374 |
|
| 375 |
# Get stats
|
| 376 |
total_papers = len(df)
|
|
@@ -384,6 +394,7 @@ async def home(
|
|
| 384 |
"total_papers": total_papers,
|
| 385 |
"new_dataset_count": new_dataset_count,
|
| 386 |
"histogram_data": histogram_data,
|
|
|
|
| 387 |
# Pass filter state for URL persistence
|
| 388 |
"search": search or "",
|
| 389 |
"search_type": search_type,
|
|
|
|
| 138 |
|
| 139 |
|
| 140 |
@lru_cache(maxsize=1)
def get_confidence_options() -> list[dict]:
    """Compute confidence filter options from actual data distribution.

    Uses percentiles so the UI adapts to any model's score range.

    Returns:
        A list of option dicts, each with the keys ``"value"`` (the
        threshold as a string), ``"label"`` (text shown in the dropdown)
        and ``"count"`` (number of rows at or above the threshold).
        The list always starts with "All new datasets" and ends with
        "All papers"; the percentile options sit in between and are
        omitted when no confidence scores are available.

    Note:
        The result is memoized by ``lru_cache``, so every caller receives
        the same list object; callers must treat it as read-only.
    """
    df = get_dataframe()
    scores = df.filter(pl.col("is_new_dataset"))["confidence_score"]

    options = [{"value": "0.5", "label": "All new datasets", "count": len(scores)}]

    # "Top N%" keeps the rows at or above the (100 - N)th percentile.
    percentile_options = (("Top 75%", 0.25), ("Top 50%", 0.50), ("Top 25%", 0.75))
    for pct_label, quantile in percentile_options:
        raw_threshold = scores.quantile(quantile)
        if raw_threshold is None:
            # quantile() yields None when there are no non-null scores
            # (e.g. nothing is flagged as a new dataset); float(None) would
            # raise TypeError, so skip the percentile options entirely.
            break
        threshold = float(raw_threshold)
        count = scores.filter(scores >= threshold).len()
        options.append({
            # NOTE(review): the emitted value is rounded to two decimals while
            # the count uses the unrounded threshold; if the filter is applied
            # with the rounded value the two can differ slightly - confirm.
            "value": f"{threshold:.2f}",
            "label": pct_label,
            "count": int(count),
        })

    options.append({"value": "0", "label": "All papers", "count": len(df)})
    return options
|
| 162 |
|
| 163 |
|
| 164 |
@lru_cache(maxsize=1)
|
|
|
|
| 380 |
df = get_dataframe()
|
| 381 |
categories = get_categories()
|
| 382 |
histogram_data = get_histogram_data()
|
| 383 |
+
confidence_options = get_confidence_options()
|
| 384 |
|
| 385 |
# Get stats
|
| 386 |
total_papers = len(df)
|
|
|
|
| 394 |
"total_papers": total_papers,
|
| 395 |
"new_dataset_count": new_dataset_count,
|
| 396 |
"histogram_data": histogram_data,
|
| 397 |
+
"confidence_options": confidence_options,
|
| 398 |
# Pass filter state for URL persistence
|
| 399 |
"search": search or "",
|
| 400 |
"search_type": search_type,
|
templates/index.html
CHANGED
|
@@ -84,10 +84,11 @@
|
|
| 84 |
hx-include="#filter-form, #search-input, #category-select, #since-filter, #sort-select, #search-type-toggle"
|
| 85 |
hx-indicator="#loading-indicator"
|
| 86 |
hx-push-url="true">
|
| 87 |
-
|
| 88 |
-
<option value="
|
| 89 |
-
|
| 90 |
-
|
|
|
|
| 91 |
</select>
|
| 92 |
|
| 93 |
<!-- Since filter dropdown -->
|
|
|
|
| 84 |
hx-include="#filter-form, #search-input, #category-select, #since-filter, #sort-select, #search-type-toggle"
|
| 85 |
hx-indicator="#loading-indicator"
|
| 86 |
hx-push-url="true">
|
| 87 |
+
{% for opt in confidence_options %}
|
| 88 |
+
<option value="{{ opt.value }}" {% if min_confidence == opt.value %}selected{% endif %}>
|
| 89 |
+
{{ opt.label }}
|
| 90 |
+
</option>
|
| 91 |
+
{% endfor %}
|
| 92 |
</select>
|
| 93 |
|
| 94 |
<!-- Since filter dropdown -->
|