Datasets-in-machine-learning

Sleeping

davanstrien HF Staff Claude Opus 4.6 committed 23 days ago

Commit

0aea3fd

1 Parent(s): dab5e76

Use dynamic percentile-based confidence filter options

Replaces hardcoded confidence thresholds (50/90/99%) with percentile-based
options (Top 75%/50%/25%) computed from actual data distribution. This
adapts automatically when the underlying model changes (e.g., v2 → v3).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (2) hide show

app.py +20 -9
templates/index.html +5 -4

app.py CHANGED Viewed

@@ -138,18 +138,27 @@ def get_categories() -> list[str]:
 @lru_cache(maxsize=1)
-def get_confidence_counts() -> dict[str, int]:
-    """Count papers at each confidence threshold (for Tufte-style filter).
-    Thresholds chosen based on v2 model distribution (avg ~97% confidence).
     """
     df = get_dataframe()
-    new_datasets = df.filter(pl.col("is_new_dataset"))
-    thresholds = [0.5, 0.8, 0.9, 0.95, 0.99]
-    return {
-        str(t): new_datasets.filter(pl.col("confidence_score") >= t).height
-        for t in thresholds
-    }
 @lru_cache(maxsize=1)
@@ -371,6 +380,7 @@ async def home(
     df = get_dataframe()
     categories = get_categories()
     histogram_data = get_histogram_data()
     # Get stats
     total_papers = len(df)
@@ -384,6 +394,7 @@ async def home(
             "total_papers": total_papers,
             "new_dataset_count": new_dataset_count,
             "histogram_data": histogram_data,
             # Pass filter state for URL persistence
             "search": search or "",
             "search_type": search_type,

 @lru_cache(maxsize=1)
+def get_confidence_options() -> list[dict]:
+    """Compute confidence filter options from actual data distribution.
+    Uses percentiles so the UI adapts to any model's score range.
     """
     df = get_dataframe()
+    scores = df.filter(pl.col("is_new_dataset"))["confidence_score"]
+    options = [{"value": "0.5", "label": "All new datasets", "count": len(scores)}]
+    for pct_label, quantile in [("Top 75%", 0.25), ("Top 50%", 0.50), ("Top 25%", 0.75)]:
+        threshold = float(scores.quantile(quantile))
+        count = scores.filter(scores >= threshold).len()
+        options.append({
+            "value": f"{threshold:.2f}",
+            "label": pct_label,
+            "count": int(count),
+        })
+    options.append({"value": "0", "label": "All papers", "count": len(df)})
+    return options
 @lru_cache(maxsize=1)
     df = get_dataframe()
     categories = get_categories()
     histogram_data = get_histogram_data()
+    confidence_options = get_confidence_options()
     # Get stats
     total_papers = len(df)
             "total_papers": total_papers,
             "new_dataset_count": new_dataset_count,
             "histogram_data": histogram_data,
+            "confidence_options": confidence_options,
             # Pass filter state for URL persistence
             "search": search or "",
             "search_type": search_type,

templates/index.html CHANGED Viewed

@@ -84,10 +84,11 @@
                 hx-include="#filter-form, #search-input, #category-select, #since-filter, #sort-select, #search-type-toggle"
                 hx-indicator="#loading-indicator"
                 hx-push-url="true">
-            <option value="0.5" {% if min_confidence == '0.5' %}selected{% endif %}>All classified (&ge;50%)</option>
-            <option value="0.9" {% if min_confidence == '0.9' %}selected{% endif %}>High confidence (&ge;90%)</option>
-            <option value="0.99" {% if min_confidence == '0.99' %}selected{% endif %}>Very high (&ge;99%)</option>
-            <option value="0" {% if min_confidence == '0' %}selected{% endif %}>All papers</option>
         </select>
         <!-- Since filter dropdown -->

                 hx-include="#filter-form, #search-input, #category-select, #since-filter, #sort-select, #search-type-toggle"
                 hx-indicator="#loading-indicator"
                 hx-push-url="true">
+            {% for opt in confidence_options %}
+            <option value="{{ opt.value }}" {% if min_confidence == opt.value %}selected{% endif %}>
+                {{ opt.label }}
+            </option>
+            {% endfor %}
         </select>
         <!-- Since filter dropdown -->