Commit ·
fa3f012
1
Parent(s): 0aea3fd
Fix: filter on is_new_dataset when showing dataset papers
Browse files
confidence_score stores the max class probability (always >= 0.5 for
binary classification), so filtering on confidence >= 0.5 returned all
212K papers instead of just the 49K new dataset papers. Now requires
is_new_dataset = True when confidence threshold >= 0.5.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
app.py
CHANGED
|
@@ -268,7 +268,12 @@ def filter_papers(
|
|
| 268 |
- Setting threshold to 0 shows all papers
|
| 269 |
- Setting threshold >= 0.5 effectively shows only new_dataset papers
|
| 270 |
"""
|
| 271 |
-
if min_confidence > 0:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
df = df.filter(pl.col("confidence_score") >= min_confidence)
|
| 273 |
|
| 274 |
if category:
|
|
@@ -334,7 +339,10 @@ def semantic_search(
|
|
| 334 |
|
| 335 |
# Build SQL filter (Lance supports SQL-like syntax)
|
| 336 |
filters = []
|
| 337 |
-
if min_confidence > 0:
|
|
|
|
|
|
|
|
|
|
| 338 |
filters.append(f"confidence_score >= {min_confidence}")
|
| 339 |
if category:
|
| 340 |
# Escape single quotes in category name for SQL safety
|
|
|
|
| 268 |
- Setting threshold to 0 shows all papers
|
| 269 |
- Setting threshold >= 0.5 effectively shows only new_dataset papers
|
| 270 |
"""
|
| 271 |
+
if min_confidence >= 0.5:
|
| 272 |
+
# Show only papers classified as new datasets, filtered by confidence
|
| 273 |
+
df = df.filter(
|
| 274 |
+
pl.col("is_new_dataset") & (pl.col("confidence_score") >= min_confidence)
|
| 275 |
+
)
|
| 276 |
+
elif min_confidence > 0:
|
| 277 |
df = df.filter(pl.col("confidence_score") >= min_confidence)
|
| 278 |
|
| 279 |
if category:
|
|
|
|
| 339 |
|
| 340 |
# Build SQL filter (Lance supports SQL-like syntax)
|
| 341 |
filters = []
|
| 342 |
+
if min_confidence >= 0.5:
|
| 343 |
+
filters.append("is_new_dataset = true")
|
| 344 |
+
filters.append(f"confidence_score >= {min_confidence}")
|
| 345 |
+
elif min_confidence > 0:
|
| 346 |
filters.append(f"confidence_score >= {min_confidence}")
|
| 347 |
if category:
|
| 348 |
# Escape single quotes in category name for SQL safety
|