davanstrien HF Staff Claude Opus 4.6 committed on
Commit
fa3f012
·
1 Parent(s): 0aea3fd

Fix: filter on is_new_dataset when showing dataset papers

Browse files

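confidence_score stores the max class probability (always >= 0.5 for
binary classification), so filtering on confidence >= 0.5 returned all
212K papers instead of just the 49K new dataset papers. When the
confidence threshold is >= 0.5, the filter now also requires
is_new_dataset = True in addition to confidence_score >= threshold;
thresholds between 0 and 0.5 still filter on confidence_score alone.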

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +10 -2
app.py CHANGED
@@ -268,7 +268,12 @@ def filter_papers(
268
  - Setting threshold to 0 shows all papers
269
  - Setting threshold >= 0.5 effectively shows only new_dataset papers
270
  """
271
- if min_confidence > 0:
 
 
 
 
 
272
  df = df.filter(pl.col("confidence_score") >= min_confidence)
273
 
274
  if category:
@@ -334,7 +339,10 @@ def semantic_search(
334
 
335
  # Build SQL filter (Lance supports SQL-like syntax)
336
  filters = []
337
- if min_confidence > 0:
 
 
 
338
  filters.append(f"confidence_score >= {min_confidence}")
339
  if category:
340
  # Escape single quotes in category name for SQL safety
 
268
  - Setting threshold to 0 shows all papers
269
  - Setting threshold >= 0.5 effectively shows only new_dataset papers
270
  """
271
+ if min_confidence >= 0.5:
272
+ # Show only papers classified as new datasets, filtered by confidence
273
+ df = df.filter(
274
+ pl.col("is_new_dataset") & (pl.col("confidence_score") >= min_confidence)
275
+ )
276
+ elif min_confidence > 0:
277
  df = df.filter(pl.col("confidence_score") >= min_confidence)
278
 
279
  if category:
 
339
 
340
  # Build SQL filter (Lance supports SQL-like syntax)
341
  filters = []
342
+ if min_confidence >= 0.5:
343
+ filters.append("is_new_dataset = true")
344
+ filters.append(f"confidence_score >= {min_confidence}")
345
+ elif min_confidence > 0:
346
  filters.append(f"confidence_score >= {min_confidence}")
347
  if category:
348
  # Escape single quotes in category name for SQL safety