davanstrien HF Staff committed on
Commit
70197b9
·
verified ·
1 Parent(s): caffb25

Dynamic confidence thresholds: percentile-based dropdown adapts to any model

Browse files
Dockerfile CHANGED
@@ -1,26 +1,35 @@
1
  FROM python:3.11-slim
2
 
3
- WORKDIR /app
 
4
 
5
- # Enable HF transfer for faster downloads
6
  ENV HF_HUB_ENABLE_HF_TRANSFER=1
7
 
8
- # Install uv for fast dependency management
9
- RUN pip install --no-cache-dir uv
 
 
 
 
 
10
 
11
- # Copy and install dependencies
12
- COPY requirements.txt .
13
- RUN uv pip install --system --no-cache -r requirements.txt
14
-
15
- # Copy application
16
- COPY . .
17
-
18
- # Create non-root user for security
19
  RUN useradd -m -u 1000 user
20
  USER user
21
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  # HF Spaces expects port 7860
23
- ENV PORT=7860
24
  EXPOSE 7860
25
 
26
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
  FROM python:3.11-slim
2
 
3
+ # Install uv from official image (fast, no pip bootstrap needed)
4
+ COPY --from=ghcr.io/astral-sh/uv:0.9.30 /uv /bin/uv
5
 
 
6
  ENV HF_HUB_ENABLE_HF_TRANSFER=1
7
 
8
+ # Install dependencies from requirements.in - resolved on the target platform,
9
+ # no cross-compilation flags needed. CPU-only torch via extra index.
10
+ COPY requirements.in /tmp/requirements.in
11
+ RUN --mount=type=cache,target=/root/.cache/uv \
12
+ uv pip install --system \
13
+ --extra-index-url https://download.pytorch.org/whl/cpu \
14
+ -r /tmp/requirements.in
15
 
16
+ # Create non-root user (HF Spaces runs as user ID 1000)
 
 
 
 
 
 
 
17
  RUN useradd -m -u 1000 user
18
  USER user
19
 
20
+ # Set home and path
21
+ ENV HOME=/home/user \
22
+ PATH=/home/user/.local/bin:$PATH
23
+
24
+ WORKDIR $HOME/app
25
+
26
+ # Copy application (as user)
27
+ COPY --chown=user . .
28
+
29
+ # Create data directory for dataset download
30
+ RUN mkdir -p $HOME/app/data
31
+
32
  # HF Spaces expects port 7860
 
33
  EXPOSE 7860
34
 
35
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py CHANGED
@@ -3,6 +3,7 @@ FastAPI + HTMX app for browsing arxiv papers with new ML datasets.
3
  Downloads Lance dataset from HuggingFace Hub and loads locally.
4
  """
5
 
 
6
  import re
7
  from datetime import date, timedelta
8
  from functools import lru_cache
@@ -46,9 +47,18 @@ def highlight_search(text: str, search: str) -> Markup:
46
  return Markup(highlighted)
47
 
48
 
49
- # Register custom filter
50
  templates.env.filters["highlight"] = highlight_search
51
 
 
 
 
 
 
 
 
 
 
52
  # Dataset config
53
  DATASET_REPO = "librarian-bots/arxiv-cs-papers-lance"
54
 
@@ -65,8 +75,10 @@ _model_cache: dict = {}
65
  def get_lance_dataset():
66
  """Download dataset from HF Hub (cached) and return Lance connection."""
67
  if "ds" not in _lance_cache:
68
- # Use local_dir to get actual files, not symlinks (Lance needs real files)
69
- local_dir = "./data/arxiv-lance"
 
 
70
  print(f"Downloading dataset from {DATASET_REPO} to {local_dir}...")
71
  snapshot_download(
72
  DATASET_REPO,
@@ -126,18 +138,27 @@ def get_categories() -> list[str]:
126
 
127
 
128
  @lru_cache(maxsize=1)
129
- def get_confidence_counts() -> dict[str, int]:
130
- """Count papers at each confidence threshold (for Tufte-style filter).
131
 
132
- Thresholds chosen based on actual data distribution (avg ~70% confidence).
133
  """
134
  df = get_dataframe()
135
- new_datasets = df.filter(pl.col("is_new_dataset"))
136
- thresholds = [0.5, 0.6, 0.65, 0.7, 0.71]
137
- return {
138
- str(t): new_datasets.filter(pl.col("confidence_score") >= t).height
139
- for t in thresholds
140
- }
 
 
 
 
 
 
 
 
 
141
 
142
 
143
  @lru_cache(maxsize=1)
@@ -359,6 +380,7 @@ async def home(
359
  df = get_dataframe()
360
  categories = get_categories()
361
  histogram_data = get_histogram_data()
 
362
 
363
  # Get stats
364
  total_papers = len(df)
@@ -372,6 +394,7 @@ async def home(
372
  "total_papers": total_papers,
373
  "new_dataset_count": new_dataset_count,
374
  "histogram_data": histogram_data,
 
375
  # Pass filter state for URL persistence
376
  "search": search or "",
377
  "search_type": search_type,
 
3
  Downloads Lance dataset from HuggingFace Hub and loads locally.
4
  """
5
 
6
+ import math
7
  import re
8
  from datetime import date, timedelta
9
  from functools import lru_cache
 
47
  return Markup(highlighted)
48
 
49
 
50
+ # Register custom filters
51
  templates.env.filters["highlight"] = highlight_search
52
 
53
+
54
def confidence_fmt(score):
    """Render a 0-1 confidence score as a percentage string with one decimal.

    The value is truncated (floored), not rounded, at the first decimal place,
    so a score such as 0.9995 is shown as "99.9" instead of being rounded up
    to "100.0".
    """
    # Work in tenths of a percent so the floor lands on the first decimal.
    tenths_of_percent = math.floor(score * 1000)
    return format(tenths_of_percent / 10, ".1f")
58
+
59
+
60
+ templates.env.filters["confidence"] = confidence_fmt
61
+
62
  # Dataset config
63
  DATASET_REPO = "librarian-bots/arxiv-cs-papers-lance"
64
 
 
75
  def get_lance_dataset():
76
  """Download dataset from HF Hub (cached) and return Lance connection."""
77
  if "ds" not in _lance_cache:
78
+ import os
79
+ # Use HF_HOME or /tmp for Spaces compatibility (./data not writable on Spaces)
80
+ cache_base = os.environ.get("HF_HOME", "/tmp/hf_cache")
81
+ local_dir = f"{cache_base}/arxiv-lance"
82
  print(f"Downloading dataset from {DATASET_REPO} to {local_dir}...")
83
  snapshot_download(
84
  DATASET_REPO,
 
138
 
139
 
140
  @lru_cache(maxsize=1)
141
def get_confidence_options() -> list[dict]:
    """Compute confidence filter options from the actual data distribution.

    Percentile cut-offs are derived from the confidence scores of rows flagged
    ``is_new_dataset`` so the dropdown adapts to any model's score range.

    Returns:
        A list of dicts with keys ``value`` (the string submitted by the
        dropdown), ``label`` (display text) and ``count`` (number of rows
        matching that option). The list always starts with "All new datasets"
        and ends with "All papers"; percentile options sit in between.
    """
    df = get_dataframe()
    scores = df.filter(pl.col("is_new_dataset"))["confidence_score"]

    options = [{"value": "0.5", "label": "All new datasets", "count": len(scores)}]
    seen_values = set()

    for pct_label, quantile in [("Top 75%", 0.25), ("Top 50%", 0.50), ("Top 25%", 0.75)]:
        threshold = scores.quantile(quantile)
        if threshold is None:
            # No scores available (e.g. no new-dataset rows): there is no
            # distribution to take percentiles of, so offer only the fixed
            # options instead of failing on float(None).
            break

        value = f"{float(threshold):.2f}"
        if value in seen_values:
            # Two percentiles that round to the same string would yield
            # duplicate <option> values, which cannot be told apart when
            # deciding which one is selected.
            continue
        seen_values.add(value)

        # Count against the rounded value, because that string is what the
        # dropdown submits; counting with the unrounded threshold could
        # disagree with the number of rows the filter returns.
        cutoff = float(value)
        count = scores.filter(scores >= cutoff).len()
        options.append({
            "value": value,
            "label": pct_label,
            "count": int(count),
        })

    options.append({"value": "0", "label": "All papers", "count": len(df)})
    return options
162
 
163
 
164
  @lru_cache(maxsize=1)
 
380
  df = get_dataframe()
381
  categories = get_categories()
382
  histogram_data = get_histogram_data()
383
+ confidence_options = get_confidence_options()
384
 
385
  # Get stats
386
  total_papers = len(df)
 
394
  "total_papers": total_papers,
395
  "new_dataset_count": new_dataset_count,
396
  "histogram_data": histogram_data,
397
+ "confidence_options": confidence_options,
398
  # Pass filter state for URL persistence
399
  "search": search or "",
400
  "search_type": search_type,
requirements.in ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ pylance
3
+ polars
4
+ cachetools
5
+ python-dotenv
6
+ Jinja2
7
+ markupsafe
8
+ huggingface-hub
9
+ sentence-transformers
10
+ uvicorn
requirements.txt CHANGED
@@ -1,11 +1,203 @@
1
- fastapi
2
- uvicorn[standard]
3
- jinja2
4
- markupsafe
5
- polars
6
- huggingface-hub[hf_transfer]
7
- python-dotenv
8
- cachetools
9
- pyarrow
10
- pylance>=0.20
11
- sentence-transformers>=3.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file was autogenerated by uv via the following command:
2
+ # uv pip compile requirements.in -o requirements.txt --python-platform linux --python-version 3.11
3
+ annotated-doc==0.0.4
4
+ # via fastapi
5
+ annotated-types==0.7.0
6
+ # via pydantic
7
+ anyio==4.12.1
8
+ # via
9
+ # httpx
10
+ # starlette
11
+ cachetools==7.0.0
12
+ # via -r requirements.in
13
+ certifi==2026.1.4
14
+ # via
15
+ # httpcore
16
+ # httpx
17
+ click==8.3.1
18
+ # via
19
+ # typer-slim
20
+ # uvicorn
21
+ cuda-bindings==12.9.4
22
+ # via torch
23
+ cuda-pathfinder==1.3.3
24
+ # via cuda-bindings
25
+ fastapi==0.128.1
26
+ # via -r requirements.in
27
+ filelock==3.20.3
28
+ # via
29
+ # huggingface-hub
30
+ # torch
31
+ # transformers
32
+ fsspec==2026.1.0
33
+ # via
34
+ # huggingface-hub
35
+ # torch
36
+ h11==0.16.0
37
+ # via
38
+ # httpcore
39
+ # uvicorn
40
+ hf-xet==1.2.0
41
+ # via huggingface-hub
42
+ httpcore==1.0.9
43
+ # via httpx
44
+ httpx==0.28.1
45
+ # via huggingface-hub
46
+ huggingface-hub==1.4.0
47
+ # via
48
+ # -r requirements.in
49
+ # sentence-transformers
50
+ # tokenizers
51
+ # transformers
52
+ idna==3.11
53
+ # via
54
+ # anyio
55
+ # httpx
56
+ jinja2==3.1.6
57
+ # via
58
+ # -r requirements.in
59
+ # torch
60
+ joblib==1.5.3
61
+ # via scikit-learn
62
+ lance-namespace==0.4.5
63
+ # via pylance
64
+ lance-namespace-urllib3-client==0.4.5
65
+ # via lance-namespace
66
+ markupsafe==3.0.3
67
+ # via
68
+ # -r requirements.in
69
+ # jinja2
70
+ mpmath==1.3.0
71
+ # via sympy
72
+ networkx==3.6.1
73
+ # via torch
74
+ numpy==2.4.2
75
+ # via
76
+ # pylance
77
+ # scikit-learn
78
+ # scipy
79
+ # sentence-transformers
80
+ # transformers
81
+ nvidia-cublas-cu12==12.8.4.1
82
+ # via
83
+ # nvidia-cudnn-cu12
84
+ # nvidia-cusolver-cu12
85
+ # torch
86
+ nvidia-cuda-cupti-cu12==12.8.90
87
+ # via torch
88
+ nvidia-cuda-nvrtc-cu12==12.8.93
89
+ # via torch
90
+ nvidia-cuda-runtime-cu12==12.8.90
91
+ # via torch
92
+ nvidia-cudnn-cu12==9.10.2.21
93
+ # via torch
94
+ nvidia-cufft-cu12==11.3.3.83
95
+ # via torch
96
+ nvidia-cufile-cu12==1.13.1.3
97
+ # via torch
98
+ nvidia-curand-cu12==10.3.9.90
99
+ # via torch
100
+ nvidia-cusolver-cu12==11.7.3.90
101
+ # via torch
102
+ nvidia-cusparse-cu12==12.5.8.93
103
+ # via
104
+ # nvidia-cusolver-cu12
105
+ # torch
106
+ nvidia-cusparselt-cu12==0.7.1
107
+ # via torch
108
+ nvidia-nccl-cu12==2.27.5
109
+ # via torch
110
+ nvidia-nvjitlink-cu12==12.8.93
111
+ # via
112
+ # nvidia-cufft-cu12
113
+ # nvidia-cusolver-cu12
114
+ # nvidia-cusparse-cu12
115
+ # torch
116
+ nvidia-nvshmem-cu12==3.4.5
117
+ # via torch
118
+ nvidia-nvtx-cu12==12.8.90
119
+ # via torch
120
+ packaging==26.0
121
+ # via
122
+ # huggingface-hub
123
+ # transformers
124
+ polars==1.38.0
125
+ # via -r requirements.in
126
+ polars-runtime-32==1.38.0
127
+ # via polars
128
+ pyarrow==23.0.0
129
+ # via pylance
130
+ pydantic==2.12.5
131
+ # via
132
+ # fastapi
133
+ # lance-namespace-urllib3-client
134
+ pydantic-core==2.41.5
135
+ # via pydantic
136
+ pylance==1.0.4
137
+ # via -r requirements.in
138
+ python-dateutil==2.9.0.post0
139
+ # via lance-namespace-urllib3-client
140
+ python-dotenv==1.2.1
141
+ # via -r requirements.in
142
+ pyyaml==6.0.3
143
+ # via
144
+ # huggingface-hub
145
+ # transformers
146
+ regex==2026.1.15
147
+ # via transformers
148
+ safetensors==0.7.0
149
+ # via transformers
150
+ scikit-learn==1.8.0
151
+ # via sentence-transformers
152
+ scipy==1.17.0
153
+ # via
154
+ # scikit-learn
155
+ # sentence-transformers
156
+ sentence-transformers==5.2.2
157
+ # via -r requirements.in
158
+ shellingham==1.5.4
159
+ # via huggingface-hub
160
+ six==1.17.0
161
+ # via python-dateutil
162
+ starlette==0.50.0
163
+ # via fastapi
164
+ sympy==1.14.0
165
+ # via torch
166
+ threadpoolctl==3.6.0
167
+ # via scikit-learn
168
+ tokenizers==0.22.2
169
+ # via transformers
170
+ torch==2.10.0
171
+ # via sentence-transformers
172
+ tqdm==4.67.3
173
+ # via
174
+ # huggingface-hub
175
+ # sentence-transformers
176
+ # transformers
177
+ transformers==5.0.0
178
+ # via sentence-transformers
179
+ triton==3.6.0
180
+ # via torch
181
+ typer-slim==0.21.1
182
+ # via
183
+ # huggingface-hub
184
+ # transformers
185
+ typing-extensions==4.15.0
186
+ # via
187
+ # anyio
188
+ # fastapi
189
+ # huggingface-hub
190
+ # lance-namespace-urllib3-client
191
+ # pydantic
192
+ # pydantic-core
193
+ # sentence-transformers
194
+ # starlette
195
+ # torch
196
+ # typer-slim
197
+ # typing-inspection
198
+ typing-inspection==0.4.2
199
+ # via pydantic
200
+ urllib3==2.6.3
201
+ # via lance-namespace-urllib3-client
202
+ uvicorn==0.40.0
203
+ # via -r requirements.in
templates/base.html CHANGED
@@ -11,7 +11,12 @@
11
  <!-- HTMX -->
12
  <script src="https://unpkg.com/htmx.org@1.9.12"></script>
13
 
 
 
 
14
  <style>
 
 
15
  /* Loading indicator - subtle */
16
  .htmx-indicator { display: none; }
17
  .htmx-request .htmx-indicator,
@@ -35,7 +40,7 @@
35
 
36
  <footer class="border-t border-gray-100 mt-12">
37
  <div class="max-w-3xl mx-auto px-4 py-4 text-gray-400 text-xs">
38
- <a href="https://huggingface.co/datasets/davanstrien/my-classified-papers" class="hover:text-gray-600">Data source</a>
39
  <span class="mx-2">·</span>
40
  <a href="https://huggingface.co/davanstrien/ModernBERT-base-is-new-arxiv-dataset" class="hover:text-gray-600">Model</a>
41
  </div>
 
11
  <!-- HTMX -->
12
  <script src="https://unpkg.com/htmx.org@1.9.12"></script>
13
 
14
+ <!-- Alpine.js (for expand/collapse) -->
15
+ <script defer src="https://unpkg.com/alpinejs@3.x.x/dist/cdn.min.js"></script>
16
+
17
  <style>
18
+ /* Alpine.js cloak (hide until loaded) */
19
+ [x-cloak] { display: none !important; }
20
  /* Loading indicator - subtle */
21
  .htmx-indicator { display: none; }
22
  .htmx-request .htmx-indicator,
 
40
 
41
  <footer class="border-t border-gray-100 mt-12">
42
  <div class="max-w-3xl mx-auto px-4 py-4 text-gray-400 text-xs">
43
+ <a href="https://huggingface.co/datasets/librarian-bots/arxiv-cs-papers-lance" class="hover:text-gray-600">Data source</a>
44
  <span class="mx-2">·</span>
45
  <a href="https://huggingface.co/davanstrien/ModernBERT-base-is-new-arxiv-dataset" class="hover:text-gray-600">Model</a>
46
  </div>
templates/index.html CHANGED
@@ -84,9 +84,11 @@
84
  hx-include="#filter-form, #search-input, #category-select, #since-filter, #sort-select, #search-type-toggle"
85
  hx-indicator="#loading-indicator"
86
  hx-push-url="true">
87
- <option value="0.5" {% if min_confidence == '0.5' %}selected{% endif %}>New datasets only</option>
88
- <option value="0.6" {% if min_confidence == '0.6' %}selected{% endif %}>Higher confidence</option>
89
- <option value="0" {% if min_confidence == '0' %}selected{% endif %}>All papers</option>
 
 
90
  </select>
91
 
92
  <!-- Since filter dropdown -->
 
84
  hx-include="#filter-form, #search-input, #category-select, #since-filter, #sort-select, #search-type-toggle"
85
  hx-indicator="#loading-indicator"
86
  hx-push-url="true">
87
+ {% for opt in confidence_options %}
88
+ <option value="{{ opt.value }}" {% if min_confidence == opt.value %}selected{% endif %}>
89
+ {{ opt.label }}
90
+ </option>
91
+ {% endfor %}
92
  </select>
93
 
94
  <!-- Since filter dropdown -->
templates/partials/paper_card.html CHANGED
@@ -36,7 +36,7 @@
36
  </span>
37
  </span>
38
  <span class="text-gray-400 inline-flex items-center gap-1">
39
- {{ "%.0f"|format(paper.confidence_score * 100) }}% conf.
40
  <span class="cursor-help" title="Model confidence this paper introduces a new dataset">
41
  <svg class="w-3.5 h-3.5 text-gray-300" fill="none" stroke="currentColor" viewBox="0 0 24 24">
42
  <circle cx="12" cy="12" r="10" stroke-width="1.5"></circle>
@@ -47,7 +47,7 @@
47
  </span>
48
  {% else %}
49
  <span class="{% if paper.confidence_score < 0.8 %}text-gray-400{% else %}text-gray-500{% endif %} inline-flex items-center gap-1">
50
- {{ "%.0f"|format(paper.confidence_score * 100) }}% conf.
51
  <span class="cursor-help" title="Model confidence this paper introduces a new dataset">
52
  <svg class="w-3.5 h-3.5 {% if paper.confidence_score < 0.8 %}text-gray-300{% else %}text-gray-400{% endif %}" fill="none" stroke="currentColor" viewBox="0 0 24 24">
53
  <circle cx="12" cy="12" r="10" stroke-width="1.5"></circle>
@@ -59,12 +59,34 @@
59
  {% endif %}
60
  </div>
61
 
62
- <!-- Abstract (truncated) -->
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  <p class="mt-2 text-gray-600 text-sm leading-relaxed">
64
- {% if search %}
65
- {{ paper.abstract[:400]|highlight(search) }}{% if paper.abstract|length > 400 %}...{% endif %}
66
- {% else %}
67
- {{ paper.abstract[:400] }}{% if paper.abstract|length > 400 %}...{% endif %}
68
- {% endif %}
69
  </p>
 
70
  </article>
 
36
  </span>
37
  </span>
38
  <span class="text-gray-400 inline-flex items-center gap-1">
39
+ {{ paper.confidence_score|confidence }}% conf.
40
  <span class="cursor-help" title="Model confidence this paper introduces a new dataset">
41
  <svg class="w-3.5 h-3.5 text-gray-300" fill="none" stroke="currentColor" viewBox="0 0 24 24">
42
  <circle cx="12" cy="12" r="10" stroke-width="1.5"></circle>
 
47
  </span>
48
  {% else %}
49
  <span class="{% if paper.confidence_score < 0.8 %}text-gray-400{% else %}text-gray-500{% endif %} inline-flex items-center gap-1">
50
+ {{ paper.confidence_score|confidence }}% conf.
51
  <span class="cursor-help" title="Model confidence this paper introduces a new dataset">
52
  <svg class="w-3.5 h-3.5 {% if paper.confidence_score < 0.8 %}text-gray-300{% else %}text-gray-400{% endif %}" fill="none" stroke="currentColor" viewBox="0 0 24 24">
53
  <circle cx="12" cy="12" r="10" stroke-width="1.5"></circle>
 
59
  {% endif %}
60
  </div>
61
 
62
+ <!-- Authors -->
63
+ {% if paper.authors %}
64
+ {% set author_list = paper.authors.split(', ') %}
65
+ <p class="mt-1 text-sm text-gray-500">
66
+ {{ author_list[:3]|join(', ') }}{% if author_list|length > 3 %} <span class="text-gray-400">et al.</span>{% endif %}
67
+ </p>
68
+ {% endif %}
69
+
70
+ <!-- Abstract (expandable) -->
71
+ {% if paper.abstract|length > 400 %}
72
+ <div class="mt-2" x-data="{ expanded: false }">
73
+ <p class="text-gray-600 text-sm leading-relaxed">
74
+ <span x-show="!expanded">
75
+ {% if search %}{{ paper.abstract[:400]|highlight(search) }}{% else %}{{ paper.abstract[:400] }}{% endif %}…
76
+ </span>
77
+ <span x-show="expanded" x-cloak>
78
+ {% if search %}{{ paper.abstract|highlight(search) }}{% else %}{{ paper.abstract }}{% endif %}
79
+ </span>
80
+ <button @click="expanded = !expanded"
81
+ class="ml-1 text-gray-400 hover:text-gray-600 text-sm">
82
+ <span x-show="!expanded">Show more</span>
83
+ <span x-show="expanded" x-cloak>Show less</span>
84
+ </button>
85
+ </p>
86
+ </div>
87
+ {% else %}
88
  <p class="mt-2 text-gray-600 text-sm leading-relaxed">
89
+ {% if search %}{{ paper.abstract|highlight(search) }}{% else %}{{ paper.abstract }}{% endif %}
 
 
 
 
90
  </p>
91
+ {% endif %}
92
  </article>