Spaces:

davanstrien
/

arxiv-new-datasets

Sleeping

App Files Files Community

davanstrien HF Staff committed on Feb 5

Commit

70197b9

verified ·

1 Parent(s): caffb25

Dynamic confidence thresholds: percentile-based dropdown adapts to any model

Browse files

Files changed (7) hide show

Dockerfile +22 -13
app.py +35 -12
requirements.in +10 -0
requirements.txt +203 -11
templates/base.html +6 -1
templates/index.html +5 -3
templates/partials/paper_card.html +30 -8

Dockerfile CHANGED Viewed

@@ -1,26 +1,35 @@
 FROM python:3.11-slim
-WORKDIR /app
-# Enable HF transfer for faster downloads
 ENV HF_HUB_ENABLE_HF_TRANSFER=1
-# Install uv for fast dependency management
-RUN pip install --no-cache-dir uv
-# Copy and install dependencies
-COPY requirements.txt .
-RUN uv pip install --system --no-cache -r requirements.txt
-# Copy application
-COPY . .
-# Create non-root user for security
 RUN useradd -m -u 1000 user
 USER user
 # HF Spaces expects port 7860
-ENV PORT=7860
 EXPOSE 7860
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

 FROM python:3.11-slim
+# Install uv from official image (fast, no pip bootstrap needed)
+COPY --from=ghcr.io/astral-sh/uv:0.9.30 /uv /bin/uv
 ENV HF_HUB_ENABLE_HF_TRANSFER=1
+# Install dependencies from requirements.in - resolved on the target platform,
+# no cross-compilation flags needed. CPU-only torch via extra index.
+COPY requirements.in /tmp/requirements.in
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system \
+    --extra-index-url https://download.pytorch.org/whl/cpu \
+    -r /tmp/requirements.in
+# Create non-root user (HF Spaces runs as user ID 1000)
 RUN useradd -m -u 1000 user
 USER user
+# Set home and path
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+WORKDIR $HOME/app
+# Copy application (as user)
+COPY --chown=user . .
+# Create data directory for dataset download
+# NOTE(review): get_lance_dataset() in app.py now downloads to $HF_HOME or
+# /tmp/hf_cache instead of ./data, so this directory may no longer be used - confirm.
+RUN mkdir -p $HOME/app/data
 # HF Spaces expects port 7860
 EXPOSE 7860
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ FastAPI + HTMX app for browsing arxiv papers with new ML datasets.
 Downloads Lance dataset from HuggingFace Hub and loads locally.
 """
 import re
 from datetime import date, timedelta
 from functools import lru_cache
@@ -46,9 +47,18 @@ def highlight_search(text: str, search: str) -> Markup:
     return Markup(highlighted)
-# Register custom filter
 templates.env.filters["highlight"] = highlight_search
 # Dataset config
 DATASET_REPO = "librarian-bots/arxiv-cs-papers-lance"
@@ -65,8 +75,10 @@ _model_cache: dict = {}
 def get_lance_dataset():
     """Download dataset from HF Hub (cached) and return Lance connection."""
     if "ds" not in _lance_cache:
-        # Use local_dir to get actual files, not symlinks (Lance needs real files)
-        local_dir = "./data/arxiv-lance"
         print(f"Downloading dataset from {DATASET_REPO} to {local_dir}...")
         snapshot_download(
             DATASET_REPO,
@@ -126,18 +138,27 @@ def get_categories() -> list[str]:
 @lru_cache(maxsize=1)
-def get_confidence_counts() -> dict[str, int]:
-    """Count papers at each confidence threshold (for Tufte-style filter).
-    Thresholds chosen based on actual data distribution (avg ~70% confidence).
     """
     df = get_dataframe()
-    new_datasets = df.filter(pl.col("is_new_dataset"))
-    thresholds = [0.5, 0.6, 0.65, 0.7, 0.71]
-    return {
-        str(t): new_datasets.filter(pl.col("confidence_score") >= t).height
-        for t in thresholds
-    }
 @lru_cache(maxsize=1)
@@ -359,6 +380,7 @@ async def home(
     df = get_dataframe()
     categories = get_categories()
     histogram_data = get_histogram_data()
     # Get stats
     total_papers = len(df)
@@ -372,6 +394,7 @@ async def home(
             "total_papers": total_papers,
             "new_dataset_count": new_dataset_count,
             "histogram_data": histogram_data,
             # Pass filter state for URL persistence
             "search": search or "",
             "search_type": search_type,

 Downloads Lance dataset from HuggingFace Hub and loads locally.
 """
+import math
 import re
 from datetime import date, timedelta
 from functools import lru_cache
     return Markup(highlighted)
+# Register custom filters
 templates.env.filters["highlight"] = highlight_search
+def confidence_fmt(score):
+    """Format a confidence score as a percentage string with one decimal place.
+
+    The score is multiplied by 1000, floored, and divided by 10, so the result is
+    truncated rather than rounded (e.g. 0.9995 renders as "99.9", never "100.0").
+    The returned string carries no "%" sign.
+    """
+    # Floor at tenths of a percent so high scores can never round up.
+    pct = math.floor(score * 1000) / 10
+    return f"{pct:.1f}"
+templates.env.filters["confidence"] = confidence_fmt
 # Dataset config
 DATASET_REPO = "librarian-bots/arxiv-cs-papers-lance"
 def get_lance_dataset():
     """Download dataset from HF Hub (cached) and return Lance connection."""
     if "ds" not in _lance_cache:
+        import os
+        # Use HF_HOME or /tmp for Spaces compatibility (./data not writable on Spaces)
+        cache_base = os.environ.get("HF_HOME", "/tmp/hf_cache")
+        local_dir = f"{cache_base}/arxiv-lance"
         print(f"Downloading dataset from {DATASET_REPO} to {local_dir}...")
         snapshot_download(
             DATASET_REPO,
 @lru_cache(maxsize=1)
+def get_confidence_options() -> list[dict]:
+    """Compute confidence filter options from the actual score distribution.
+
+    Percentile cut-offs are taken over the rows flagged ``is_new_dataset`` so the
+    dropdown adapts to whatever score range the data has.
+
+    Returns:
+        A list of ``{"value", "label", "count"}`` dicts: a fixed
+        "All new datasets" entry, then up to three percentile entries, then a
+        fixed "All papers" entry. ``value`` is the threshold as a string and
+        ``count`` is the number of scores at or above that same threshold.
+    """
     df = get_dataframe()
+    scores = df.filter(pl.col("is_new_dataset"))["confidence_score"]
+    options = [{"value": "0.5", "label": "All new datasets", "count": len(scores)}]
+    seen_values = set()
+    for pct_label, quantile in [("Top 75%", 0.25), ("Top 50%", 0.50), ("Top 25%", 0.75)]:
+        raw = scores.quantile(quantile)
+        if raw is None:
+            # Empty or all-null score column: there is no percentile to offer.
+            break
+        # Floor (not round) to the two decimals carried in the option value, so
+        # the threshold never exceeds the quantile and can never become "1.00"
+        # for scores just below 1.
+        value = f"{math.floor(float(raw) * 100) / 100:.2f}"
+        if value in seen_values:
+            # A narrow distribution collapses quantiles onto one value; emitting
+            # it twice would mark several <option>s as selected.
+            continue
+        seen_values.add(value)
+        # Count against exactly the threshold the option value encodes.
+        threshold = float(value)
+        count = scores.filter(scores >= threshold).len()
+        options.append({
+            "value": value,
+            "label": pct_label,
+            "count": int(count),
+        })
+    options.append({"value": "0", "label": "All papers", "count": len(df)})
+    return options
 @lru_cache(maxsize=1)
     df = get_dataframe()
     categories = get_categories()
     histogram_data = get_histogram_data()
+    confidence_options = get_confidence_options()
     # Get stats
     total_papers = len(df)
             "total_papers": total_papers,
             "new_dataset_count": new_dataset_count,
             "histogram_data": histogram_data,
+            "confidence_options": confidence_options,
             # Pass filter state for URL persistence
             "search": search or "",
             "search_type": search_type,

requirements.in ADDED Viewed

	@@ -0,0 +1,10 @@

+fastapi
+pylance
+polars
+cachetools
+python-dotenv
+Jinja2
+markupsafe
+huggingface-hub
+sentence-transformers
+uvicorn

requirements.txt CHANGED Viewed

@@ -1,11 +1,203 @@
-fastapi
-uvicorn[standard]
-jinja2
-markupsafe
-polars
-huggingface-hub[hf_transfer]
-python-dotenv
-cachetools
-pyarrow
-pylance>=0.20
-sentence-transformers>=3.0

+# This file was autogenerated by uv via the following command:
+#    uv pip compile requirements.in -o requirements.txt --python-platform linux --python-version 3.11
+annotated-doc==0.0.4
+    # via fastapi
+annotated-types==0.7.0
+    # via pydantic
+anyio==4.12.1
+    # via
+    #   httpx
+    #   starlette
+cachetools==7.0.0
+    # via -r requirements.in
+certifi==2026.1.4
+    # via
+    #   httpcore
+    #   httpx
+click==8.3.1
+    # via
+    #   typer-slim
+    #   uvicorn
+cuda-bindings==12.9.4
+    # via torch
+cuda-pathfinder==1.3.3
+    # via cuda-bindings
+fastapi==0.128.1
+    # via -r requirements.in
+filelock==3.20.3
+    # via
+    #   huggingface-hub
+    #   torch
+    #   transformers
+fsspec==2026.1.0
+    # via
+    #   huggingface-hub
+    #   torch
+h11==0.16.0
+    # via
+    #   httpcore
+    #   uvicorn
+hf-xet==1.2.0
+    # via huggingface-hub
+httpcore==1.0.9
+    # via httpx
+httpx==0.28.1
+    # via huggingface-hub
+huggingface-hub==1.4.0
+    # via
+    #   -r requirements.in
+    #   sentence-transformers
+    #   tokenizers
+    #   transformers
+idna==3.11
+    # via
+    #   anyio
+    #   httpx
+jinja2==3.1.6
+    # via
+    #   -r requirements.in
+    #   torch
+joblib==1.5.3
+    # via scikit-learn
+lance-namespace==0.4.5
+    # via pylance
+lance-namespace-urllib3-client==0.4.5
+    # via lance-namespace
+markupsafe==3.0.3
+    # via
+    #   -r requirements.in
+    #   jinja2
+mpmath==1.3.0
+    # via sympy
+networkx==3.6.1
+    # via torch
+numpy==2.4.2
+    # via
+    #   pylance
+    #   scikit-learn
+    #   scipy
+    #   sentence-transformers
+    #   transformers
+nvidia-cublas-cu12==12.8.4.1
+    # via
+    #   nvidia-cudnn-cu12
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-cuda-cupti-cu12==12.8.90
+    # via torch
+nvidia-cuda-nvrtc-cu12==12.8.93
+    # via torch
+nvidia-cuda-runtime-cu12==12.8.90
+    # via torch
+nvidia-cudnn-cu12==9.10.2.21
+    # via torch
+nvidia-cufft-cu12==11.3.3.83
+    # via torch
+nvidia-cufile-cu12==1.13.1.3
+    # via torch
+nvidia-curand-cu12==10.3.9.90
+    # via torch
+nvidia-cusolver-cu12==11.7.3.90
+    # via torch
+nvidia-cusparse-cu12==12.5.8.93
+    # via
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-cusparselt-cu12==0.7.1
+    # via torch
+nvidia-nccl-cu12==2.27.5
+    # via torch
+nvidia-nvjitlink-cu12==12.8.93
+    # via
+    #   nvidia-cufft-cu12
+    #   nvidia-cusolver-cu12
+    #   nvidia-cusparse-cu12
+    #   torch
+nvidia-nvshmem-cu12==3.4.5
+    # via torch
+nvidia-nvtx-cu12==12.8.90
+    # via torch
+packaging==26.0
+    # via
+    #   huggingface-hub
+    #   transformers
+polars==1.38.0
+    # via -r requirements.in
+polars-runtime-32==1.38.0
+    # via polars
+pyarrow==23.0.0
+    # via pylance
+pydantic==2.12.5
+    # via
+    #   fastapi
+    #   lance-namespace-urllib3-client
+pydantic-core==2.41.5
+    # via pydantic
+pylance==1.0.4
+    # via -r requirements.in
+python-dateutil==2.9.0.post0
+    # via lance-namespace-urllib3-client
+python-dotenv==1.2.1
+    # via -r requirements.in
+pyyaml==6.0.3
+    # via
+    #   huggingface-hub
+    #   transformers
+regex==2026.1.15
+    # via transformers
+safetensors==0.7.0
+    # via transformers
+scikit-learn==1.8.0
+    # via sentence-transformers
+scipy==1.17.0
+    # via
+    #   scikit-learn
+    #   sentence-transformers
+sentence-transformers==5.2.2
+    # via -r requirements.in
+shellingham==1.5.4
+    # via huggingface-hub
+six==1.17.0
+    # via python-dateutil
+starlette==0.50.0
+    # via fastapi
+sympy==1.14.0
+    # via torch
+threadpoolctl==3.6.0
+    # via scikit-learn
+tokenizers==0.22.2
+    # via transformers
+torch==2.10.0
+    # via sentence-transformers
+tqdm==4.67.3
+    # via
+    #   huggingface-hub
+    #   sentence-transformers
+    #   transformers
+transformers==5.0.0
+    # via sentence-transformers
+triton==3.6.0
+    # via torch
+typer-slim==0.21.1
+    # via
+    #   huggingface-hub
+    #   transformers
+typing-extensions==4.15.0
+    # via
+    #   anyio
+    #   fastapi
+    #   huggingface-hub
+    #   lance-namespace-urllib3-client
+    #   pydantic
+    #   pydantic-core
+    #   sentence-transformers
+    #   starlette
+    #   torch
+    #   typer-slim
+    #   typing-inspection
+typing-inspection==0.4.2
+    # via pydantic
+urllib3==2.6.3
+    # via lance-namespace-urllib3-client
+uvicorn==0.40.0
+    # via -r requirements.in

templates/base.html CHANGED Viewed

@@ -11,7 +11,12 @@
     <!-- HTMX -->
     <script src="https://unpkg.com/htmx.org@1.9.12"></script>
     <style>
         /* Loading indicator - subtle */
         .htmx-indicator { display: none; }
         .htmx-request .htmx-indicator,
@@ -35,7 +40,7 @@
     <footer class="border-t border-gray-100 mt-12">
         <div class="max-w-3xl mx-auto px-4 py-4 text-gray-400 text-xs">
-            <a href="https://huggingface.co/datasets/davanstrien/my-classified-papers" class="hover:text-gray-600">Data source</a>
             <span class="mx-2">·</span>
             <a href="https://huggingface.co/davanstrien/ModernBERT-base-is-new-arxiv-dataset" class="hover:text-gray-600">Model</a>
         </div>

     <!-- HTMX -->
     <script src="https://unpkg.com/htmx.org@1.9.12"></script>
+    <!-- Alpine.js (for expand/collapse); pinned to an exact version, like HTMX above, so a new 3.x release cannot change behavior unannounced -->
+    <script defer src="https://unpkg.com/alpinejs@3.14.9/dist/cdn.min.js"></script>
     <style>
+        /* Alpine.js cloak (hide until loaded) */
+        [x-cloak] { display: none !important; }
         /* Loading indicator - subtle */
         .htmx-indicator { display: none; }
         .htmx-request .htmx-indicator,
     <footer class="border-t border-gray-100 mt-12">
         <div class="max-w-3xl mx-auto px-4 py-4 text-gray-400 text-xs">
+            <a href="https://huggingface.co/datasets/librarian-bots/arxiv-cs-papers-lance" class="hover:text-gray-600">Data source</a>
             <span class="mx-2">·</span>
             <a href="https://huggingface.co/davanstrien/ModernBERT-base-is-new-arxiv-dataset" class="hover:text-gray-600">Model</a>
         </div>

templates/index.html CHANGED Viewed

@@ -84,9 +84,11 @@
                 hx-include="#filter-form, #search-input, #category-select, #since-filter, #sort-select, #search-type-toggle"
                 hx-indicator="#loading-indicator"
                 hx-push-url="true">
-            <option value="0.5" {% if min_confidence == '0.5' %}selected{% endif %}>New datasets only</option>
-            <option value="0.6" {% if min_confidence == '0.6' %}selected{% endif %}>Higher confidence</option>
-            <option value="0" {% if min_confidence == '0' %}selected{% endif %}>All papers</option>
         </select>
         <!-- Since filter dropdown -->

                 hx-include="#filter-form, #search-input, #category-select, #since-filter, #sort-select, #search-type-toggle"
                 hx-indicator="#loading-indicator"
                 hx-push-url="true">
+            {% for opt in confidence_options %}
+            <option value="{{ opt.value }}" {% if min_confidence == opt.value %}selected{% endif %}>
+                {{ opt.label }}
+            </option>
+            {% endfor %}
         </select>
         <!-- Since filter dropdown -->

templates/partials/paper_card.html CHANGED Viewed

@@ -36,7 +36,7 @@
             </span>
         </span>
         <span class="text-gray-400 inline-flex items-center gap-1">
-            {{ "%.0f"|format(paper.confidence_score * 100) }}% conf.
             <span class="cursor-help" title="Model confidence this paper introduces a new dataset">
                 <svg class="w-3.5 h-3.5 text-gray-300" fill="none" stroke="currentColor" viewBox="0 0 24 24">
                     <circle cx="12" cy="12" r="10" stroke-width="1.5"></circle>
@@ -47,7 +47,7 @@
         </span>
         {% else %}
         <span class="{% if paper.confidence_score < 0.8 %}text-gray-400{% else %}text-gray-500{% endif %} inline-flex items-center gap-1">
-            {{ "%.0f"|format(paper.confidence_score * 100) }}% conf.
             <span class="cursor-help" title="Model confidence this paper introduces a new dataset">
                 <svg class="w-3.5 h-3.5 {% if paper.confidence_score < 0.8 %}text-gray-300{% else %}text-gray-400{% endif %}" fill="none" stroke="currentColor" viewBox="0 0 24 24">
                     <circle cx="12" cy="12" r="10" stroke-width="1.5"></circle>
@@ -59,12 +59,34 @@
         {% endif %}
     </div>
-    <!-- Abstract (truncated) -->
     <p class="mt-2 text-gray-600 text-sm leading-relaxed">
-        {% if search %}
-            {{ paper.abstract[:400]|highlight(search) }}{% if paper.abstract|length > 400 %}...{% endif %}
-        {% else %}
-            {{ paper.abstract[:400] }}{% if paper.abstract|length > 400 %}...{% endif %}
-        {% endif %}
     </p>
 </article>

             </span>
         </span>
         <span class="text-gray-400 inline-flex items-center gap-1">
+            {{ paper.confidence_score|confidence }}% conf.
             <span class="cursor-help" title="Model confidence this paper introduces a new dataset">
                 <svg class="w-3.5 h-3.5 text-gray-300" fill="none" stroke="currentColor" viewBox="0 0 24 24">
                     <circle cx="12" cy="12" r="10" stroke-width="1.5"></circle>
         </span>
         {% else %}
         <span class="{% if paper.confidence_score < 0.8 %}text-gray-400{% else %}text-gray-500{% endif %} inline-flex items-center gap-1">
+            {{ paper.confidence_score|confidence }}% conf.
             <span class="cursor-help" title="Model confidence this paper introduces a new dataset">
                 <svg class="w-3.5 h-3.5 {% if paper.confidence_score < 0.8 %}text-gray-300{% else %}text-gray-400{% endif %}" fill="none" stroke="currentColor" viewBox="0 0 24 24">
                     <circle cx="12" cy="12" r="10" stroke-width="1.5"></circle>
         {% endif %}
     </div>
+    <!-- Authors -->
+    {% if paper.authors %}
+    {% set author_list = paper.authors.split(', ') %}
+    <p class="mt-1 text-sm text-gray-500">
+        {{ author_list[:3]|join(', ') }}{% if author_list|length > 3 %} <span class="text-gray-400">et al.</span>{% endif %}
+    </p>
+    {% endif %}
+    <!-- Abstract (expandable) -->
+    {% if paper.abstract|length > 400 %}
+    <div class="mt-2" x-data="{ expanded: false }">
+        <p class="text-gray-600 text-sm leading-relaxed">
+            <span x-show="!expanded">
+                {% if search %}{{ paper.abstract[:400]|highlight(search) }}{% else %}{{ paper.abstract[:400] }}{% endif %}…
+            </span>
+            <span x-show="expanded" x-cloak>
+                {% if search %}{{ paper.abstract|highlight(search) }}{% else %}{{ paper.abstract }}{% endif %}
+            </span>
+            <button @click="expanded = !expanded"
+                    class="ml-1 text-gray-400 hover:text-gray-600 text-sm">
+                <span x-show="!expanded">Show more</span>
+                <span x-show="expanded" x-cloak>Show less</span>
+            </button>
+        </p>
+    </div>
+    {% else %}
     <p class="mt-2 text-gray-600 text-sm leading-relaxed">
+        {% if search %}{{ paper.abstract|highlight(search) }}{% else %}{{ paper.abstract }}{% endif %}
     </p>
+    {% endif %}
 </article>