davanstrien HF Staff commited on
Commit
4cf63e7
·
verified ·
1 Parent(s): fc9caf8

Upload folder using huggingface_hub

Browse files
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ .env
2
+ data/
Dockerfile ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

WORKDIR /app

# Enable HF transfer for faster downloads
ENV HF_HUB_ENABLE_HF_TRANSFER=1

# Install uv for fast dependency management
RUN pip install --no-cache-dir uv

# Copy and install dependencies
COPY requirements.txt .
RUN uv pip install --system --no-cache -r requirements.txt

# Copy application
COPY . .

# Create non-root user for security. The app downloads the Lance dataset
# into ./data at runtime (see app.py get_lance_dataset), so that directory
# must exist and be writable by the non-root user — /app itself is root-owned.
RUN useradd -m -u 1000 user \
    && mkdir -p /app/data \
    && chown -R user:user /app/data
USER user

# HF Spaces expects port 7860
ENV PORT=7860
EXPOSE 7860

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,13 +1,33 @@
1
  ---
2
- title: New Datasets in Machine Learning
3
- emoji: 💽
4
- colorFrom: pink
5
- colorTo: red
6
- sdk: gradio
7
- sdk_version: 4.36.1
8
- app_file: app.py
9
  pinned: false
10
- python_version: 3.11.1
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: ArXiv New ML Datasets
3
+ emoji: 📚
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: docker
 
 
7
  pinned: false
8
+ license: mit
9
  ---
10
 
11
+ # ArXiv New ML Datasets
12
+
13
+ Browse **1.1M+ CS papers** from arXiv, with **50,000+ classified** as introducing new machine learning datasets.
14
+
15
+ ## Features
16
+
17
+ - **Keyword search** - Search titles and abstracts
18
+ - **Semantic search** - Find conceptually similar papers using vector embeddings
19
+ - **Filter** by arXiv category (cs.AI, cs.CV, cs.LG, etc.)
20
+ - **Infinite scroll** for smooth browsing
21
+ - Links to arXiv, PDF, and HF Papers
22
+
23
+ ## Data Source
24
+
25
+ Papers classified using [ModernBERT](https://huggingface.co/davanstrien/ModernBERT-base-is-new-arxiv-dataset). Embeddings from [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5).
26
+
27
+ Data from [librarian-bots/arxiv-cs-papers-lance](https://huggingface.co/datasets/librarian-bots/arxiv-cs-papers-lance). Updated weekly.
28
+
29
+ ## Tech Stack
30
+
31
+ - **Backend**: FastAPI + Polars + Lance
32
+ - **Frontend**: HTMX + Tailwind CSS
33
+ - **Vector Search**: Lance with IVF_PQ index
app.py CHANGED
@@ -1,205 +1,512 @@
1
- import os
 
 
 
2
 
3
- import arxiv
4
- import gradio as gr
5
- import pandas as pd
6
- from apscheduler.schedulers.background import BackgroundScheduler
7
- from cachetools import TTLCache, cached
8
- from setfit import SetFitModel
9
- from tqdm.auto import tqdm
10
- import stamina
11
- from arxiv import UnexpectedEmptyPageError, ArxivError
12
 
13
- os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 
 
 
 
 
 
 
 
 
14
 
15
- CACHE_TIME = 60 * 60 * 12 # 12 hours
16
- MAX_RESULTS = 300
17
 
 
 
 
18
 
19
- client = arxiv.Client(page_size=50, delay_seconds=3, num_retries=2)
20
 
 
 
 
 
21
 
22
- @cached(cache=TTLCache(maxsize=10, ttl=CACHE_TIME))
23
- def get_arxiv_result():
24
- return _get_arxiv_result()
25
 
 
 
 
 
 
 
 
26
 
27
- @stamina.retry(
28
- on=(ValueError, UnexpectedEmptyPageError, ArxivError), attempts=10, wait_max=60 * 15
29
- )
30
- def _get_arxiv_result():
31
- results = [
32
- {
33
- "title": result.title,
34
- "abstract": result.summary,
35
- "url": result.entry_id,
36
- "category": result.primary_category,
37
- "updated": result.updated,
38
- }
39
- for result in tqdm(
40
- client.results(
41
- arxiv.Search(
42
- query="ti:dataset",
43
- max_results=MAX_RESULTS,
44
- sort_by=arxiv.SortCriterion.SubmittedDate,
45
- )
46
- ),
47
- total=MAX_RESULTS,
 
 
 
 
 
 
48
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  ]
50
- if len(results) > 1:
51
- return results
52
- else:
53
- raise ValueError("No results found")
54
- # return [
55
- # {
56
- # "title": result.title,
57
- # "abstract": result.summary,
58
- # "url": result.entry_id,
59
- # "category": result.primary_category,
60
- # "updated": result.updated,
61
- # }
62
- # for result in tqdm(search.results(), total=MAX_RESULTS)
63
- # ]
64
-
65
-
66
- def load_model():
67
- return SetFitModel.from_pretrained("librarian-bots/is_new_dataset_teacher_model")
68
-
69
-
70
- def format_row_for_model(row):
71
- return f"TITLE: {row['title']} \n\nABSTRACT: {row['abstract']}"
72
-
73
-
74
- int2label = {0: "new_dataset", 1: "not_new_dataset"}
75
-
76
-
77
- def get_predictions(data: list[dict], model=None, batch_size=128):
78
- if model is None:
79
- model = load_model()
80
- predictions = []
81
- for i in tqdm(range(0, len(data), batch_size)):
82
- batch = data[i : i + batch_size]
83
- text_inputs = [format_row_for_model(row) for row in batch]
84
- batch_predictions = model.predict_proba(text_inputs)
85
- for j, row in enumerate(batch):
86
- prediction = batch_predictions[j]
87
- row["prediction"] = int2label[int(prediction.argmax())]
88
- row["probability"] = float(prediction.max())
89
- predictions.append(row)
90
- return predictions
91
-
92
-
93
- def create_markdown(row):
94
- title = row["title"]
95
- abstract = row["abstract"]
96
- arxiv_id = row["arxiv_id"]
97
- hub_paper_url = f"https://huggingface.co/papers/{arxiv_id}"
98
- updated = row["updated"]
99
- updated = updated.strftime("%Y-%m-%d")
100
- broad_category = row["broad_category"]
101
- category = row["category"]
102
- return f""" <h2> {title} </h2> Updated: {updated}
103
- | Category: {broad_category} | Subcategory: {category} |
104
- \n\n{abstract}
105
- \n\n [Hugging Face Papers page]({hub_paper_url})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  """
 
 
 
 
 
 
 
 
 
 
 
 
107
 
 
 
 
 
108
 
109
- @cached(cache=TTLCache(maxsize=10, ttl=CACHE_TIME))
110
- def prepare_data():
111
- print("Downloading arxiv results...")
112
- arxiv_results = get_arxiv_result()
113
- print("loading model...")
114
- model = load_model()
115
- print("Making predictions...")
116
- predictions = get_predictions(arxiv_results, model=model)
117
- df = pd.DataFrame(predictions)
118
- df.loc[:, "arxiv_id"] = df["url"].str.extract(r"(\d+\.\d+)")
119
- df.loc[:, "broad_category"] = df["category"].str.split(".").str[0]
120
- df.loc[:, "markdown"] = df.apply(create_markdown, axis=1)
121
  return df
122
 
123
 
124
- all_possible_arxiv_categories = sorted(prepare_data().category.unique().tolist())
125
- broad_categories = sorted(prepare_data().broad_category.unique().tolist())
126
-
127
-
128
- # @list_cacheable
129
- def create_markdown_summary(categories=None, new_only=True, narrow_categories=None):
130
- df = prepare_data()
131
- if new_only:
132
- df = df[df["prediction"] == "new_dataset"]
133
- if narrow_categories is not None:
134
- df = df[df["category"].isin(narrow_categories)]
135
- if categories is not None and not narrow_categories:
136
- df = prepare_data()
137
- if new_only:
138
- df = df[df["prediction"] == "new_dataset"]
139
- df = df[df["broad_category"].isin(categories)]
140
- number_of_results = len(df)
141
- results = (
142
- "<h1 style='text-align: center'> arXiv papers related to datasets</h1> \n\n"
143
- )
144
- results += f"Number of results: {number_of_results}\n\n"
145
- results += "\n\n<br>".join(df["markdown"].tolist())
146
- return results
 
 
147
 
148
 
149
- scheduler = BackgroundScheduler()
150
- scheduler.add_job(prepare_data, "cron", hour=3, minute=30)
151
- scheduler.start()
 
 
 
 
 
152
 
153
- description = """This Space shows recent papers on arXiv that are *likely* to be papers introducing new datasets related to machine learning. \n\n
154
- The Space works by:
155
- - searching for papers on arXiv with the term `dataset` in the title + "machine learning" in the abstract
156
- - passing the abstract and title of the papers to a machine learning model that predicts if the paper is introducing a new dataset or not
157
-
158
- This Space is a work in progress. The model is not perfect, and the search query is not perfect. If you have suggestions for how to improve this Space, please open a Discussion.\n\n"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
 
 
 
 
 
 
 
 
160
 
161
- with gr.Blocks() as demo:
162
- gr.Markdown(
163
- "<h1 style='text-align: center'> &#x2728;New Datasets in Machine Learning "
164
- " &#x2728; </h1>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  )
166
- gr.Markdown(description)
167
- with gr.Row():
168
- broad_categories = gr.Dropdown(
169
- choices=broad_categories,
170
- label="Broad arXiv Category",
171
- multiselect=True,
172
- value="cs",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  )
174
- with gr.Accordion("Advanced Options", open=False):
175
- gr.Markdown(
176
- "Narrow by arXiv categories. **Note** this will take precedence over the"
177
- " broad category selection."
178
  )
179
- narrow_categories = gr.Dropdown(
180
- choices=all_possible_arxiv_categories,
181
- value=None,
182
- multiselect=True,
183
- label="Narrow arXiv Category",
 
 
 
 
184
  )
185
- gr.ClearButton(narrow_categories, "Clear Narrow Categories", size="sm")
186
- with gr.Row():
187
- new_only = gr.Checkbox(True, label="New Datasets Only", interactive=True)
188
- results = gr.Markdown(create_markdown_summary())
189
- broad_categories.change(
190
- create_markdown_summary,
191
- inputs=[broad_categories, new_only, narrow_categories],
192
- outputs=results,
193
- )
194
- narrow_categories.change(
195
- create_markdown_summary,
196
- inputs=[broad_categories, new_only, narrow_categories],
197
- outputs=results,
198
- )
199
- new_only.change(
200
- create_markdown_summary,
201
- [broad_categories, new_only, narrow_categories],
202
- results,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
 
205
- demo.launch()
 
1
+ """
2
+ FastAPI + HTMX app for browsing arxiv papers with new ML datasets.
3
+ Downloads Lance dataset from HuggingFace Hub and loads locally.
4
+ """
5
 
6
+ import re
7
+ from datetime import date, timedelta
8
+ from functools import lru_cache
9
+ from typing import Optional
10
+ from urllib.parse import urlencode
 
 
 
 
11
 
12
+ import lance
13
+ import polars as pl
14
+ from cachetools import TTLCache
15
+ from dotenv import load_dotenv
16
+ from fastapi import FastAPI, Query, Request
17
+ from fastapi.responses import HTMLResponse, RedirectResponse
18
+ from fastapi.staticfiles import StaticFiles
19
+ from fastapi.templating import Jinja2Templates
20
+ from huggingface_hub import snapshot_download
21
+ from markupsafe import Markup
22
 
23
+ # Load .env file for local development (HF_TOKEN)
24
+ load_dotenv()
25
 
26
+ app = FastAPI(title="ArXiv New ML Datasets")
27
+ app.mount("/static", StaticFiles(directory="static"), name="static")
28
+ templates = Jinja2Templates(directory="templates")
29
 
 
30
 
31
def highlight_search(text: str, search: str) -> Markup:
    """Highlight occurrences of *search* in *text* with a yellow <mark> span.

    The surrounding text is always HTML-escaped so abstracts cannot inject
    markup (the returned Markup bypasses Jinja autoescaping). Matching is
    done on the *raw* text before escaping, so search terms containing
    characters like '&' or '<' still match, and terms such as "amp" cannot
    accidentally match inside escaped entities like "&amp;".
    """
    # Local import mirrors the original module layout.
    import html

    if not text:
        return Markup("")
    raw = str(text)
    if not search:
        return Markup(html.escape(raw))

    # Capture group makes re.split interleave matches at odd indices.
    pattern = re.compile(f"({re.escape(search)})", re.IGNORECASE)
    pieces = []
    for i, piece in enumerate(pattern.split(raw)):
        escaped = html.escape(piece)
        if i % 2:  # odd index -> a matched search term
            pieces.append(
                f'<mark class="bg-yellow-200 px-0.5 rounded">{escaped}</mark>'
            )
        else:
            pieces.append(escaped)
    return Markup("".join(pieces))
47
 
48
+
49
+ # Register custom filter
50
+ templates.env.filters["highlight"] = highlight_search
51
+
52
+ # Dataset config
53
+ DATASET_REPO = "librarian-bots/arxiv-cs-papers-lance"
54
+
55
+ # Cache for dataset (reload every 6 hours)
56
+ _dataset_cache: TTLCache = TTLCache(maxsize=1, ttl=60 * 60 * 6)
57
+
58
+ # Cache for Lance dataset connection (for vector search)
59
+ _lance_cache: dict = {}
60
+
61
+ # Cache for embedding model (lazy loaded on first semantic search)
62
+ _model_cache: dict = {}
63
+
64
+
65
def get_lance_dataset():
    """Return the Lance dataset handle, downloading it from HF Hub once.

    The open dataset is memoized in ``_lance_cache`` under the "ds" key.
    """
    dataset = _lance_cache.get("ds")
    if dataset is None:
        # local_dir yields real files instead of symlinks; Lance needs real files.
        local_dir = "./data/arxiv-lance"
        print(f"Downloading dataset from {DATASET_REPO} to {local_dir}...")
        snapshot_download(
            DATASET_REPO,
            repo_type="dataset",
            local_dir=local_dir,
        )
        lance_path = f"{local_dir}/data/train.lance"
        print(f"Loading Lance dataset from {lance_path}")
        dataset = lance.dataset(lance_path)
        _lance_cache["ds"] = dataset
    return dataset
80
+
81
+
82
def get_embedding_model():
    """Return the sentence-embedding model, loading it lazily on first use."""
    model = _model_cache.get("model")
    if model is None:
        # Deferred import: the heavy dependency is only needed for semantic search.
        from sentence_transformers import SentenceTransformer

        print("Loading embedding model...")
        model = SentenceTransformer("BAAI/bge-base-en-v1.5")
        _model_cache["model"] = model
        print("Embedding model loaded!")
    return model
90
+
91
+
92
def get_dataframe() -> pl.DataFrame:
    """Return the papers table as a Polars DataFrame (TTL-cached, 6h)."""
    try:
        return _dataset_cache["df"]
    except KeyError:
        pass

    ds = get_lance_dataset()  # triggers the HF Hub download when needed
    # Embedding columns are deliberately excluded to keep memory usage down.
    wanted = [
        "id", "title", "abstract", "categories", "update_date",
        "authors", "is_new_dataset", "confidence_score",
    ]
    frame = pl.from_arrow(ds.to_table(columns=wanted))
    _dataset_cache["df"] = frame
    print(f"Loaded {len(frame):,} papers")
    return frame
109
+
110
+
111
@lru_cache(maxsize=1)
def get_categories() -> list[str]:
    """Return the common ML arXiv categories that actually occur in the data.

    The primary category is the first space-separated token of each paper's
    ``categories`` field; the result is restricted to a curated ML list.
    """
    present = set(
        get_dataframe()
        .select(pl.col("categories").str.split(" ").list.first().alias("cat"))
        .unique()
        .sort("cat")
        .to_series()
        .to_list()
    )
    wanted = ["cs.AI", "cs.CL", "cs.CV", "cs.LG", "cs.NE", "cs.IR", "cs.RO", "stat.ML"]
    return [cat for cat in wanted if cat in present]
126
+
127
+
128
@lru_cache(maxsize=1)
def get_confidence_counts() -> dict[str, int]:
    """Count new-dataset papers at each confidence threshold.

    Thresholds chosen from the observed distribution (avg ~70% confidence);
    used by the Tufte-style confidence filter.
    """
    candidates = get_dataframe().filter(pl.col("is_new_dataset"))
    return {
        str(threshold): candidates.filter(
            pl.col("confidence_score") >= threshold
        ).height
        for threshold in (0.5, 0.6, 0.65, 0.7, 0.71)
    }
141
+
142
+
143
@lru_cache(maxsize=1)
def get_histogram_data() -> dict:
    """Build confidence-distribution data for the histogram display.

    The bin range is derived from the actual data (snapped to 5% steps and
    widened to at least a 25% span). Each bin carries total and new-dataset
    counts plus heights normalized to the tallest bin; the 50% line marks
    the classifier's prediction boundary.

    Returns a dict with "bins", "min_pct", "max_pct", "total_papers", and
    "new_dataset_count".
    """
    df = get_dataframe()
    all_papers = df.select("confidence_score", "is_new_dataset")

    # Derive the display range from the data, rounded to 5% boundaries.
    actual_min = float(all_papers["confidence_score"].min())
    actual_max = float(all_papers["confidence_score"].max())
    min_pct = max(0, (int(actual_min * 20) / 20))  # floor to 5%
    max_pct = min(1, ((int(actual_max * 20) + 1) / 20))  # ceil to 5%

    # Ensure a minimum 25% span so the chart stays usable.
    if max_pct - min_pct < 0.25:
        center = (min_pct + max_pct) / 2
        min_pct = max(0, center - 0.125)
        max_pct = min(1, center + 0.125)

    # 25 bins gives good granularity.
    num_bins = 25
    bin_width = (max_pct - min_pct) / num_bins

    bins = []
    for i in range(num_bins):
        bin_start = min_pct + i * bin_width
        bin_end = min_pct + (i + 1) * bin_width

        # Filter the table once per bin, then count the new-dataset subset
        # within it — previously this ran two full-table scans per bin.
        in_bin = all_papers.filter(
            (pl.col("confidence_score") >= bin_start)
            & (pl.col("confidence_score") < bin_end)
        )
        count = in_bin.height
        new_dataset_count = in_bin.filter(pl.col("is_new_dataset")).height

        bins.append({
            "bin_start": round(bin_start, 3),
            "bin_end": round(bin_end, 3),
            "bin_pct": int(bin_start * 100),
            "count": count,
            "new_dataset_count": new_dataset_count,
        })

    # Normalize bar heights so the tallest bin renders at 100%.
    max_count = max(b["count"] for b in bins) if bins else 1
    for b in bins:
        b["height_pct"] = int((b["count"] / max_count) * 100) if max_count > 0 else 0
        b["new_height_pct"] = int((b["new_dataset_count"] / max_count) * 100) if max_count > 0 else 0

    # Cumulative count of papers at or above each bin's start threshold.
    total_so_far = all_papers.height
    for b in bins:
        b["papers_above"] = total_so_far
        total_so_far -= b["count"]

    return {
        "bins": bins,
        "min_pct": round(min_pct, 2),
        "max_pct": round(max_pct, 2),
        "total_papers": all_papers.height,
        "new_dataset_count": all_papers.filter(pl.col("is_new_dataset")).height,
    }
220
+
221
+
222
def parse_since(since: str) -> Optional[date]:
    """Translate a relative-time token into a cutoff date.

    Recognized tokens: "1m" (30 days), "6m" (180 days), "1y" (365 days).
    Empty or unrecognized input means "all time" and yields None.
    """
    days = {"1m": 30, "6m": 180, "1y": 365}.get(since or "")
    if days is None:
        return None
    return date.today() - timedelta(days=days)
234
+
235
+
236
def filter_papers(
    df: pl.DataFrame,
    category: Optional[str] = None,
    search: Optional[str] = None,
    min_confidence: float = 0.5,
    since: Optional[str] = None,
) -> pl.DataFrame:
    """Apply the active filters to the papers dataframe.

    The confidence threshold controls which papers are shown:
    - Papers with is_new_dataset=True have confidence >= 0.5
    - A threshold of 0 shows all papers
    - A threshold >= 0.5 effectively shows only new_dataset papers
    """
    predicates: list[pl.Expr] = []

    if min_confidence > 0:
        predicates.append(pl.col("confidence_score") >= min_confidence)

    if category:
        predicates.append(pl.col("categories").str.contains(category))

    if search:
        needle = search.lower()
        predicates.append(
            pl.col("title").str.to_lowercase().str.contains(needle)
            | pl.col("abstract").str.to_lowercase().str.contains(needle)
        )

    # Relative-date cutoff ("1m"/"6m"/"1y"); None means all time.
    cutoff = parse_since(since)
    if cutoff:
        predicates.append(pl.col("update_date") >= cutoff)

    for predicate in predicates:
        df = df.filter(predicate)
    return df
269
 
270
 
271
def paginate_papers(
    df: pl.DataFrame,
    page: int = 1,
    per_page: int = 20,
    sort: str = "date",
) -> tuple[pl.DataFrame, bool]:
    """Sort and paginate papers; return (page_df, has_more).

    Sort options:
    - "date": update_date desc, then confidence_score desc
    - "relevance": keep the incoming order (semantic search pre-sorts
      by similarity)
    """
    if sort == "date":
        ordered = df.sort(["update_date", "confidence_score"], descending=[True, True])
    else:
        ordered = df  # already ordered by similarity for semantic results

    offset = (page - 1) * per_page
    # Fetch one extra row to detect whether a further page exists.
    window = ordered.slice(offset, per_page + 1)
    return window.head(per_page), len(window) > per_page
296
 
297
 
298
def semantic_search(
    query: str,
    k: int = 100,
    category: Optional[str] = None,
    min_confidence: float = 0.5,
    since: Optional[str] = None,
) -> pl.DataFrame:
    """Vector-similarity search via Lance nearest-neighbor.

    Returns a DataFrame with a similarity_score column (0-1, higher is
    more similar), ordered by similarity.
    """
    embedding = get_embedding_model().encode(query).tolist()
    dataset = get_lance_dataset()

    # Assemble an SQL-style pre-filter (Lance accepts SQL-like syntax).
    clauses = []
    if min_confidence > 0:
        clauses.append(f"confidence_score >= {min_confidence}")
    if category:
        # Double up single quotes so the category is safe inside the literal.
        escaped = category.replace("'", "''")
        clauses.append(f"categories LIKE '%{escaped}%'")
    cutoff = parse_since(since)
    if cutoff:
        # Lance/DataFusion needs an explicit TIMESTAMP literal for dates.
        clauses.append(f"update_date >= TIMESTAMP '{cutoff.isoformat()} 00:00:00'")
    where = " AND ".join(clauses) if clauses else None

    # Nearest-neighbor scan; _distance is needed for similarity conversion.
    table = dataset.scanner(
        nearest={"column": "embedding", "q": embedding, "k": k},
        filter=where,
        columns=["id", "title", "abstract", "categories", "update_date",
                 "authors", "confidence_score", "_distance"]
    ).to_table()

    # BGE embeddings are L2-normalized, so the L2 distance lies in [0, 2];
    # map it onto a 0-1 similarity score.
    return (
        pl.from_arrow(table)
        .with_columns(
            (1 - pl.col("_distance") / 2).clip(0, 1).alias("similarity_score")
        )
        .drop("_distance")
    )
346
+
347
+
348
@app.get("/", response_class=HTMLResponse)
async def home(
    request: Request,
    search: Optional[str] = Query(None),
    search_type: str = Query("keyword"),
    category: Optional[str] = Query(None),
    min_confidence: str = Query("0.5"),  # string so the template sees the exact value
    since: Optional[str] = Query(None),
    sort: str = Query("date"),
):
    """Render the home page, seeding initial filter state from the URL."""
    df = get_dataframe()

    context = {
        "request": request,
        "categories": get_categories(),
        "total_papers": len(df),
        "new_dataset_count": df.filter(pl.col("is_new_dataset")).height,
        "histogram_data": get_histogram_data(),
        # Echo the filters back so a shared URL restores the same view.
        "search": search or "",
        "search_type": search_type,
        "category": category or "",
        "min_confidence": min_confidence,
        "since": since or "",
        "sort": sort,
    }
    return templates.TemplateResponse("index.html", context)
384
+
385
+
386
@app.get("/papers", response_class=HTMLResponse)
async def get_papers(
    request: Request,
    page: int = Query(1, ge=1),
    per_page: int = Query(20, ge=1, le=100),
    category: Optional[str] = Query(None),
    search: Optional[str] = Query(None),
    min_confidence: float = Query(0.5, ge=0, le=1),
    search_type: str = Query("keyword"),  # "keyword" or "semantic"
    sort: str = Query("date"),  # "date" or "relevance"
    since: Optional[str] = Query(None),  # "1m", "6m", "1y", or None for all
):
    """Return a paginated, filtered paper list as an HTMX HTML partial.

    Direct (non-HTMX) browser visits are redirected to "/" with the same
    query string, since this endpoint only renders partials.
    """
    if "HX-Request" not in request.headers:
        qs = str(request.url.query)
        return RedirectResponse(url=f"/?{qs}" if qs else "/", status_code=302)

    if search and search_type == "semantic":
        # Vector search returns results already ordered by similarity.
        filtered_df = semantic_search(
            query=search,
            k=per_page * 5,  # over-fetch so later pages still have data
            category=category,
            min_confidence=min_confidence,
            since=since,
        )
        # Allow explicit date sort; anything else falls back to relevance.
        effective_sort = "date" if sort == "date" else "relevance"
        page_df, has_more = paginate_papers(
            filtered_df, page=page, per_page=per_page, sort=effective_sort
        )
    else:
        # Keyword path: filter the cached dataframe, always sorted by date.
        filtered_df = filter_papers(
            get_dataframe(),
            category=category,
            search=search,
            min_confidence=min_confidence,
            since=since,
        )
        page_df, has_more = paginate_papers(
            filtered_df, page=page, per_page=per_page, sort="date"
        )

    # Build a short canonical URL ("/" not "/papers") for browser history;
    # defaults are omitted to keep it compact.
    params = {}
    if search:
        params["search"] = search
    if search_type != "keyword":
        params["search_type"] = search_type
    if category:
        params["category"] = category
    if min_confidence != 0.5:
        params["min_confidence"] = min_confidence
    if since:
        params["since"] = since
    if sort != "date":
        params["sort"] = sort
    push_url = "/?" + urlencode(params) if params else "/"

    response = templates.TemplateResponse(
        "partials/paper_list.html",
        {
            "request": request,
            "papers": page_df.to_dicts(),
            "page": page,
            "has_more": has_more,
            "category": category or "",
            "search": search or "",
            "min_confidence": min_confidence,
            "search_type": search_type,
            "sort": sort,
            "since": since or "",
            "total_filtered": len(filtered_df),
        },
    )
    # Instruct HTMX to push the clean URL into history.
    response.headers["HX-Push-Url"] = push_url
    return response
477
+
478
+
479
@app.get("/api/stats")
async def get_stats():
    """Expose dataset summary statistics as JSON."""
    df = get_dataframe()
    flagged = df.filter(pl.col("is_new_dataset"))
    dates = df["update_date"]
    return {
        "total_papers": len(df),
        "new_dataset_count": len(flagged),
        "avg_confidence": float(df["confidence_score"].mean()),
        "date_range": {
            "min": str(dates.min()),
            "max": str(dates.max()),
        },
    }
495
+
496
+
497
# Warm caches at boot so the first request doesn't pay the load cost.
# NOTE(review): @app.on_event is deprecated in newer FastAPI in favor of
# lifespan handlers — worth migrating eventually.
@app.on_event("startup")
async def startup_event():
    """Preload dataset and embedding model on startup."""
    print("Preloading dataset...")
    get_dataframe()
    print("Dataset loaded!")
    print("Preloading embedding model...")
    get_embedding_model()
    print("Embedding model loaded!")
507
+
508
+
509
if __name__ == "__main__":
    # Local-dev entry point; in Docker, uvicorn is launched via CMD instead.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
requirements.txt CHANGED
@@ -1,354 +1,11 @@
1
- # This file was autogenerated by uv via the following command:
2
- # uv pip compile requirements.in -o requirements.txt
3
- aiofiles==23.2.1
4
- # via gradio
5
- aiohttp==3.9.5
6
- # via
7
- # datasets
8
- # fsspec
9
- aiosignal==1.3.1
10
- # via aiohttp
11
- altair==5.3.0
12
- # via gradio
13
- annotated-types==0.7.0
14
- # via pydantic
15
- anyio==4.4.0
16
- # via
17
- # httpx
18
- # starlette
19
- # watchfiles
20
- apscheduler==3.10.4
21
- # via -r requirements.in
22
- arxiv==2.1.0
23
- # via -r requirements.in
24
- attrs==23.2.0
25
- # via
26
- # aiohttp
27
- # jsonschema
28
- # referencing
29
- cachetools==5.3.3
30
- # via -r requirements.in
31
- certifi==2024.6.2
32
- # via
33
- # httpcore
34
- # httpx
35
- # requests
36
- charset-normalizer==3.3.2
37
- # via requests
38
- click==8.1.7
39
- # via
40
- # typer
41
- # uvicorn
42
- contourpy==1.2.1
43
- # via matplotlib
44
- cycler==0.12.1
45
- # via matplotlib
46
- datasets==2.14.4
47
- # via
48
- # evaluate
49
- # setfit
50
- dill==0.3.7
51
- # via
52
- # datasets
53
- # evaluate
54
- # multiprocess
55
- dnspython==2.6.1
56
- # via email-validator
57
- email-validator==2.1.1
58
- # via fastapi
59
- evaluate==0.4.2
60
- # via setfit
61
- fastapi==0.111.0
62
- # via gradio
63
- fastapi-cli==0.0.4
64
- # via fastapi
65
- feedparser==6.0.10
66
- # via arxiv
67
- ffmpy==0.3.2
68
- # via gradio
69
- filelock==3.14.0
70
- # via
71
- # huggingface-hub
72
- # torch
73
- # transformers
74
- fonttools==4.53.0
75
- # via matplotlib
76
- frozenlist==1.4.1
77
- # via
78
- # aiohttp
79
- # aiosignal
80
- fsspec==2024.6.0
81
- # via
82
- # datasets
83
- # evaluate
84
- # gradio-client
85
- # huggingface-hub
86
- # torch
87
- gradio==4.36.1
88
- # via -r requirements.in
89
- gradio-client==1.0.1
90
- # via gradio
91
- h11==0.14.0
92
- # via
93
- # httpcore
94
- # uvicorn
95
- hf-transfer==0.1.6
96
- # via -r requirements.in
97
- httpcore==1.0.5
98
- # via httpx
99
- httptools==0.6.1
100
- # via uvicorn
101
- httpx==0.27.0
102
- # via
103
- # fastapi
104
- # gradio
105
- # gradio-client
106
- huggingface-hub==0.23.3
107
- # via
108
- # datasets
109
- # evaluate
110
- # gradio
111
- # gradio-client
112
- # sentence-transformers
113
- # setfit
114
- # tokenizers
115
- # transformers
116
- idna==3.7
117
- # via
118
- # anyio
119
- # email-validator
120
- # httpx
121
- # requests
122
- # yarl
123
- importlib-resources==6.4.0
124
- # via gradio
125
- jinja2==3.1.4
126
- # via
127
- # altair
128
- # fastapi
129
- # gradio
130
- # torch
131
- joblib==1.4.2
132
- # via scikit-learn
133
- jsonschema==4.22.0
134
- # via altair
135
- jsonschema-specifications==2023.12.1
136
- # via jsonschema
137
- kiwisolver==1.4.5
138
- # via matplotlib
139
- markdown-it-py==3.0.0
140
- # via rich
141
- markupsafe==2.1.5
142
- # via
143
- # gradio
144
- # jinja2
145
- matplotlib==3.9.0
146
- # via gradio
147
- mdurl==0.1.2
148
- # via markdown-it-py
149
- mpmath==1.3.0
150
- # via sympy
151
- multidict==6.0.5
152
- # via
153
- # aiohttp
154
- # yarl
155
- multiprocess==0.70.15
156
- # via
157
- # datasets
158
- # evaluate
159
- networkx==3.3
160
- # via torch
161
- numpy==1.26.4
162
- # via
163
- # altair
164
- # contourpy
165
- # datasets
166
- # evaluate
167
- # gradio
168
- # matplotlib
169
- # pandas
170
- # pyarrow
171
- # scikit-learn
172
- # scipy
173
- # sentence-transformers
174
- # transformers
175
- orjson==3.10.3
176
- # via
177
- # fastapi
178
- # gradio
179
- packaging==24.1
180
- # via
181
- # altair
182
- # datasets
183
- # evaluate
184
- # gradio
185
- # gradio-client
186
- # huggingface-hub
187
- # matplotlib
188
- # setfit
189
- # transformers
190
- pandas==2.2.2
191
- # via
192
- # altair
193
- # datasets
194
- # evaluate
195
- # gradio
196
- pillow==10.3.0
197
- # via
198
- # gradio
199
- # matplotlib
200
- # sentence-transformers
201
- pyarrow==16.1.0
202
- # via datasets
203
- pydantic==2.7.3
204
- # via
205
- # fastapi
206
- # gradio
207
- pydantic-core==2.18.4
208
- # via pydantic
209
- pydub==0.25.1
210
- # via gradio
211
- pygments==2.18.0
212
- # via rich
213
- pyparsing==3.1.2
214
- # via matplotlib
215
- python-dateutil==2.9.0.post0
216
- # via
217
- # matplotlib
218
- # pandas
219
- python-dotenv==1.0.1
220
- # via uvicorn
221
- python-multipart==0.0.9
222
- # via
223
- # fastapi
224
- # gradio
225
- pytz==2024.1
226
- # via
227
- # apscheduler
228
- # pandas
229
- pyyaml==6.0.1
230
- # via
231
- # datasets
232
- # gradio
233
- # huggingface-hub
234
- # transformers
235
- # uvicorn
236
- referencing==0.35.1
237
- # via
238
- # jsonschema
239
- # jsonschema-specifications
240
- regex==2024.5.15
241
- # via transformers
242
- requests==2.31.0
243
- # via
244
- # arxiv
245
- # datasets
246
- # evaluate
247
- # huggingface-hub
248
- # transformers
249
- rich==13.7.1
250
- # via typer
251
- rpds-py==0.18.1
252
- # via
253
- # jsonschema
254
- # referencing
255
- ruff==0.4.8
256
- # via gradio
257
- safetensors==0.4.3
258
- # via transformers
259
- scikit-learn==1.2.2
260
- # via
261
- # -r requirements.in
262
- # sentence-transformers
263
- # setfit
264
- scipy==1.13.1
265
- # via
266
- # scikit-learn
267
- # sentence-transformers
268
- semantic-version==2.10.0
269
- # via gradio
270
- sentence-transformers==3.0.1
271
- # via setfit
272
- setfit==1.0.3
273
- # via -r requirements.in
274
- sgmllib3k==1.0.0
275
- # via feedparser
276
- shellingham==1.5.4
277
- # via typer
278
- six==1.16.0
279
- # via
280
- # apscheduler
281
- # python-dateutil
282
- sniffio==1.3.1
283
- # via
284
- # anyio
285
- # httpx
286
- stamina==24.2.0
287
- # via -r requirements.in
288
- starlette==0.37.2
289
- # via fastapi
290
- sympy==1.12.1
291
- # via torch
292
- tenacity==8.3.0
293
- # via stamina
294
- threadpoolctl==3.5.0
295
- # via scikit-learn
296
- tokenizers==0.19.1
297
- # via transformers
298
- tomlkit==0.12.0
299
- # via gradio
300
- toolz==0.12.1
301
- # via altair
302
- torch==2.3.1
303
- # via sentence-transformers
304
- tqdm==4.66.4
305
- # via
306
- # datasets
307
- # evaluate
308
- # huggingface-hub
309
- # sentence-transformers
310
- # transformers
311
- transformers==4.41.2
312
- # via sentence-transformers
313
- typer==0.12.3
314
- # via
315
- # fastapi-cli
316
- # gradio
317
- typing-extensions==4.12.2
318
- # via
319
- # fastapi
320
- # gradio
321
- # gradio-client
322
- # huggingface-hub
323
- # pydantic
324
- # pydantic-core
325
- # torch
326
- # typer
327
- tzdata==2024.1
328
- # via pandas
329
- tzlocal==5.2
330
- # via apscheduler
331
- ujson==5.10.0
332
- # via fastapi
333
- urllib3==2.2.1
334
- # via
335
- # gradio
336
- # requests
337
- uvicorn==0.30.1
338
- # via
339
- # fastapi
340
- # gradio
341
- uvloop==0.19.0
342
- # via uvicorn
343
- watchfiles==0.22.0
344
- # via uvicorn
345
- websockets==11.0.3
346
- # via
347
- # gradio-client
348
- # uvicorn
349
- xxhash==3.4.1
350
- # via
351
- # datasets
352
- # evaluate
353
- yarl==1.9.4
354
- # via aiohttp
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ jinja2
4
+ markupsafe
5
+ polars
6
+ huggingface-hub[hf_transfer]
7
+ python-dotenv
8
+ cachetools
9
+ pyarrow
10
+ pylance>=0.20
11
+ sentence-transformers>=3.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
static/styles.css ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* Custom styles - most styling is via Tailwind CDN */

/* Ensure smooth scrolling (e.g. for in-page anchor jumps) */
html {
    scroll-behavior: smooth;
}

/* Better focus styles: visible keyboard-focus ring on any focusable element */
:focus-visible {
    outline: 2px solid #3b82f6;
    outline-offset: 2px;
}
templates/base.html ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!-- Shared page chrome: Tailwind + HTMX includes, header, content block, footer -->
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{% block title %}Dataset Papers on ArXiv{% endblock %}</title>

    <!-- Tailwind CSS -->
    <script src="https://cdn.tailwindcss.com"></script>

    <!-- HTMX -->
    <script src="https://unpkg.com/htmx.org@1.9.12"></script>

    <style>
        /* Loading indicator - subtle */
        .htmx-indicator { display: none; }
        .htmx-request .htmx-indicator,
        .htmx-request.htmx-indicator { display: inline; }

        /* Content fades during load */
        .htmx-request #paper-list { opacity: 0.5; transition: opacity 0.15s; }
    </style>
</head>
<body class="bg-white min-h-screen text-gray-900">
    <header class="border-b border-gray-200">
        <div class="max-w-3xl mx-auto px-4 py-6">
            <h1 class="text-xl font-semibold">Dataset Papers on ArXiv</h1>
            <p class="text-sm text-gray-500 mt-1">CS papers predicted to introduce new ML datasets by <a href="https://huggingface.co/davanstrien/ModernBERT-base-is-new-arxiv-dataset" class="inline-flex items-center gap-1 text-gray-700 hover:text-blue-600 font-medium">this model <span class="text-base">🤗</span></a></p>
        </div>
    </header>

    <!-- Page-specific content is injected here by child templates -->
    <main class="max-w-3xl mx-auto px-4 py-6">
        {% block content %}{% endblock %}
    </main>

    <footer class="border-t border-gray-100 mt-12">
        <div class="max-w-3xl mx-auto px-4 py-4 text-gray-400 text-xs">
            <a href="https://huggingface.co/datasets/davanstrien/my-classified-papers" class="hover:text-gray-600">Data source</a>
            <span class="mx-2">·</span>
            <a href="https://huggingface.co/davanstrien/ModernBERT-base-is-new-arxiv-dataset" class="hover:text-gray-600">Model</a>
        </div>
    </footer>
</body>
</html>
templates/index.html ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
{# Main page: stats header, sticky filter bar, and the HTMX-driven paper list.
   Each control re-fetches /papers and hx-includes every OTHER control so the
   full filter state is always sent as query params. #}
{% extends "base.html" %}

{% block content %}
<div>
    <!-- Powered by -->
    <div class="text-xs text-gray-400 mb-4">
        Vector search powered by <a href="https://lancedb.github.io/lance/" class="underline hover:text-gray-600">Lance</a>
        &middot; Updated weekly via <a href="https://huggingface.co/docs/hub/en/spaces-run-with-hf-jobs" class="underline hover:text-gray-600">HF Jobs</a>
        &middot; <a href="https://huggingface.co/datasets/librarian-bots/arxiv-cs-papers-lance" class="underline hover:text-gray-600">Dataset</a>
    </div>

    <!-- Stats - minimal -->
    <div class="flex items-baseline gap-2 mb-6">
        <span class="text-3xl font-semibold text-gray-900">{{ "{:,}".format(new_dataset_count) }}</span>
        <span class="text-gray-500">papers with new datasets</span>
        <span class="text-gray-400 text-sm ml-auto">from {{ "{:,}".format(total_papers) }} total</span>
    </div>

    <!-- Filters - sticky on scroll -->
    <div class="sticky top-0 z-10 bg-white flex flex-wrap items-center gap-4 py-4 border-b border-gray-200 mb-6">
        <!-- Search (debounced 500ms; also fires on Enter and on the custom
             'histogramChange' event — NOTE(review): no dispatcher for
             'histogramChange' is visible in this template; confirm it is
             emitted elsewhere) -->
        <input type="search"
               name="search"
               id="search-input"
               placeholder="Search..."
               value="{{ search }}"
               class="flex-1 min-w-48 px-3 py-2 border-b border-gray-300 bg-transparent focus:border-gray-900 focus:outline-none"
               hx-get="/papers"
               hx-trigger="input changed delay:500ms, keyup[key=='Enter'], histogramChange"
               hx-target="#paper-list"
               hx-include="#filter-form, #category-select, #confidence-filter, #since-filter, #sort-select, #search-type-toggle"
               hx-indicator="#loading-indicator"
               hx-push-url="true">

        <!-- Search mode toggle: keyword (substring) vs semantic (vector) -->
        <div id="search-type-toggle" class="flex items-center gap-2 text-xs text-gray-500">
            <label class="flex items-center gap-1 cursor-pointer">
                <input type="radio" name="search_type" value="keyword" {% if search_type == 'keyword' %}checked{% endif %}
                       class="h-3 w-3"
                       hx-get="/papers"
                       hx-trigger="change"
                       hx-target="#paper-list"
                       hx-include="#filter-form, #search-input, #category-select, #confidence-filter, #since-filter, #sort-select"
                       hx-indicator="#loading-indicator"
                       hx-push-url="true">
                <span>Keyword</span>
            </label>
            <label class="flex items-center gap-1 cursor-pointer">
                <input type="radio" name="search_type" value="semantic" {% if search_type == 'semantic' %}checked{% endif %}
                       class="h-3 w-3"
                       hx-get="/papers"
                       hx-trigger="change"
                       hx-target="#paper-list"
                       hx-include="#filter-form, #search-input, #category-select, #confidence-filter, #since-filter, #sort-select"
                       hx-indicator="#loading-indicator"
                       hx-push-url="true">
                <span>Semantic</span>
            </label>
        </div>

        <!-- Category filter -->
        <select name="category"
                id="category-select"
                class="px-3 py-2 border-b border-gray-300 bg-transparent focus:border-gray-900 focus:outline-none text-gray-700"
                hx-get="/papers"
                hx-trigger="change"
                hx-target="#paper-list"
                hx-include="#filter-form, #search-input, #confidence-filter, #since-filter, #sort-select, #search-type-toggle"
                hx-indicator="#loading-indicator"
                hx-push-url="true">
            <option value="">All categories</option>
            {% for cat in categories %}
            <option value="{{ cat }}" {% if category == cat %}selected{% endif %}>{{ cat }}</option>
            {% endfor %}
        </select>

        <!-- Confidence filter dropdown (values compared as strings) -->
        <select name="min_confidence"
                id="confidence-filter"
                class="px-2 py-1 text-xs border-b border-gray-300 bg-transparent focus:border-gray-900 focus:outline-none text-gray-500 ml-auto"
                hx-get="/papers"
                hx-trigger="change"
                hx-target="#paper-list"
                hx-include="#filter-form, #search-input, #category-select, #since-filter, #sort-select, #search-type-toggle"
                hx-indicator="#loading-indicator"
                hx-push-url="true">
            <option value="0.5" {% if min_confidence == '0.5' %}selected{% endif %}>New datasets only</option>
            <option value="0.6" {% if min_confidence == '0.6' %}selected{% endif %}>Higher confidence</option>
            <option value="0" {% if min_confidence == '0' %}selected{% endif %}>All papers</option>
        </select>

        <!-- Since filter dropdown -->
        <select name="since"
                id="since-filter"
                class="px-2 py-1 text-xs border-b border-gray-300 bg-transparent focus:border-gray-900 focus:outline-none text-gray-500"
                hx-get="/papers"
                hx-trigger="change"
                hx-target="#paper-list"
                hx-include="#filter-form, #search-input, #category-select, #confidence-filter, #sort-select, #search-type-toggle"
                hx-indicator="#loading-indicator"
                hx-push-url="true">
            <option value="" {% if not since %}selected{% endif %}>All time</option>
            <option value="1m" {% if since == '1m' %}selected{% endif %}>Past month</option>
            <option value="6m" {% if since == '6m' %}selected{% endif %}>Past 6 months</option>
            <option value="1y" {% if since == '1y' %}selected{% endif %}>Past year</option>
        </select>

        <!-- Sort dropdown -->
        <select name="sort"
                id="sort-select"
                class="px-2 py-1 text-xs border-b border-gray-300 bg-transparent focus:border-gray-900 focus:outline-none text-gray-500"
                hx-get="/papers"
                hx-trigger="change"
                hx-target="#paper-list"
                hx-include="#filter-form, #search-input, #category-select, #confidence-filter, #since-filter, #search-type-toggle"
                hx-indicator="#loading-indicator"
                hx-push-url="true">
            <option value="date" {% if sort == 'date' %}selected{% endif %}>Newest first</option>
            <option value="relevance" {% if sort == 'relevance' %}selected{% endif %}>Relevance</option>
        </select>

        <!-- Loading indicator - subtle -->
        <span id="loading-indicator" class="htmx-indicator text-sm text-gray-400">Loading...</span>

        <!-- Hidden form for hx-include -->
        <form id="filter-form" class="hidden"></form>
    </div>

    <!-- Paper list: initial load fires once on page load with the current
         server-rendered filter state baked into the query string -->
    <div id="paper-list"
         hx-get="/papers?{% if search %}search={{ search|urlencode }}&{% endif %}search_type={{ search_type }}&{% if category %}category={{ category|urlencode }}&{% endif %}min_confidence={{ min_confidence }}&{% if since %}since={{ since }}&{% endif %}sort={{ sort }}"
         hx-trigger="load"
         hx-indicator="#loading-indicator">
        <div class="py-8 text-gray-400 text-sm">Loading papers...</div>
    </div>
</div>
{% endblock %}
templates/partials/paper_card.html ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
{# Renders one paper as a card: linked title, category badge, date,
   similarity/confidence indicators, and a truncated abstract.
   Expects `paper` in context; `search` / `search_type` are optional. #}
<article class="py-5 border-b border-gray-200">
    <!-- Title with paper icon; links to the HF Papers page for this arXiv id -->
    <h3 class="text-lg font-semibold text-gray-900 leading-tight">
        <a href="https://huggingface.co/papers/{{ paper.id }}"
           target="_blank"
           class="hover:text-blue-600 inline-flex items-start gap-2 group">
            <svg class="w-5 h-5 mt-0.5 text-gray-400 group-hover:text-blue-500 flex-shrink-0" fill="none" stroke="currentColor" viewBox="0 0 24 24">
                <path stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5" d="M9 12h6m-6 4h6m2 5H7a2 2 0 01-2-2V5a2 2 0 012-2h5.586a1 1 0 01.707.293l5.414 5.414a1 1 0 01.293.707V19a2 2 0 01-2 2z"></path>
            </svg>
            <span>{% if search %}{{ paper.title|highlight(search) }}{% else %}{{ paper.title }}{% endif %}</span>
        </a>
    </h3>

    <!-- Meta info - inline with category badge.
         Only the FIRST listed arXiv category is shown and colour-coded. -->
    {% set category = paper.categories.split(' ')[0] if paper.categories else '' %}
    {% set cat_colors = {
        'cs.CV': 'bg-purple-100 text-purple-700',
        'cs.AI': 'bg-blue-100 text-blue-700',
        'cs.LG': 'bg-green-100 text-green-700',
        'cs.CL': 'bg-orange-100 text-orange-700',
        'cs.NE': 'bg-pink-100 text-pink-700'
    } %}
    {% set badge_class = cat_colors.get(category, 'bg-gray-100 text-gray-600') %}
    <div class="mt-2 flex items-center gap-3 text-sm text-gray-500">
        <span class="px-2 py-0.5 rounded-full text-xs font-medium {{ badge_class }}">{{ category }}</span>
        <span>{{ paper.update_date.strftime('%Y-%m-%d') if paper.update_date else 'Unknown' }}</span>
        {% if search_type == 'semantic' and paper.similarity_score is defined %}
        <!-- Semantic search: show both vector-similarity and classifier confidence -->
        <span class="text-blue-500 inline-flex items-center gap-1">
            {{ "%.0f"|format(paper.similarity_score * 100) }}% match
            <span class="cursor-help" title="How similar this paper is to your search query">
                <svg class="w-3.5 h-3.5 text-blue-400" fill="none" stroke="currentColor" viewBox="0 0 24 24">
                    <circle cx="12" cy="12" r="10" stroke-width="1.5"></circle>
                    <path stroke-linecap="round" stroke-width="1.5" d="M12 16v-1m0-3a2 2 0 10-2-2"></path>
                    <circle cx="12" cy="16" r="0.5" fill="currentColor"></circle>
                </svg>
            </span>
        </span>
        <span class="text-gray-400 inline-flex items-center gap-1">
            {{ "%.0f"|format(paper.confidence_score * 100) }}% conf.
            <span class="cursor-help" title="Model confidence this paper introduces a new dataset">
                <svg class="w-3.5 h-3.5 text-gray-300" fill="none" stroke="currentColor" viewBox="0 0 24 24">
                    <circle cx="12" cy="12" r="10" stroke-width="1.5"></circle>
                    <path stroke-linecap="round" stroke-width="1.5" d="M12 16v-1m0-3a2 2 0 10-2-2"></path>
                    <circle cx="12" cy="16" r="0.5" fill="currentColor"></circle>
                </svg>
            </span>
        </span>
        {% else %}
        <!-- Keyword/default view: confidence only; low confidence (<80%) is dimmed -->
        <span class="{% if paper.confidence_score < 0.8 %}text-gray-400{% else %}text-gray-500{% endif %} inline-flex items-center gap-1">
            {{ "%.0f"|format(paper.confidence_score * 100) }}% conf.
            <span class="cursor-help" title="Model confidence this paper introduces a new dataset">
                <svg class="w-3.5 h-3.5 {% if paper.confidence_score < 0.8 %}text-gray-300{% else %}text-gray-400{% endif %}" fill="none" stroke="currentColor" viewBox="0 0 24 24">
                    <circle cx="12" cy="12" r="10" stroke-width="1.5"></circle>
                    <path stroke-linecap="round" stroke-width="1.5" d="M12 16v-1m0-3a2 2 0 10-2-2"></path>
                    <circle cx="12" cy="16" r="0.5" fill="currentColor"></circle>
                </svg>
            </span>
        </span>
        {% endif %}
    </div>

    <!-- Abstract (truncated to 400 chars) -->
    <p class="mt-2 text-gray-600 text-sm leading-relaxed">
        {% if search %}
        {{ paper.abstract[:400]|highlight(search) }}{% if paper.abstract|length > 400 %}...{% endif %}
        {% else %}
        {{ paper.abstract[:400] }}{% if paper.abstract|length > 400 %}...{% endif %}
        {% endif %}
    </p>
</article>
templates/partials/paper_list.html ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Paper count - subtle -->
2
+ <div class="text-xs text-gray-400 mb-4">
3
+ {{ "{:,}".format(total_filtered) }} results{% if search %} for "{{ search }}"{% if search_type == 'semantic' %} <span class="text-blue-400">(semantic)</span>{% endif %}{% endif %}{% if category %} in {{ category }}{% endif %}
4
+ </div>
5
+
6
+ <!-- Paper cards -->
7
+ <div>
8
+ {% for paper in papers %}
9
+ {% include "partials/paper_card.html" %}
10
+ {% endfor %}
11
+ </div>
12
+
13
+ {% if papers|length == 0 %}
14
+ <div class="py-12 text-gray-400 text-sm">
15
+ No papers found. Try adjusting your filters.
16
+ </div>
17
+ {% endif %}
18
+
19
+ <!-- Infinite scroll trigger -->
20
+ {% if has_more %}
21
+ <div hx-get="/papers?page={{ page + 1 }}&category={{ category }}&search={{ search }}&min_confidence={{ min_confidence }}&search_type={{ search_type }}&sort={{ sort }}&since={{ since }}"
22
+ hx-trigger="revealed"
23
+ hx-swap="outerHTML"
24
+ class="py-6 text-center text-xs text-gray-400">
25
+ <noscript>
26
+ <a href="/papers?page={{ page + 1 }}" class="hover:text-gray-600">Load more</a>
27
+ </noscript>
28
+ </div>
29
+ {% else %}
30
+ <div class="py-6 text-center text-xs text-gray-300">
31
+ End of results
32
+ </div>
33
+ {% endif %}