Upload folder using huggingface_hub
Browse files- .gitignore +2 -0
- Dockerfile +26 -0
- README.md +29 -9
- app.py +484 -177
- requirements.txt +11 -354
- static/styles.css +12 -0
- templates/base.html +44 -0
- templates/index.html +137 -0
- templates/partials/paper_card.html +70 -0
- templates/partials/paper_list.html +33 -0
.gitignore
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.env
|
| 2 |
+
data/
|
Dockerfile
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim

WORKDIR /app

# Enable HF transfer for faster downloads
# NOTE(review): this flag needs the `hf_transfer` package at runtime - confirm it is in requirements.txt
ENV HF_HUB_ENABLE_HF_TRANSFER=1

# Install uv for fast dependency management
RUN pip install --no-cache-dir uv

# Copy and install dependencies (system-wide, as root, so the layer is cached)
COPY requirements.txt .
RUN uv pip install --system --no-cache -r requirements.txt

# Create non-root user for security and give it ownership of /app:
# the app downloads the dataset into ./data at runtime, which fails
# with a permission error if /app stays owned by root.
RUN useradd -m -u 1000 user && chown user:user /app

# Copy application, owned by the runtime user
COPY --chown=user:user . .

USER user

# HF Spaces expects port 7860
ENV PORT=7860
EXPOSE 7860

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -1,13 +1,33 @@
|
|
| 1 |
---
|
| 2 |
-
title: New
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk:
|
| 7 |
-
sdk_version: 4.36.1
|
| 8 |
-
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
-
|
| 11 |
---
|
| 12 |
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: ArXiv New ML Datasets
|
| 3 |
+
emoji: 📚
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: docker
|
|
|
|
|
|
|
| 7 |
pinned: false
|
| 8 |
+
license: mit
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# ArXiv New ML Datasets
|
| 12 |
+
|
| 13 |
+
Browse **1.1M+ CS papers** from arXiv, with **50,000+ classified** as introducing new machine learning datasets.
|
| 14 |
+
|
| 15 |
+
## Features
|
| 16 |
+
|
| 17 |
+
- **Keyword search** - Search titles and abstracts
|
| 18 |
+
- **Semantic search** - Find conceptually similar papers using vector embeddings
|
| 19 |
+
- **Filter** by arXiv category (cs.AI, cs.CV, cs.LG, etc.)
|
| 20 |
+
- **Infinite scroll** for smooth browsing
|
| 21 |
+
- Links to arXiv, PDF, and HF Papers
|
| 22 |
+
|
| 23 |
+
## Data Source
|
| 24 |
+
|
| 25 |
+
Papers classified using [ModernBERT](https://huggingface.co/davanstrien/ModernBERT-base-is-new-arxiv-dataset). Embeddings from [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5).
|
| 26 |
+
|
| 27 |
+
Data from [librarian-bots/arxiv-cs-papers-lance](https://huggingface.co/datasets/librarian-bots/arxiv-cs-papers-lance). Updated weekly.
|
| 28 |
+
|
| 29 |
+
## Tech Stack
|
| 30 |
+
|
| 31 |
+
- **Backend**: FastAPI + Polars + Lance
|
| 32 |
+
- **Frontend**: HTMX + Tailwind CSS
|
| 33 |
+
- **Vector Search**: Lance with IVF_PQ index
|
app.py
CHANGED
|
@@ -1,205 +1,512 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
import
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
from
|
| 7 |
-
from
|
| 8 |
-
from setfit import SetFitModel
|
| 9 |
-
from tqdm.auto import tqdm
|
| 10 |
-
import stamina
|
| 11 |
-
from arxiv import UnexpectedEmptyPageError, ArxivError
|
| 12 |
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
-
|
| 16 |
-
|
| 17 |
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
-
client = arxiv.Client(page_size=50, delay_seconds=3, num_retries=2)
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
]
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
#
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
return
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
for
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
|
| 109 |
-
@cached(cache=TTLCache(maxsize=10, ttl=CACHE_TIME))
|
| 110 |
-
def prepare_data():
|
| 111 |
-
print("Downloading arxiv results...")
|
| 112 |
-
arxiv_results = get_arxiv_result()
|
| 113 |
-
print("loading model...")
|
| 114 |
-
model = load_model()
|
| 115 |
-
print("Making predictions...")
|
| 116 |
-
predictions = get_predictions(arxiv_results, model=model)
|
| 117 |
-
df = pd.DataFrame(predictions)
|
| 118 |
-
df.loc[:, "arxiv_id"] = df["url"].str.extract(r"(\d+\.\d+)")
|
| 119 |
-
df.loc[:, "broad_category"] = df["category"].str.split(".").str[0]
|
| 120 |
-
df.loc[:, "markdown"] = df.apply(create_markdown, axis=1)
|
| 121 |
return df
|
| 122 |
|
| 123 |
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
|
|
|
|
|
|
| 147 |
|
| 148 |
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
)
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
)
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
)
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
)
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
)
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
[
|
| 202 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
|
| 205 |
-
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI + HTMX app for browsing arxiv papers with new ML datasets.
|
| 3 |
+
Downloads Lance dataset from HuggingFace Hub and loads locally.
|
| 4 |
+
"""
|
| 5 |
|
| 6 |
+
import re
|
| 7 |
+
from datetime import date, timedelta
|
| 8 |
+
from functools import lru_cache
|
| 9 |
+
from typing import Optional
|
| 10 |
+
from urllib.parse import urlencode
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
+
import lance
|
| 13 |
+
import polars as pl
|
| 14 |
+
from cachetools import TTLCache
|
| 15 |
+
from dotenv import load_dotenv
|
| 16 |
+
from fastapi import FastAPI, Query, Request
|
| 17 |
+
from fastapi.responses import HTMLResponse, RedirectResponse
|
| 18 |
+
from fastapi.staticfiles import StaticFiles
|
| 19 |
+
from fastapi.templating import Jinja2Templates
|
| 20 |
+
from huggingface_hub import snapshot_download
|
| 21 |
+
from markupsafe import Markup
|
| 22 |
|
| 23 |
+
# Load .env file for local development (HF_TOKEN)
|
| 24 |
+
load_dotenv()
|
| 25 |
|
| 26 |
+
app = FastAPI(title="ArXiv New ML Datasets")
|
| 27 |
+
app.mount("/static", StaticFiles(directory="static"), name="static")
|
| 28 |
+
templates = Jinja2Templates(directory="templates")
|
| 29 |
|
|
|
|
| 30 |
|
| 31 |
+
def highlight_search(text: str, search: str) -> Markup:
    """Return ``text`` as HTML-safe markup with ``search`` matches highlighted.

    Registered as the ``highlight`` Jinja filter. The result is always
    HTML-escaped, whether or not a search term is supplied, because it is
    wrapped in ``Markup`` and therefore bypasses template auto-escaping.

    Args:
        text: Raw (unescaped) text to render. Falsy values yield empty markup.
        search: Literal search string, matched case-insensitively. When empty,
            the text is escaped and returned without highlighting.

    Returns:
        ``Markup`` in which each match is wrapped in a ``<mark>`` element.
    """
    import html

    if not text:
        return Markup("")

    raw = str(text)
    if not search:
        # Still escape: Markup() marks the string as safe, so raw text must
        # never be passed through unescaped.
        return Markup(html.escape(raw))

    # Match against the RAW text and escape each piece separately. Matching
    # on already-escaped text would miss terms containing &, <, > or quotes
    # and could split entities such as "&amp;" in the middle.
    pattern = re.compile(re.escape(search), re.IGNORECASE)
    pieces = []
    cursor = 0
    for match in pattern.finditer(raw):
        pieces.append(html.escape(raw[cursor:match.start()]))
        pieces.append(
            '<mark class="bg-yellow-200 px-0.5 rounded">'
            f"{html.escape(match.group())}</mark>"
        )
        cursor = match.end()
    pieces.append(html.escape(raw[cursor:]))
    return Markup("".join(pieces))
|
| 47 |
|
| 48 |
+
|
| 49 |
+
# Register custom filter
|
| 50 |
+
templates.env.filters["highlight"] = highlight_search
|
| 51 |
+
|
| 52 |
+
# Dataset config
|
| 53 |
+
DATASET_REPO = "librarian-bots/arxiv-cs-papers-lance"
|
| 54 |
+
|
| 55 |
+
# Cache for dataset (reload every 6 hours)
|
| 56 |
+
_dataset_cache: TTLCache = TTLCache(maxsize=1, ttl=60 * 60 * 6)
|
| 57 |
+
|
| 58 |
+
# Cache for Lance dataset connection (for vector search)
|
| 59 |
+
_lance_cache: dict = {}
|
| 60 |
+
|
| 61 |
+
# Cache for embedding model (lazy loaded on first semantic search)
|
| 62 |
+
_model_cache: dict = {}
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def get_lance_dataset():
    """Return the opened Lance dataset, downloading it on first use.

    The first call snapshots ``DATASET_REPO`` from the Hugging Face Hub into
    ``./data/arxiv-lance`` and opens ``data/train.lance`` inside it. The
    handle is stored in ``_lance_cache`` and reused by later calls.
    """
    if "ds" in _lance_cache:
        return _lance_cache["ds"]

    # local_dir gives real files rather than symlinks (Lance needs real files)
    local_dir = "./data/arxiv-lance"
    print(f"Downloading dataset from {DATASET_REPO} to {local_dir}...")
    snapshot_download(repo_id=DATASET_REPO, repo_type="dataset", local_dir=local_dir)

    lance_path = f"{local_dir}/data/train.lance"
    print(f"Loading Lance dataset from {lance_path}")
    dataset = lance.dataset(lance_path)
    _lance_cache["ds"] = dataset
    return dataset
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def get_embedding_model():
    """Return the shared ``BAAI/bge-base-en-v1.5`` sentence-transformer.

    The model is built on the first call and kept in ``_model_cache``;
    ``sentence_transformers`` is imported only when the model is first needed.
    """
    if "model" in _model_cache:
        return _model_cache["model"]

    from sentence_transformers import SentenceTransformer

    print("Loading embedding model...")
    encoder = SentenceTransformer("BAAI/bge-base-en-v1.5")
    _model_cache["model"] = encoder
    print("Embedding model loaded!")
    return encoder
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def get_dataframe() -> pl.DataFrame:
    """Return the papers table as a Polars DataFrame (TTL-cached).

    On a cache miss the display/filter columns are read from the Lance
    dataset (embeddings are left out to save memory), converted to Polars,
    and stored in ``_dataset_cache`` under the key ``"df"``.
    """
    try:
        return _dataset_cache["df"]
    except KeyError:
        pass  # missing or expired entry: rebuild below

    lance_ds = get_lance_dataset()  # Downloads from HF Hub if not cached
    table = lance_ds.to_table(
        columns=[
            "id",
            "title",
            "abstract",
            "categories",
            "update_date",
            "authors",
            "is_new_dataset",
            "confidence_score",
        ]
    )
    frame = pl.from_arrow(table)
    _dataset_cache["df"] = frame
    print(f"Loaded {len(frame):,} papers")
    return frame
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
@lru_cache(maxsize=1)
def get_categories() -> list[str]:
    """Return the ML-related categories that occur as a primary category.

    The primary category is the first space-separated token of the
    ``categories`` column. Only entries from a fixed shortlist are returned,
    in shortlist order. The result is memoised for the process lifetime.
    """
    primary = pl.col("categories").str.split(" ").list.first().alias("cat")
    present = (
        get_dataframe()
        .select(primary)
        .unique()
        .sort("cat")
        .to_series()
        .to_list()
    )
    shortlist = ["cs.AI", "cs.CL", "cs.CV", "cs.LG", "cs.NE", "cs.IR", "cs.RO", "stat.ML"]
    return [name for name in shortlist if name in present]
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
@lru_cache(maxsize=1)
def get_confidence_counts() -> dict[str, int]:
    """Count new-dataset papers at or above each fixed confidence threshold.

    Returns a mapping from the threshold (as a string, e.g. ``"0.65"``) to the
    number of rows with ``is_new_dataset`` set whose ``confidence_score`` is
    greater than or equal to that threshold. Memoised for the process lifetime.
    """
    flagged = get_dataframe().filter(pl.col("is_new_dataset"))
    counts: dict[str, int] = {}
    for threshold in (0.5, 0.6, 0.65, 0.7, 0.71):
        at_or_above = flagged.filter(pl.col("confidence_score") >= threshold)
        counts[str(threshold)] = at_or_above.height
    return counts
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
@lru_cache(maxsize=1)
def get_histogram_data() -> dict:
    """Build the confidence-score histogram shown on the home page.

    The displayed range is derived from the data: the minimum score is
    floored and the maximum is raised to a 5% boundary (clamped to [0, 1]),
    then widened towards 25% if narrower. The range is split into 25
    equal-width bins.

    Returns:
        A dict with:
        - ``bins``: list of dicts with ``bin_start``, ``bin_end``, ``bin_pct``,
          ``count``, ``new_dataset_count``, ``height_pct``, ``new_height_pct``
          (both scaled so the tallest bin is 100) and ``papers_above``
          (rows remaining at or above the bin's lower edge).
        - ``min_pct`` / ``max_pct``: displayed range bounds.
        - ``total_papers``: number of rows considered.
        - ``new_dataset_count``: rows with ``is_new_dataset`` set.
    """
    df = get_dataframe()
    all_papers = df.select("confidence_score", "is_new_dataset")
    score = pl.col("confidence_score")

    # min()/max() return None for an empty or all-null column; fall back to
    # the full [0, 1] range instead of crashing on float(None).
    raw_min = all_papers["confidence_score"].min()
    raw_max = all_papers["confidence_score"].max()
    actual_min = float(raw_min) if raw_min is not None else 0.0
    actual_max = float(raw_max) if raw_max is not None else 1.0

    # Round down to nearest 5% for min, round up for max
    min_pct = max(0, (int(actual_min * 20) / 20))
    max_pct = min(1, ((int(actual_max * 20) + 1) / 20))

    # Ensure minimum range of 25% for usability
    if max_pct - min_pct < 0.25:
        center = (min_pct + max_pct) / 2
        min_pct = max(0, center - 0.125)
        max_pct = min(1, center + 0.125)

    num_bins = 25
    bin_width = (max_pct - min_pct) / num_bins
    last_index = num_bins - 1

    bins = []
    for i in range(num_bins):
        bin_start = min_pct + i * bin_width
        is_last = i == last_index
        # Pin the final edge to max_pct exactly and make the last bin
        # closed on the right; otherwise scores equal to the upper bound
        # (e.g. a confidence of exactly 1.0) fall outside every bin.
        bin_end = max_pct if is_last else min_pct + (i + 1) * bin_width
        upper_ok = (score <= bin_end) if is_last else (score < bin_end)

        in_bin = all_papers.filter((score >= bin_start) & upper_ok)
        count = in_bin.height
        new_dataset_count = in_bin.filter(pl.col("is_new_dataset")).height

        bins.append({
            "bin_start": round(bin_start, 3),
            "bin_end": round(bin_end, 3),
            "bin_pct": int(bin_start * 100),
            "count": count,
            "new_dataset_count": new_dataset_count,
        })

    # Normalize counts for display (max height = 100%)
    max_count = max(b["count"] for b in bins) if bins else 1
    for b in bins:
        b["height_pct"] = int((b["count"] / max_count) * 100) if max_count > 0 else 0
        b["new_height_pct"] = int((b["new_dataset_count"] / max_count) * 100) if max_count > 0 else 0

    # Running "at or above this bin" totals: start from all rows and subtract
    # each bin's count while walking upwards.
    total_so_far = all_papers.height
    for b in bins:
        b["papers_above"] = total_so_far
        total_so_far -= b["count"]

    return {
        "bins": bins,
        "min_pct": round(min_pct, 2),
        "max_pct": round(max_pct, 2),
        "total_papers": all_papers.height,
        "new_dataset_count": all_papers.filter(pl.col("is_new_dataset")).height,
    }
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
def parse_since(since: str) -> Optional[date]:
    """Translate a ``since`` code into the earliest date to include.

    Recognised codes are ``"1m"`` (30 days), ``"6m"`` (180 days) and ``"1y"``
    (365 days), each counted back from today. An empty value or any other
    code means "all time" and yields ``None``.
    """
    if not since:
        return None

    lookback_days = {"1m": 30, "6m": 180, "1y": 365}.get(since)
    if lookback_days is None:
        return None
    return date.today() - timedelta(days=lookback_days)
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
def filter_papers(
    df: pl.DataFrame,
    category: Optional[str] = None,
    search: Optional[str] = None,
    min_confidence: float = 0.5,
    since: Optional[str] = None,
) -> pl.DataFrame:
    """Apply the keyword-search filters to the papers dataframe.

    Args:
        df: Papers table with ``confidence_score``, ``categories``, ``title``,
            ``abstract`` and ``update_date`` columns.
        category: Keep rows whose ``categories`` string contains this text.
        search: Keep rows whose title or abstract contains this text,
            compared case-insensitively.
        min_confidence: Keep rows with ``confidence_score`` at or above this
            value; ``0`` (or less) disables the confidence filter.
        since: Date-window code understood by ``parse_since``.

    Returns:
        The filtered dataframe (row order unchanged).
    """
    if min_confidence > 0:
        df = df.filter(pl.col("confidence_score") >= min_confidence)

    if category:
        # literal=True: the value is plain text, not a regex. Without it the
        # "." in "cs.AI" matches any character.
        df = df.filter(pl.col("categories").str.contains(category, literal=True))

    if search:
        # literal=True: user input such as "c++" or "(" is not a valid regex
        # and would otherwise raise instead of searching.
        needle = search.lower()
        df = df.filter(
            pl.col("title").str.to_lowercase().str.contains(needle, literal=True)
            | pl.col("abstract").str.to_lowercase().str.contains(needle, literal=True)
        )

    # Date filter
    min_date = parse_since(since)
    if min_date:
        df = df.filter(pl.col("update_date") >= min_date)

    return df
|
| 269 |
|
| 270 |
|
| 271 |
+
def paginate_papers(
    df: pl.DataFrame,
    page: int = 1,
    per_page: int = 20,
    sort: str = "date",
) -> tuple[pl.DataFrame, bool]:
    """Order the rows and cut out one page.

    With ``sort == "date"`` rows are ordered by ``update_date`` then
    ``confidence_score``, both descending. Any other value leaves the incoming
    order untouched (used for similarity-ranked results).

    Returns:
        ``(page_df, has_more)`` where ``has_more`` tells whether at least one
        row exists beyond this page.
    """
    ordered = (
        df.sort(["update_date", "confidence_score"], descending=[True, True])
        if sort == "date"
        else df
    )

    # Fetch one extra row so the presence of a next page can be detected.
    offset = (page - 1) * per_page
    window = ordered.slice(offset, per_page + 1)
    return window.head(per_page), window.height > per_page
|
| 296 |
|
| 297 |
|
| 298 |
+
def semantic_search(
    query: str,
    k: int = 100,
    category: Optional[str] = None,
    min_confidence: float = 0.5,
    since: Optional[str] = None,
) -> pl.DataFrame:
    """Search papers by vector similarity using Lance nearest-neighbour search.

    Args:
        query: Free-text query; encoded with the shared embedding model.
        k: Number of nearest neighbours to request.
        category: Keep rows whose ``categories`` contains this text.
        min_confidence: Keep rows with ``confidence_score`` at or above this
            value; ``0`` (or less) disables the confidence filter.
        since: Date-window code understood by ``parse_since``.

    Returns:
        DataFrame of matches with a ``similarity_score`` column clipped to
        [0, 1] (higher is more similar); the raw ``_distance`` is dropped.
    """
    model = get_embedding_model()
    query_embedding = model.encode(query).tolist()

    ds = get_lance_dataset()

    # Build SQL filter (Lance supports SQL-like syntax)
    filters = []
    if min_confidence > 0:
        filters.append(f"confidence_score >= {min_confidence}")
    if category:
        # Escape single quotes in category name for SQL safety
        safe_category = category.replace("'", "''")
        filters.append(f"categories LIKE '%{safe_category}%'")
    # Date filter - use TIMESTAMP literal for Lance/DataFusion
    min_date = parse_since(since)
    if min_date:
        filters.append(f"update_date >= TIMESTAMP '{min_date.isoformat()} 00:00:00'")
    filter_str = " AND ".join(filters) if filters else None

    # Apply the filter BEFORE the k-NN step. By default Lance filters after
    # the vector search, so a selective filter would leave far fewer than k
    # rows (possibly none) for pagination.
    results = ds.scanner(
        nearest={"column": "embedding", "q": query_embedding, "k": k},
        filter=filter_str,
        prefilter=filter_str is not None,
        columns=["id", "title", "abstract", "categories", "update_date",
                 "authors", "confidence_score", "_distance"],
    ).to_table()

    df = pl.from_arrow(results)

    # Map distance to a 0-1 similarity: similarity = 1 - distance / 2.
    # NOTE(review): this assumes unit-normalised embeddings and that
    # `_distance` is on a scale where distance / 2 is meaningful - confirm
    # against the index metric.
    df = df.with_columns(
        (1 - pl.col("_distance") / 2).clip(0, 1).alias("similarity_score")
    ).drop("_distance")

    return df
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
@app.get("/", response_class=HTMLResponse)
async def home(
    request: Request,
    search: Optional[str] = Query(None),
    search_type: str = Query("keyword"),
    category: Optional[str] = Query(None),
    min_confidence: str = Query("0.5"),  # String to preserve exact value for template
    since: Optional[str] = Query(None),
    sort: str = Query("date"),
):
    """Render ``index.html`` with dataset stats and the filter state from the URL.

    Query parameters are not applied here; they are passed straight to the
    template so the page can restore the same filters.
    """
    df = get_dataframe()

    context = {
        "request": request,
        "categories": get_categories(),
        # Headline stats
        "total_papers": len(df),
        "new_dataset_count": df.filter(pl.col("is_new_dataset")).height,
        "histogram_data": get_histogram_data(),
        # Filter state echoed back for URL persistence
        "search": search or "",
        "search_type": search_type,
        "category": category or "",
        "min_confidence": min_confidence,
        "since": since or "",
        "sort": sort,
    }
    return templates.TemplateResponse("index.html", context)
|
| 384 |
+
|
| 385 |
+
|
| 386 |
+
@app.get("/papers", response_class=HTMLResponse)
async def get_papers(
    request: Request,
    page: int = Query(1, ge=1),
    per_page: int = Query(20, ge=1, le=100),
    category: Optional[str] = Query(None),
    search: Optional[str] = Query(None),
    min_confidence: float = Query(0.5, ge=0, le=1),
    search_type: str = Query("keyword"),  # "keyword" or "semantic"
    sort: str = Query("date"),  # "date" or "relevance"
    since: Optional[str] = Query(None),  # "1m", "6m", "1y", or None for all
):
    """Return one page of filtered papers as an HTML partial for HTMX.

    Requests without an ``HX-Request`` header are redirected (302) to the home
    page with the same query string, because this endpoint only renders a
    fragment. The response carries an ``HX-Push-Url`` header pointing at the
    equivalent ``/`` URL, listing only non-default parameters.
    """
    # Direct browser visit: send the user to the full page instead.
    if "HX-Request" not in request.headers:
        query_string = str(request.url.query)
        target = f"/?{query_string}" if query_string else "/"
        return RedirectResponse(url=target, status_code=302)

    if search and search_type == "semantic":
        # Vector search: results arrive ordered by similarity.
        filtered_df = semantic_search(
            query=search,
            k=per_page * 5,  # Get more for pagination buffer
            category=category,
            min_confidence=min_confidence,
            since=since,
        )
        # Relevance unless the caller explicitly asked for date order.
        effective_sort = "date" if sort == "date" else "relevance"
    else:
        # Keyword path: always ordered by date.
        filtered_df = filter_papers(
            get_dataframe(),
            category=category,
            search=search,
            min_confidence=min_confidence,
            since=since,
        )
        effective_sort = "date"

    page_df, has_more = paginate_papers(
        filtered_df, page=page, per_page=per_page, sort=effective_sort
    )

    # Rows as plain dicts for the template
    papers = page_df.to_dicts()

    # Clean URL for browser history (/ instead of /papers); defaults are
    # omitted to keep it short.
    candidates = (
        ("search", search, bool(search)),
        ("search_type", search_type, search_type != "keyword"),
        ("category", category, bool(category)),
        ("min_confidence", min_confidence, min_confidence != 0.5),
        ("since", since, bool(since)),
        ("sort", sort, sort != "date"),
    )
    params = {key: value for key, value, keep in candidates if keep}
    push_url = "/?" + urlencode(params) if params else "/"

    context = {
        "request": request,
        "papers": papers,
        "page": page,
        "has_more": has_more,
        "category": category or "",
        "search": search or "",
        "min_confidence": min_confidence,
        "search_type": search_type,
        "sort": sort,
        "since": since or "",
        "total_filtered": len(filtered_df),
    }
    response = templates.TemplateResponse("partials/paper_list.html", context)
    # Tell HTMX to push clean URL (/ not /papers)
    response.headers["HX-Push-Url"] = push_url
    return response
|
| 477 |
+
|
| 478 |
+
|
| 479 |
+
@app.get("/api/stats")
async def get_stats():
    """Return dataset statistics as JSON.

    Includes the total row count, the number of rows flagged
    ``is_new_dataset``, the mean ``confidence_score`` and the min/max
    ``update_date`` rendered as strings.
    """
    df = get_dataframe()
    update_dates = df["update_date"]
    flagged_count = len(df.filter(pl.col("is_new_dataset")))

    date_range = {
        "min": str(update_dates.min()),
        "max": str(update_dates.max()),
    }
    return {
        "total_papers": len(df),
        "new_dataset_count": flagged_count,
        "avg_confidence": float(df["confidence_score"].mean()),
        "date_range": date_range,
    }
|
| 495 |
+
|
| 496 |
+
|
| 497 |
+
# Preload dataset and model on startup
|
| 498 |
+
@app.on_event("startup")
async def startup_event():
    """Warm the dataset and embedding-model caches when the app starts.

    Both loaders cache their result, so later requests reuse what is loaded
    here instead of paying the cost on the first request.
    """
    warmups = (
        ("dataset", get_dataframe),
        ("embedding model", get_embedding_model),
    )
    for label, loader in warmups:
        print(f"Preloading {label}...")
        loader()
        print(f"{label.capitalize()} loaded!")
|
| 507 |
+
|
| 508 |
+
|
| 509 |
+
if __name__ == "__main__":
    # Local entry point: serve on all interfaces, port 7860.
    from uvicorn import run as serve

    serve(app, host="0.0.0.0", port=7860)
|
requirements.txt
CHANGED
|
@@ -1,354 +1,11 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
# via gradio
|
| 13 |
-
annotated-types==0.7.0
|
| 14 |
-
# via pydantic
|
| 15 |
-
anyio==4.4.0
|
| 16 |
-
# via
|
| 17 |
-
# httpx
|
| 18 |
-
# starlette
|
| 19 |
-
# watchfiles
|
| 20 |
-
apscheduler==3.10.4
|
| 21 |
-
# via -r requirements.in
|
| 22 |
-
arxiv==2.1.0
|
| 23 |
-
# via -r requirements.in
|
| 24 |
-
attrs==23.2.0
|
| 25 |
-
# via
|
| 26 |
-
# aiohttp
|
| 27 |
-
# jsonschema
|
| 28 |
-
# referencing
|
| 29 |
-
cachetools==5.3.3
|
| 30 |
-
# via -r requirements.in
|
| 31 |
-
certifi==2024.6.2
|
| 32 |
-
# via
|
| 33 |
-
# httpcore
|
| 34 |
-
# httpx
|
| 35 |
-
# requests
|
| 36 |
-
charset-normalizer==3.3.2
|
| 37 |
-
# via requests
|
| 38 |
-
click==8.1.7
|
| 39 |
-
# via
|
| 40 |
-
# typer
|
| 41 |
-
# uvicorn
|
| 42 |
-
contourpy==1.2.1
|
| 43 |
-
# via matplotlib
|
| 44 |
-
cycler==0.12.1
|
| 45 |
-
# via matplotlib
|
| 46 |
-
datasets==2.14.4
|
| 47 |
-
# via
|
| 48 |
-
# evaluate
|
| 49 |
-
# setfit
|
| 50 |
-
dill==0.3.7
|
| 51 |
-
# via
|
| 52 |
-
# datasets
|
| 53 |
-
# evaluate
|
| 54 |
-
# multiprocess
|
| 55 |
-
dnspython==2.6.1
|
| 56 |
-
# via email-validator
|
| 57 |
-
email-validator==2.1.1
|
| 58 |
-
# via fastapi
|
| 59 |
-
evaluate==0.4.2
|
| 60 |
-
# via setfit
|
| 61 |
-
fastapi==0.111.0
|
| 62 |
-
# via gradio
|
| 63 |
-
fastapi-cli==0.0.4
|
| 64 |
-
# via fastapi
|
| 65 |
-
feedparser==6.0.10
|
| 66 |
-
# via arxiv
|
| 67 |
-
ffmpy==0.3.2
|
| 68 |
-
# via gradio
|
| 69 |
-
filelock==3.14.0
|
| 70 |
-
# via
|
| 71 |
-
# huggingface-hub
|
| 72 |
-
# torch
|
| 73 |
-
# transformers
|
| 74 |
-
fonttools==4.53.0
|
| 75 |
-
# via matplotlib
|
| 76 |
-
frozenlist==1.4.1
|
| 77 |
-
# via
|
| 78 |
-
# aiohttp
|
| 79 |
-
# aiosignal
|
| 80 |
-
fsspec==2024.6.0
|
| 81 |
-
# via
|
| 82 |
-
# datasets
|
| 83 |
-
# evaluate
|
| 84 |
-
# gradio-client
|
| 85 |
-
# huggingface-hub
|
| 86 |
-
# torch
|
| 87 |
-
gradio==4.36.1
|
| 88 |
-
# via -r requirements.in
|
| 89 |
-
gradio-client==1.0.1
|
| 90 |
-
# via gradio
|
| 91 |
-
h11==0.14.0
|
| 92 |
-
# via
|
| 93 |
-
# httpcore
|
| 94 |
-
# uvicorn
|
| 95 |
-
hf-transfer==0.1.6
|
| 96 |
-
# via -r requirements.in
|
| 97 |
-
httpcore==1.0.5
|
| 98 |
-
# via httpx
|
| 99 |
-
httptools==0.6.1
|
| 100 |
-
# via uvicorn
|
| 101 |
-
httpx==0.27.0
|
| 102 |
-
# via
|
| 103 |
-
# fastapi
|
| 104 |
-
# gradio
|
| 105 |
-
# gradio-client
|
| 106 |
-
huggingface-hub==0.23.3
|
| 107 |
-
# via
|
| 108 |
-
# datasets
|
| 109 |
-
# evaluate
|
| 110 |
-
# gradio
|
| 111 |
-
# gradio-client
|
| 112 |
-
# sentence-transformers
|
| 113 |
-
# setfit
|
| 114 |
-
# tokenizers
|
| 115 |
-
# transformers
|
| 116 |
-
idna==3.7
|
| 117 |
-
# via
|
| 118 |
-
# anyio
|
| 119 |
-
# email-validator
|
| 120 |
-
# httpx
|
| 121 |
-
# requests
|
| 122 |
-
# yarl
|
| 123 |
-
importlib-resources==6.4.0
|
| 124 |
-
# via gradio
|
| 125 |
-
jinja2==3.1.4
|
| 126 |
-
# via
|
| 127 |
-
# altair
|
| 128 |
-
# fastapi
|
| 129 |
-
# gradio
|
| 130 |
-
# torch
|
| 131 |
-
joblib==1.4.2
|
| 132 |
-
# via scikit-learn
|
| 133 |
-
jsonschema==4.22.0
|
| 134 |
-
# via altair
|
| 135 |
-
jsonschema-specifications==2023.12.1
|
| 136 |
-
# via jsonschema
|
| 137 |
-
kiwisolver==1.4.5
|
| 138 |
-
# via matplotlib
|
| 139 |
-
markdown-it-py==3.0.0
|
| 140 |
-
# via rich
|
| 141 |
-
markupsafe==2.1.5
|
| 142 |
-
# via
|
| 143 |
-
# gradio
|
| 144 |
-
# jinja2
|
| 145 |
-
matplotlib==3.9.0
|
| 146 |
-
# via gradio
|
| 147 |
-
mdurl==0.1.2
|
| 148 |
-
# via markdown-it-py
|
| 149 |
-
mpmath==1.3.0
|
| 150 |
-
# via sympy
|
| 151 |
-
multidict==6.0.5
|
| 152 |
-
# via
|
| 153 |
-
# aiohttp
|
| 154 |
-
# yarl
|
| 155 |
-
multiprocess==0.70.15
|
| 156 |
-
# via
|
| 157 |
-
# datasets
|
| 158 |
-
# evaluate
|
| 159 |
-
networkx==3.3
|
| 160 |
-
# via torch
|
| 161 |
-
numpy==1.26.4
|
| 162 |
-
# via
|
| 163 |
-
# altair
|
| 164 |
-
# contourpy
|
| 165 |
-
# datasets
|
| 166 |
-
# evaluate
|
| 167 |
-
# gradio
|
| 168 |
-
# matplotlib
|
| 169 |
-
# pandas
|
| 170 |
-
# pyarrow
|
| 171 |
-
# scikit-learn
|
| 172 |
-
# scipy
|
| 173 |
-
# sentence-transformers
|
| 174 |
-
# transformers
|
| 175 |
-
orjson==3.10.3
|
| 176 |
-
# via
|
| 177 |
-
# fastapi
|
| 178 |
-
# gradio
|
| 179 |
-
packaging==24.1
|
| 180 |
-
# via
|
| 181 |
-
# altair
|
| 182 |
-
# datasets
|
| 183 |
-
# evaluate
|
| 184 |
-
# gradio
|
| 185 |
-
# gradio-client
|
| 186 |
-
# huggingface-hub
|
| 187 |
-
# matplotlib
|
| 188 |
-
# setfit
|
| 189 |
-
# transformers
|
| 190 |
-
pandas==2.2.2
|
| 191 |
-
# via
|
| 192 |
-
# altair
|
| 193 |
-
# datasets
|
| 194 |
-
# evaluate
|
| 195 |
-
# gradio
|
| 196 |
-
pillow==10.3.0
|
| 197 |
-
# via
|
| 198 |
-
# gradio
|
| 199 |
-
# matplotlib
|
| 200 |
-
# sentence-transformers
|
| 201 |
-
pyarrow==16.1.0
|
| 202 |
-
# via datasets
|
| 203 |
-
pydantic==2.7.3
|
| 204 |
-
# via
|
| 205 |
-
# fastapi
|
| 206 |
-
# gradio
|
| 207 |
-
pydantic-core==2.18.4
|
| 208 |
-
# via pydantic
|
| 209 |
-
pydub==0.25.1
|
| 210 |
-
# via gradio
|
| 211 |
-
pygments==2.18.0
|
| 212 |
-
# via rich
|
| 213 |
-
pyparsing==3.1.2
|
| 214 |
-
# via matplotlib
|
| 215 |
-
python-dateutil==2.9.0.post0
|
| 216 |
-
# via
|
| 217 |
-
# matplotlib
|
| 218 |
-
# pandas
|
| 219 |
-
python-dotenv==1.0.1
|
| 220 |
-
# via uvicorn
|
| 221 |
-
python-multipart==0.0.9
|
| 222 |
-
# via
|
| 223 |
-
# fastapi
|
| 224 |
-
# gradio
|
| 225 |
-
pytz==2024.1
|
| 226 |
-
# via
|
| 227 |
-
# apscheduler
|
| 228 |
-
# pandas
|
| 229 |
-
pyyaml==6.0.1
|
| 230 |
-
# via
|
| 231 |
-
# datasets
|
| 232 |
-
# gradio
|
| 233 |
-
# huggingface-hub
|
| 234 |
-
# transformers
|
| 235 |
-
# uvicorn
|
| 236 |
-
referencing==0.35.1
|
| 237 |
-
# via
|
| 238 |
-
# jsonschema
|
| 239 |
-
# jsonschema-specifications
|
| 240 |
-
regex==2024.5.15
|
| 241 |
-
# via transformers
|
| 242 |
-
requests==2.31.0
|
| 243 |
-
# via
|
| 244 |
-
# arxiv
|
| 245 |
-
# datasets
|
| 246 |
-
# evaluate
|
| 247 |
-
# huggingface-hub
|
| 248 |
-
# transformers
|
| 249 |
-
rich==13.7.1
|
| 250 |
-
# via typer
|
| 251 |
-
rpds-py==0.18.1
|
| 252 |
-
# via
|
| 253 |
-
# jsonschema
|
| 254 |
-
# referencing
|
| 255 |
-
ruff==0.4.8
|
| 256 |
-
# via gradio
|
| 257 |
-
safetensors==0.4.3
|
| 258 |
-
# via transformers
|
| 259 |
-
scikit-learn==1.2.2
|
| 260 |
-
# via
|
| 261 |
-
# -r requirements.in
|
| 262 |
-
# sentence-transformers
|
| 263 |
-
# setfit
|
| 264 |
-
scipy==1.13.1
|
| 265 |
-
# via
|
| 266 |
-
# scikit-learn
|
| 267 |
-
# sentence-transformers
|
| 268 |
-
semantic-version==2.10.0
|
| 269 |
-
# via gradio
|
| 270 |
-
sentence-transformers==3.0.1
|
| 271 |
-
# via setfit
|
| 272 |
-
setfit==1.0.3
|
| 273 |
-
# via -r requirements.in
|
| 274 |
-
sgmllib3k==1.0.0
|
| 275 |
-
# via feedparser
|
| 276 |
-
shellingham==1.5.4
|
| 277 |
-
# via typer
|
| 278 |
-
six==1.16.0
|
| 279 |
-
# via
|
| 280 |
-
# apscheduler
|
| 281 |
-
# python-dateutil
|
| 282 |
-
sniffio==1.3.1
|
| 283 |
-
# via
|
| 284 |
-
# anyio
|
| 285 |
-
# httpx
|
| 286 |
-
stamina==24.2.0
|
| 287 |
-
# via -r requirements.in
|
| 288 |
-
starlette==0.37.2
|
| 289 |
-
# via fastapi
|
| 290 |
-
sympy==1.12.1
|
| 291 |
-
# via torch
|
| 292 |
-
tenacity==8.3.0
|
| 293 |
-
# via stamina
|
| 294 |
-
threadpoolctl==3.5.0
|
| 295 |
-
# via scikit-learn
|
| 296 |
-
tokenizers==0.19.1
|
| 297 |
-
# via transformers
|
| 298 |
-
tomlkit==0.12.0
|
| 299 |
-
# via gradio
|
| 300 |
-
toolz==0.12.1
|
| 301 |
-
# via altair
|
| 302 |
-
torch==2.3.1
|
| 303 |
-
# via sentence-transformers
|
| 304 |
-
tqdm==4.66.4
|
| 305 |
-
# via
|
| 306 |
-
# datasets
|
| 307 |
-
# evaluate
|
| 308 |
-
# huggingface-hub
|
| 309 |
-
# sentence-transformers
|
| 310 |
-
# transformers
|
| 311 |
-
transformers==4.41.2
|
| 312 |
-
# via sentence-transformers
|
| 313 |
-
typer==0.12.3
|
| 314 |
-
# via
|
| 315 |
-
# fastapi-cli
|
| 316 |
-
# gradio
|
| 317 |
-
typing-extensions==4.12.2
|
| 318 |
-
# via
|
| 319 |
-
# fastapi
|
| 320 |
-
# gradio
|
| 321 |
-
# gradio-client
|
| 322 |
-
# huggingface-hub
|
| 323 |
-
# pydantic
|
| 324 |
-
# pydantic-core
|
| 325 |
-
# torch
|
| 326 |
-
# typer
|
| 327 |
-
tzdata==2024.1
|
| 328 |
-
# via pandas
|
| 329 |
-
tzlocal==5.2
|
| 330 |
-
# via apscheduler
|
| 331 |
-
ujson==5.10.0
|
| 332 |
-
# via fastapi
|
| 333 |
-
urllib3==2.2.1
|
| 334 |
-
# via
|
| 335 |
-
# gradio
|
| 336 |
-
# requests
|
| 337 |
-
uvicorn==0.30.1
|
| 338 |
-
# via
|
| 339 |
-
# fastapi
|
| 340 |
-
# gradio
|
| 341 |
-
uvloop==0.19.0
|
| 342 |
-
# via uvicorn
|
| 343 |
-
watchfiles==0.22.0
|
| 344 |
-
# via uvicorn
|
| 345 |
-
websockets==11.0.3
|
| 346 |
-
# via
|
| 347 |
-
# gradio-client
|
| 348 |
-
# uvicorn
|
| 349 |
-
xxhash==3.4.1
|
| 350 |
-
# via
|
| 351 |
-
# datasets
|
| 352 |
-
# evaluate
|
| 353 |
-
yarl==1.9.4
|
| 354 |
-
# via aiohttp
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn[standard]
|
| 3 |
+
jinja2
|
| 4 |
+
markupsafe
|
| 5 |
+
polars
|
| 6 |
+
huggingface-hub[hf_transfer]
|
| 7 |
+
python-dotenv
|
| 8 |
+
cachetools
|
| 9 |
+
pyarrow
|
| 10 |
+
pylance>=0.20
|
| 11 |
+
sentence-transformers>=3.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static/styles.css
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* Custom styles - most styling is via Tailwind CDN */
|
| 2 |
+
|
| 3 |
+
/* Ensure smooth scrolling */
|
| 4 |
+
html {
|
| 5 |
+
scroll-behavior: smooth;
|
| 6 |
+
}
|
| 7 |
+
|
| 8 |
+
/* Better focus styles */
|
| 9 |
+
:focus-visible {
|
| 10 |
+
outline: 2px solid #3b82f6;
|
| 11 |
+
outline-offset: 2px;
|
| 12 |
+
}
|
templates/base.html
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>{% block title %}Dataset Papers on ArXiv{% endblock %}</title>
|
| 7 |
+
|
| 8 |
+
<!-- Tailwind CSS -->
|
| 9 |
+
<script src="https://cdn.tailwindcss.com"></script>
|
| 10 |
+
|
| 11 |
+
<!-- HTMX -->
|
| 12 |
+
<script src="https://unpkg.com/htmx.org@1.9.12"></script>
|
| 13 |
+
|
| 14 |
+
<style>
|
| 15 |
+
/* Loading indicator - subtle */
|
| 16 |
+
.htmx-indicator { display: none; }
|
| 17 |
+
.htmx-request .htmx-indicator,
|
| 18 |
+
.htmx-request.htmx-indicator { display: inline; }
|
| 19 |
+
|
| 20 |
+
/* Content fades during load */
|
| 21 |
+
.htmx-request #paper-list { opacity: 0.5; transition: opacity 0.15s; }
|
| 22 |
+
</style>
|
| 23 |
+
</head>
|
| 24 |
+
<body class="bg-white min-h-screen text-gray-900">
|
| 25 |
+
<header class="border-b border-gray-200">
|
| 26 |
+
<div class="max-w-3xl mx-auto px-4 py-6">
|
| 27 |
+
<h1 class="text-xl font-semibold">Dataset Papers on ArXiv</h1>
|
| 28 |
+
<p class="text-sm text-gray-500 mt-1">CS papers predicted to introduce new ML datasets by <a href="https://huggingface.co/davanstrien/ModernBERT-base-is-new-arxiv-dataset" class="inline-flex items-center gap-1 text-gray-700 hover:text-blue-600 font-medium">this model <span class="text-base">🤗</span></a></p>
|
| 29 |
+
</div>
|
| 30 |
+
</header>
|
| 31 |
+
|
| 32 |
+
<main class="max-w-3xl mx-auto px-4 py-6">
|
| 33 |
+
{% block content %}{% endblock %}
|
| 34 |
+
</main>
|
| 35 |
+
|
| 36 |
+
<footer class="border-t border-gray-100 mt-12">
|
| 37 |
+
<div class="max-w-3xl mx-auto px-4 py-4 text-gray-400 text-xs">
|
| 38 |
+
<a href="https://huggingface.co/datasets/davanstrien/my-classified-papers" class="hover:text-gray-600">Data source</a>
|
| 39 |
+
<span class="mx-2">·</span>
|
| 40 |
+
<a href="https://huggingface.co/davanstrien/ModernBERT-base-is-new-arxiv-dataset" class="hover:text-gray-600">Model</a>
|
| 41 |
+
</div>
|
| 42 |
+
</footer>
|
| 43 |
+
</body>
|
| 44 |
+
</html>
|
templates/index.html
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{% extends "base.html" %}
|
| 2 |
+
|
| 3 |
+
{% block content %}
|
| 4 |
+
<div>
|
| 5 |
+
<!-- Powered by -->
|
| 6 |
+
<div class="text-xs text-gray-400 mb-4">
|
| 7 |
+
Vector search powered by <a href="https://lancedb.github.io/lance/" class="underline hover:text-gray-600">Lance</a>
|
| 8 |
+
· Updated weekly via <a href="https://huggingface.co/docs/hub/en/spaces-run-with-hf-jobs" class="underline hover:text-gray-600">HF Jobs</a>
|
| 9 |
+
· <a href="https://huggingface.co/datasets/librarian-bots/arxiv-cs-papers-lance" class="underline hover:text-gray-600">Dataset</a>
|
| 10 |
+
</div>
|
| 11 |
+
|
| 12 |
+
<!-- Stats - minimal -->
|
| 13 |
+
<div class="flex items-baseline gap-2 mb-6">
|
| 14 |
+
<span class="text-3xl font-semibold text-gray-900">{{ "{:,}".format(new_dataset_count) }}</span>
|
| 15 |
+
<span class="text-gray-500">papers with new datasets</span>
|
| 16 |
+
<span class="text-gray-400 text-sm ml-auto">from {{ "{:,}".format(total_papers) }} total</span>
|
| 17 |
+
</div>
|
| 18 |
+
|
| 19 |
+
<!-- Filters - sticky on scroll -->
|
| 20 |
+
<div class="sticky top-0 z-10 bg-white flex flex-wrap items-center gap-4 py-4 border-b border-gray-200 mb-6">
|
| 21 |
+
<!-- Search -->
|
| 22 |
+
<input type="search"
|
| 23 |
+
name="search"
|
| 24 |
+
id="search-input"
|
| 25 |
+
placeholder="Search..."
|
| 26 |
+
value="{{ search }}"
|
| 27 |
+
class="flex-1 min-w-48 px-3 py-2 border-b border-gray-300 bg-transparent focus:border-gray-900 focus:outline-none"
|
| 28 |
+
hx-get="/papers"
|
| 29 |
+
hx-trigger="input changed delay:500ms, keyup[key=='Enter'], histogramChange"
|
| 30 |
+
hx-target="#paper-list"
|
| 31 |
+
hx-include="#filter-form, #category-select, #confidence-filter, #since-filter, #sort-select, #search-type-toggle"
|
| 32 |
+
hx-indicator="#loading-indicator"
|
| 33 |
+
hx-push-url="true">
|
| 34 |
+
|
| 35 |
+
<!-- Search mode toggle -->
|
| 36 |
+
<div id="search-type-toggle" class="flex items-center gap-2 text-xs text-gray-500">
|
| 37 |
+
<label class="flex items-center gap-1 cursor-pointer">
|
| 38 |
+
<input type="radio" name="search_type" value="keyword" {% if search_type == 'keyword' %}checked{% endif %}
|
| 39 |
+
class="h-3 w-3"
|
| 40 |
+
hx-get="/papers"
|
| 41 |
+
hx-trigger="change"
|
| 42 |
+
hx-target="#paper-list"
|
| 43 |
+
hx-include="#filter-form, #search-input, #category-select, #confidence-filter, #since-filter, #sort-select"
|
| 44 |
+
hx-indicator="#loading-indicator"
|
| 45 |
+
hx-push-url="true">
|
| 46 |
+
<span>Keyword</span>
|
| 47 |
+
</label>
|
| 48 |
+
<label class="flex items-center gap-1 cursor-pointer">
|
| 49 |
+
<input type="radio" name="search_type" value="semantic" {% if search_type == 'semantic' %}checked{% endif %}
|
| 50 |
+
class="h-3 w-3"
|
| 51 |
+
hx-get="/papers"
|
| 52 |
+
hx-trigger="change"
|
| 53 |
+
hx-target="#paper-list"
|
| 54 |
+
hx-include="#filter-form, #search-input, #category-select, #confidence-filter, #since-filter, #sort-select"
|
| 55 |
+
hx-indicator="#loading-indicator"
|
| 56 |
+
hx-push-url="true">
|
| 57 |
+
<span>Semantic</span>
|
| 58 |
+
</label>
|
| 59 |
+
</div>
|
| 60 |
+
|
| 61 |
+
<!-- Category filter -->
|
| 62 |
+
<select name="category"
|
| 63 |
+
id="category-select"
|
| 64 |
+
class="px-3 py-2 border-b border-gray-300 bg-transparent focus:border-gray-900 focus:outline-none text-gray-700"
|
| 65 |
+
hx-get="/papers"
|
| 66 |
+
hx-trigger="change"
|
| 67 |
+
hx-target="#paper-list"
|
| 68 |
+
hx-include="#filter-form, #search-input, #confidence-filter, #since-filter, #sort-select, #search-type-toggle"
|
| 69 |
+
hx-indicator="#loading-indicator"
|
| 70 |
+
hx-push-url="true">
|
| 71 |
+
<option value="">All categories</option>
|
| 72 |
+
{% for cat in categories %}
|
| 73 |
+
<option value="{{ cat }}" {% if category == cat %}selected{% endif %}>{{ cat }}</option>
|
| 74 |
+
{% endfor %}
|
| 75 |
+
</select>
|
| 76 |
+
|
| 77 |
+
<!-- Confidence filter dropdown -->
|
| 78 |
+
<select name="min_confidence"
|
| 79 |
+
id="confidence-filter"
|
| 80 |
+
class="px-2 py-1 text-xs border-b border-gray-300 bg-transparent focus:border-gray-900 focus:outline-none text-gray-500 ml-auto"
|
| 81 |
+
hx-get="/papers"
|
| 82 |
+
hx-trigger="change"
|
| 83 |
+
hx-target="#paper-list"
|
| 84 |
+
hx-include="#filter-form, #search-input, #category-select, #since-filter, #sort-select, #search-type-toggle"
|
| 85 |
+
hx-indicator="#loading-indicator"
|
| 86 |
+
hx-push-url="true">
|
| 87 |
+
<option value="0.5" {% if min_confidence == '0.5' %}selected{% endif %}>New datasets only</option>
|
| 88 |
+
<option value="0.6" {% if min_confidence == '0.6' %}selected{% endif %}>Higher confidence</option>
|
| 89 |
+
<option value="0" {% if min_confidence == '0' %}selected{% endif %}>All papers</option>
|
| 90 |
+
</select>
|
| 91 |
+
|
| 92 |
+
<!-- Since filter dropdown -->
|
| 93 |
+
<select name="since"
|
| 94 |
+
id="since-filter"
|
| 95 |
+
class="px-2 py-1 text-xs border-b border-gray-300 bg-transparent focus:border-gray-900 focus:outline-none text-gray-500"
|
| 96 |
+
hx-get="/papers"
|
| 97 |
+
hx-trigger="change"
|
| 98 |
+
hx-target="#paper-list"
|
| 99 |
+
hx-include="#filter-form, #search-input, #category-select, #confidence-filter, #sort-select, #search-type-toggle"
|
| 100 |
+
hx-indicator="#loading-indicator"
|
| 101 |
+
hx-push-url="true">
|
| 102 |
+
<option value="" {% if not since %}selected{% endif %}>All time</option>
|
| 103 |
+
<option value="1m" {% if since == '1m' %}selected{% endif %}>Past month</option>
|
| 104 |
+
<option value="6m" {% if since == '6m' %}selected{% endif %}>Past 6 months</option>
|
| 105 |
+
<option value="1y" {% if since == '1y' %}selected{% endif %}>Past year</option>
|
| 106 |
+
</select>
|
| 107 |
+
|
| 108 |
+
<!-- Sort dropdown -->
|
| 109 |
+
<select name="sort"
|
| 110 |
+
id="sort-select"
|
| 111 |
+
class="px-2 py-1 text-xs border-b border-gray-300 bg-transparent focus:border-gray-900 focus:outline-none text-gray-500"
|
| 112 |
+
hx-get="/papers"
|
| 113 |
+
hx-trigger="change"
|
| 114 |
+
hx-target="#paper-list"
|
| 115 |
+
hx-include="#filter-form, #search-input, #category-select, #confidence-filter, #since-filter, #search-type-toggle"
|
| 116 |
+
hx-indicator="#loading-indicator"
|
| 117 |
+
hx-push-url="true">
|
| 118 |
+
<option value="date" {% if sort == 'date' %}selected{% endif %}>Newest first</option>
|
| 119 |
+
<option value="relevance" {% if sort == 'relevance' %}selected{% endif %}>Relevance</option>
|
| 120 |
+
</select>
|
| 121 |
+
|
| 122 |
+
<!-- Loading indicator - subtle -->
|
| 123 |
+
<span id="loading-indicator" class="htmx-indicator text-sm text-gray-400">Loading...</span>
|
| 124 |
+
|
| 125 |
+
<!-- Hidden form for hx-include -->
|
| 126 |
+
<form id="filter-form" class="hidden"></form>
|
| 127 |
+
</div>
|
| 128 |
+
|
| 129 |
+
<!-- Paper list -->
|
| 130 |
+
<div id="paper-list"
|
| 131 |
+
hx-get="/papers?{% if search %}search={{ search|urlencode }}&{% endif %}search_type={{ search_type }}&{% if category %}category={{ category|urlencode }}&{% endif %}min_confidence={{ min_confidence }}&{% if since %}since={{ since }}&{% endif %}sort={{ sort }}"
|
| 132 |
+
hx-trigger="load"
|
| 133 |
+
hx-indicator="#loading-indicator">
|
| 134 |
+
<div class="py-8 text-gray-400 text-sm">Loading papers...</div>
|
| 135 |
+
</div>
|
| 136 |
+
</div>
|
| 137 |
+
{% endblock %}
|
templates/partials/paper_card.html
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<article class="py-5 border-b border-gray-200">
|
| 2 |
+
<!-- Title with paper icon -->
|
| 3 |
+
<h3 class="text-lg font-semibold text-gray-900 leading-tight">
|
| 4 |
+
<a href="https://huggingface.co/papers/{{ paper.id }}"
|
| 5 |
+
target="_blank"
|
| 6 |
+
class="hover:text-blue-600 inline-flex items-start gap-2 group">
|
| 7 |
+
<svg class="w-5 h-5 mt-0.5 text-gray-400 group-hover:text-blue-500 flex-shrink-0" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
| 8 |
+
<path stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5" d="M9 12h6m-6 4h6m2 5H7a2 2 0 01-2-2V5a2 2 0 012-2h5.586a1 1 0 01.707.293l5.414 5.414a1 1 0 01.293.707V19a2 2 0 01-2 2z"></path>
|
| 9 |
+
</svg>
|
| 10 |
+
<span>{% if search %}{{ paper.title|highlight(search) }}{% else %}{{ paper.title }}{% endif %}</span>
|
| 11 |
+
</a>
|
| 12 |
+
</h3>
|
| 13 |
+
|
| 14 |
+
<!-- Meta info - inline with category badge -->
|
| 15 |
+
{% set category = paper.categories.split(' ')[0] if paper.categories else '' %}
|
| 16 |
+
{% set cat_colors = {
|
| 17 |
+
'cs.CV': 'bg-purple-100 text-purple-700',
|
| 18 |
+
'cs.AI': 'bg-blue-100 text-blue-700',
|
| 19 |
+
'cs.LG': 'bg-green-100 text-green-700',
|
| 20 |
+
'cs.CL': 'bg-orange-100 text-orange-700',
|
| 21 |
+
'cs.NE': 'bg-pink-100 text-pink-700'
|
| 22 |
+
} %}
|
| 23 |
+
{% set badge_class = cat_colors.get(category, 'bg-gray-100 text-gray-600') %}
|
| 24 |
+
<div class="mt-2 flex items-center gap-3 text-sm text-gray-500">
|
| 25 |
+
<span class="px-2 py-0.5 rounded-full text-xs font-medium {{ badge_class }}">{{ category }}</span>
|
| 26 |
+
<span>{{ paper.update_date.strftime('%Y-%m-%d') if paper.update_date else 'Unknown' }}</span>
|
| 27 |
+
{% if search_type == 'semantic' and paper.similarity_score is defined %}
|
| 28 |
+
<span class="text-blue-500 inline-flex items-center gap-1">
|
| 29 |
+
{{ "%.0f"|format(paper.similarity_score * 100) }}% match
|
| 30 |
+
<span class="cursor-help" title="How similar this paper is to your search query">
|
| 31 |
+
<svg class="w-3.5 h-3.5 text-blue-400" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
| 32 |
+
<circle cx="12" cy="12" r="10" stroke-width="1.5"></circle>
|
| 33 |
+
<path stroke-linecap="round" stroke-width="1.5" d="M12 16v-1m0-3a2 2 0 10-2-2"></path>
|
| 34 |
+
<circle cx="12" cy="16" r="0.5" fill="currentColor"></circle>
|
| 35 |
+
</svg>
|
| 36 |
+
</span>
|
| 37 |
+
</span>
|
| 38 |
+
<span class="text-gray-400 inline-flex items-center gap-1">
|
| 39 |
+
{{ "%.0f"|format(paper.confidence_score * 100) }}% conf.
|
| 40 |
+
<span class="cursor-help" title="Model confidence this paper introduces a new dataset">
|
| 41 |
+
<svg class="w-3.5 h-3.5 text-gray-300" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
| 42 |
+
<circle cx="12" cy="12" r="10" stroke-width="1.5"></circle>
|
| 43 |
+
<path stroke-linecap="round" stroke-width="1.5" d="M12 16v-1m0-3a2 2 0 10-2-2"></path>
|
| 44 |
+
<circle cx="12" cy="16" r="0.5" fill="currentColor"></circle>
|
| 45 |
+
</svg>
|
| 46 |
+
</span>
|
| 47 |
+
</span>
|
| 48 |
+
{% else %}
|
| 49 |
+
<span class="{% if paper.confidence_score < 0.8 %}text-gray-400{% else %}text-gray-500{% endif %} inline-flex items-center gap-1">
|
| 50 |
+
{{ "%.0f"|format(paper.confidence_score * 100) }}% conf.
|
| 51 |
+
<span class="cursor-help" title="Model confidence this paper introduces a new dataset">
|
| 52 |
+
<svg class="w-3.5 h-3.5 {% if paper.confidence_score < 0.8 %}text-gray-300{% else %}text-gray-400{% endif %}" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
| 53 |
+
<circle cx="12" cy="12" r="10" stroke-width="1.5"></circle>
|
| 54 |
+
<path stroke-linecap="round" stroke-width="1.5" d="M12 16v-1m0-3a2 2 0 10-2-2"></path>
|
| 55 |
+
<circle cx="12" cy="16" r="0.5" fill="currentColor"></circle>
|
| 56 |
+
</svg>
|
| 57 |
+
</span>
|
| 58 |
+
</span>
|
| 59 |
+
{% endif %}
|
| 60 |
+
</div>
|
| 61 |
+
|
| 62 |
+
<!-- Abstract (truncated) -->
|
| 63 |
+
<p class="mt-2 text-gray-600 text-sm leading-relaxed">
|
| 64 |
+
{% if search %}
|
| 65 |
+
{{ paper.abstract[:400]|highlight(search) }}{% if paper.abstract|length > 400 %}...{% endif %}
|
| 66 |
+
{% else %}
|
| 67 |
+
{{ paper.abstract[:400] }}{% if paper.abstract|length > 400 %}...{% endif %}
|
| 68 |
+
{% endif %}
|
| 69 |
+
</p>
|
| 70 |
+
</article>
|
templates/partials/paper_list.html
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!-- Paper count - subtle -->
|
| 2 |
+
<div class="text-xs text-gray-400 mb-4">
|
| 3 |
+
{{ "{:,}".format(total_filtered) }} results{% if search %} for "{{ search }}"{% if search_type == 'semantic' %} <span class="text-blue-400">(semantic)</span>{% endif %}{% endif %}{% if category %} in {{ category }}{% endif %}
|
| 4 |
+
</div>
|
| 5 |
+
|
| 6 |
+
<!-- Paper cards -->
|
| 7 |
+
<div>
|
| 8 |
+
{% for paper in papers %}
|
| 9 |
+
{% include "partials/paper_card.html" %}
|
| 10 |
+
{% endfor %}
|
| 11 |
+
</div>
|
| 12 |
+
|
| 13 |
+
{% if papers|length == 0 %}
|
| 14 |
+
<div class="py-12 text-gray-400 text-sm">
|
| 15 |
+
No papers found. Try adjusting your filters.
|
| 16 |
+
</div>
|
| 17 |
+
{% endif %}
|
| 18 |
+
|
| 19 |
+
<!-- Infinite scroll trigger -->
|
| 20 |
+
{% if has_more %}
{# URL of the next page, shared by the HTMX trigger and the no-JS fallback.
   Free-text values (category, search) are URL-encoded so characters such as
   "&", "#" or "+" in a query cannot corrupt the query string. #}
{% set next_page_url %}/papers?page={{ page + 1 }}&category={{ category|urlencode }}&search={{ search|urlencode }}&min_confidence={{ min_confidence }}&search_type={{ search_type }}&sort={{ sort }}&since={{ since }}{% endset %}
<div hx-get="{{ next_page_url }}"
     hx-trigger="revealed"
     hx-swap="outerHTML"
     class="py-6 text-center text-xs text-gray-400">
    <noscript>
        {# Carry the active filters so the fallback link loads the same result set. #}
        <a href="{{ next_page_url }}" class="hover:text-gray-600">Load more</a>
    </noscript>
</div>
{% else %}
<div class="py-6 text-center text-xs text-gray-300">
    End of results
</div>
{% endif %}
|