davanstrien HF Staff commited on
Commit
4cf63e7
·
verified ·
1 Parent(s): fc9caf8

Upload folder using huggingface_hub

Browse files
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ .env
2
+ data/
Dockerfile ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

WORKDIR /app

# Enable HF transfer for faster downloads
ENV HF_HUB_ENABLE_HF_TRANSFER=1

# Install uv for fast dependency management
RUN pip install --no-cache-dir uv

# Copy and install dependencies
COPY requirements.txt .
RUN uv pip install --system --no-cache -r requirements.txt

# Copy application
COPY . .

# Create non-root user for security. The app downloads the Lance dataset
# into ./data at runtime (see app.py get_lance_dataset), so that directory
# must exist and be writable by the non-root user — /app itself is root-owned.
RUN useradd -m -u 1000 user \
    && mkdir -p /app/data \
    && chown -R user:user /app/data
USER user

# HF Spaces expects port 7860
ENV PORT=7860
EXPOSE 7860

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,13 +1,33 @@
1
  ---
2
- title: New Datasets in Machine Learning
3
- emoji: 💽
4
- colorFrom: pink
5
- colorTo: red
6
- sdk: gradio
7
- sdk_version: 4.36.1
8
- app_file: app.py
9
  pinned: false
10
- python_version: 3.11.1
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: ArXiv New ML Datasets
3
+ emoji: 📚
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: docker
 
 
7
  pinned: false
8
+ license: mit
9
  ---
10
 
11
+ # ArXiv New ML Datasets
12
+
13
+ Browse **1.1M+ CS papers** from arXiv, with **50,000+ classified** as introducing new machine learning datasets.
14
+
15
+ ## Features
16
+
17
+ - **Keyword search** - Search titles and abstracts
18
+ - **Semantic search** - Find conceptually similar papers using vector embeddings
19
+ - **Filter** by arXiv category (cs.AI, cs.CV, cs.LG, etc.)
20
+ - **Infinite scroll** for smooth browsing
21
+ - Links to arXiv, PDF, and HF Papers
22
+
23
+ ## Data Source
24
+
25
+ Papers classified using [ModernBERT](https://huggingface.co/davanstrien/ModernBERT-base-is-new-arxiv-dataset). Embeddings from [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5).
26
+
27
+ Data from [librarian-bots/arxiv-cs-papers-lance](https://huggingface.co/datasets/librarian-bots/arxiv-cs-papers-lance). Updated weekly.
28
+
29
+ ## Tech Stack
30
+
31
+ - **Backend**: FastAPI + Polars + Lance
32
+ - **Frontend**: HTMX + Tailwind CSS
33
+ - **Vector Search**: Lance with IVF_PQ index
app.py CHANGED
@@ -1,205 +1,512 @@
1
- import os
 
 
 
2
 
3
- import arxiv
4
- import gradio as gr
5
- import pandas as pd
6
- from apscheduler.schedulers.background import BackgroundScheduler
7
- from cachetools import TTLCache, cached
8
- from setfit import SetFitModel
9
- from tqdm.auto import tqdm
10
- import stamina
11
- from arxiv import UnexpectedEmptyPageError, ArxivError
12
 
13
- os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 
 
 
 
 
 
 
 
 
14
 
15
- CACHE_TIME = 60 * 60 * 12 # 12 hours
16
- MAX_RESULTS = 300
17
 
 
 
 
18
 
19
- client = arxiv.Client(page_size=50, delay_seconds=3, num_retries=2)
20
 
 
 
 
 
21
 
22
- @cached(cache=TTLCache(maxsize=10, ttl=CACHE_TIME))
23
- def get_arxiv_result():
24
- return _get_arxiv_result()
25
 
 
 
 
 
 
 
 
26
 
27
- @stamina.retry(
28
- on=(ValueError, UnexpectedEmptyPageError, ArxivError), attempts=10, wait_max=60 * 15
29
- )
30
- def _get_arxiv_result():
31
- results = [
32
- {
33
- "title": result.title,
34
- "abstract": result.summary,
35
- "url": result.entry_id,
36
- "category": result.primary_category,
37
- "updated": result.updated,
38
- }
39
- for result in tqdm(
40
- client.results(
41
- arxiv.Search(
42
- query="ti:dataset",
43
- max_results=MAX_RESULTS,
44
- sort_by=arxiv.SortCriterion.SubmittedDate,
45
- )
46
- ),
47
- total=MAX_RESULTS,
 
 
 
 
 
 
48
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  ]
50
- if len(results) > 1:
51
- return results
52
- else:
53
- raise ValueError("No results found")
54
- # return [
55
- # {
56
- # "title": result.title,
57
- # "abstract": result.summary,
58
- # "url": result.entry_id,
59
- # "category": result.primary_category,
60
- # "updated": result.updated,
61
- # }
62
- # for result in tqdm(search.results(), total=MAX_RESULTS)
63
- # ]
64
-
65
-
66
- def load_model():
67
- return SetFitModel.from_pretrained("librarian-bots/is_new_dataset_teacher_model")
68
-
69
-
70
- def format_row_for_model(row):
71
- return f"TITLE: {row['title']} \n\nABSTRACT: {row['abstract']}"
72
-
73
-
74
- int2label = {0: "new_dataset", 1: "not_new_dataset"}
75
-
76
-
77
- def get_predictions(data: list[dict], model=None, batch_size=128):
78
- if model is None:
79
- model = load_model()
80
- predictions = []
81
- for i in tqdm(range(0, len(data), batch_size)):
82
- batch = data[i : i + batch_size]
83
- text_inputs = [format_row_for_model(row) for row in batch]
84
- batch_predictions = model.predict_proba(text_inputs)
85
- for j, row in enumerate(batch):
86
- prediction = batch_predictions[j]
87
- row["prediction"] = int2label[int(prediction.argmax())]
88
- row["probability"] = float(prediction.max())
89
- predictions.append(row)
90
- return predictions
91
-
92
-
93
- def create_markdown(row):
94
- title = row["title"]
95
- abstract = row["abstract"]
96
- arxiv_id = row["arxiv_id"]
97
- hub_paper_url = f"https://huggingface.co/papers/{arxiv_id}"
98
- updated = row["updated"]
99
- updated = updated.strftime("%Y-%m-%d")
100
- broad_category = row["broad_category"]
101
- category = row["category"]
102
- return f""" <h2> {title} </h2> Updated: {updated}
103
- | Category: {broad_category} | Subcategory: {category} |
104
- \n\n{abstract}
105
- \n\n [Hugging Face Papers page]({hub_paper_url})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  """
 
 
 
 
 
 
 
 
 
 
 
 
107
 
 
 
 
 
108
 
109
- @cached(cache=TTLCache(maxsize=10, ttl=CACHE_TIME))
110
- def prepare_data():
111
- print("Downloading arxiv results...")
112
- arxiv_results = get_arxiv_result()
113
- print("loading model...")
114
- model = load_model()
115
- print("Making predictions...")
116
- predictions = get_predictions(arxiv_results, model=model)
117
- df = pd.DataFrame(predictions)
118
- df.loc[:, "arxiv_id"] = df["url"].str.extract(r"(\d+\.\d+)")
119
- df.loc[:, "broad_category"] = df["category"].str.split(".").str[0]
120
- df.loc[:, "markdown"] = df.apply(create_markdown, axis=1)
121
  return df
122
 
123
 
124
- all_possible_arxiv_categories = sorted(prepare_data().category.unique().tolist())
125
- broad_categories = sorted(prepare_data().broad_category.unique().tolist())
126
-
127
-
128
- # @list_cacheable
129
- def create_markdown_summary(categories=None, new_only=True, narrow_categories=None):
130
- df = prepare_data()
131
- if new_only:
132
- df = df[df["prediction"] == "new_dataset"]
133
- if narrow_categories is not None:
134
- df = df[df["category"].isin(narrow_categories)]
135
- if categories is not None and not narrow_categories:
136
- df = prepare_data()
137
- if new_only:
138
- df = df[df["prediction"] == "new_dataset"]
139
- df = df[df["broad_category"].isin(categories)]
140
- number_of_results = len(df)
141
- results = (
142
- "<h1 style='text-align: center'> arXiv papers related to datasets</h1> \n\n"
143
- )
144
- results += f"Number of results: {number_of_results}\n\n"
145
- results += "\n\n<br>".join(df["markdown"].tolist())
146
- return results
 
 
147
 
148
 
149
- scheduler = BackgroundScheduler()
150
- scheduler.add_job(prepare_data, "cron", hour=3, minute=30)
151
- scheduler.start()
 
 
 
 
 
152
 
153
- description = """This Space shows recent papers on arXiv that are *likely* to be papers introducing new datasets related to machine learning. \n\n
154
- The Space works by:
155
- - searching for papers on arXiv with the term `dataset` in the title + "machine learning" in the abstract
156
- - passing the abstract and title of the papers to a machine learning model that predicts if the paper is introducing a new dataset or not
157
-
158
- This Space is a work in progress. The model is not perfect, and the search query is not perfect. If you have suggestions for how to improve this Space, please open a Discussion.\n\n"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
 
 
 
 
 
 
 
 
160
 
161
- with gr.Blocks() as demo:
162
- gr.Markdown(
163
- "<h1 style='text-align: center'> &#x2728;New Datasets in Machine Learning "
164
- " &#x2728; </h1>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  )
166
- gr.Markdown(description)
167
- with gr.Row():
168
- broad_categories = gr.Dropdown(
169
- choices=broad_categories,
170
- label="Broad arXiv Category",
171
- multiselect=True,
172
- value="cs",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  )
174
- with gr.Accordion("Advanced Options", open=False):
175
- gr.Markdown(
176
- "Narrow by arXiv categories. **Note** this will take precedence over the"
177
- " broad category selection."
178
  )
179
- narrow_categories = gr.Dropdown(
180
- choices=all_possible_arxiv_categories,
181
- value=None,
182
- multiselect=True,
183
- label="Narrow arXiv Category",
 
 
 
 
184
  )
185
- gr.ClearButton(narrow_categories, "Clear Narrow Categories", size="sm")
186
- with gr.Row():
187
- new_only = gr.Checkbox(True, label="New Datasets Only", interactive=True)
188
- results = gr.Markdown(create_markdown_summary())
189
- broad_categories.change(
190
- create_markdown_summary,
191
- inputs=[broad_categories, new_only, narrow_categories],
192
- outputs=results,
193
- )
194
- narrow_categories.change(
195
- create_markdown_summary,
196
- inputs=[broad_categories, new_only, narrow_categories],
197
- outputs=results,
198
- )
199
- new_only.change(
200
- create_markdown_summary,
201
- [broad_categories, new_only, narrow_categories],
202
- results,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
 
205
- demo.launch()
 
1
+ """
2
+ FastAPI + HTMX app for browsing arxiv papers with new ML datasets.
3
+ Downloads Lance dataset from HuggingFace Hub and loads locally.
4
+ """
5
 
6
+ import re
7
+ from datetime import date, timedelta
8
+ from functools import lru_cache
9
+ from typing import Optional
10
+ from urllib.parse import urlencode
 
 
 
 
11
 
12
+ import lance
13
+ import polars as pl
14
+ from cachetools import TTLCache
15
+ from dotenv import load_dotenv
16
+ from fastapi import FastAPI, Query, Request
17
+ from fastapi.responses import HTMLResponse, RedirectResponse
18
+ from fastapi.staticfiles import StaticFiles
19
+ from fastapi.templating import Jinja2Templates
20
+ from huggingface_hub import snapshot_download
21
+ from markupsafe import Markup
22
 
23
+ # Load .env file for local development (HF_TOKEN)
24
+ load_dotenv()
25
 
26
+ app = FastAPI(title="ArXiv New ML Datasets")
27
+ app.mount("/static", StaticFiles(directory="static"), name="static")
28
+ templates = Jinja2Templates(directory="templates")
29
 
 
30
 
31
def highlight_search(text: str, search: str) -> Markup:
    """Highlight occurrences of *search* in *text* with a yellow <mark> span.

    The surrounding text is always HTML-escaped so abstracts cannot inject
    markup (the returned Markup bypasses Jinja autoescaping). Matching is
    done on the *raw* text before escaping, so search terms containing
    characters like '&' or '<' still match, and terms such as "amp" cannot
    accidentally match inside escaped entities like "&amp;".
    """
    # Local import mirrors the original module layout.
    import html

    if not text:
        return Markup("")
    raw = str(text)
    if not search:
        return Markup(html.escape(raw))

    # Capture group makes re.split interleave matches at odd indices.
    pattern = re.compile(f"({re.escape(search)})", re.IGNORECASE)
    pieces = []
    for i, piece in enumerate(pattern.split(raw)):
        escaped = html.escape(piece)
        if i % 2:  # odd index -> a matched search term
            pieces.append(
                f'<mark class="bg-yellow-200 px-0.5 rounded">{escaped}</mark>'
            )
        else:
            pieces.append(escaped)
    return Markup("".join(pieces))
47
 
48
+
49
+ # Register custom filter
50
+ templates.env.filters["highlight"] = highlight_search
51
+
52
+ # Dataset config
53
+ DATASET_REPO = "librarian-bots/arxiv-cs-papers-lance"
54
+
55
+ # Cache for dataset (reload every 6 hours)
56
+ _dataset_cache: TTLCache = TTLCache(maxsize=1, ttl=60 * 60 * 6)
57
+
58
+ # Cache for Lance dataset connection (for vector search)
59
+ _lance_cache: dict = {}
60
+
61
+ # Cache for embedding model (lazy loaded on first semantic search)
62
+ _model_cache: dict = {}
63
+
64
+
65
def get_lance_dataset():
    """Return the Lance dataset handle, downloading it from HF Hub once.

    The open dataset is memoized in ``_lance_cache`` under the "ds" key.
    """
    dataset = _lance_cache.get("ds")
    if dataset is None:
        # local_dir yields real files instead of symlinks; Lance needs real files.
        local_dir = "./data/arxiv-lance"
        print(f"Downloading dataset from {DATASET_REPO} to {local_dir}...")
        snapshot_download(
            DATASET_REPO,
            repo_type="dataset",
            local_dir=local_dir,
        )
        lance_path = f"{local_dir}/data/train.lance"
        print(f"Loading Lance dataset from {lance_path}")
        dataset = lance.dataset(lance_path)
        _lance_cache["ds"] = dataset
    return dataset
80
+
81
+
82
def get_embedding_model():
    """Return the sentence-embedding model, loading it lazily on first use."""
    model = _model_cache.get("model")
    if model is None:
        # Deferred import: the heavy dependency is only needed for semantic search.
        from sentence_transformers import SentenceTransformer

        print("Loading embedding model...")
        model = SentenceTransformer("BAAI/bge-base-en-v1.5")
        _model_cache["model"] = model
        print("Embedding model loaded!")
    return model
90
+
91
+
92
def get_dataframe() -> pl.DataFrame:
    """Return the papers table as a Polars DataFrame (TTL-cached, 6h)."""
    try:
        return _dataset_cache["df"]
    except KeyError:
        pass

    ds = get_lance_dataset()  # triggers the HF Hub download when needed
    # Embedding columns are deliberately excluded to keep memory usage down.
    wanted = [
        "id", "title", "abstract", "categories", "update_date",
        "authors", "is_new_dataset", "confidence_score",
    ]
    frame = pl.from_arrow(ds.to_table(columns=wanted))
    _dataset_cache["df"] = frame
    print(f"Loaded {len(frame):,} papers")
    return frame
109
+
110
+
111
@lru_cache(maxsize=1)
def get_categories() -> list[str]:
    """Return the common ML arXiv categories that actually occur in the data.

    The primary category is the first space-separated token of each paper's
    ``categories`` field; the result is restricted to a curated ML list.
    """
    present = set(
        get_dataframe()
        .select(pl.col("categories").str.split(" ").list.first().alias("cat"))
        .unique()
        .sort("cat")
        .to_series()
        .to_list()
    )
    wanted = ["cs.AI", "cs.CL", "cs.CV", "cs.LG", "cs.NE", "cs.IR", "cs.RO", "stat.ML"]
    return [cat for cat in wanted if cat in present]
126
+
127
+
128
@lru_cache(maxsize=1)
def get_confidence_counts() -> dict[str, int]:
    """Count new-dataset papers at each confidence threshold.

    Thresholds chosen from the observed distribution (avg ~70% confidence);
    used by the Tufte-style confidence filter.
    """
    candidates = get_dataframe().filter(pl.col("is_new_dataset"))
    return {
        str(threshold): candidates.filter(
            pl.col("confidence_score") >= threshold
        ).height
        for threshold in (0.5, 0.6, 0.65, 0.7, 0.71)
    }
141
+
142
+
143
@lru_cache(maxsize=1)
def get_histogram_data() -> dict:
    """Build confidence-distribution data for the histogram display.

    The bin range is derived from the actual data (snapped to 5% steps and
    widened to at least a 25% span). Each bin carries total and new-dataset
    counts plus heights normalized to the tallest bin; the 50% line marks
    the classifier's prediction boundary.

    Returns a dict with "bins", "min_pct", "max_pct", "total_papers", and
    "new_dataset_count".
    """
    df = get_dataframe()
    all_papers = df.select("confidence_score", "is_new_dataset")

    # Derive the display range from the data, rounded to 5% boundaries.
    actual_min = float(all_papers["confidence_score"].min())
    actual_max = float(all_papers["confidence_score"].max())
    min_pct = max(0, (int(actual_min * 20) / 20))  # floor to 5%
    max_pct = min(1, ((int(actual_max * 20) + 1) / 20))  # ceil to 5%

    # Ensure a minimum 25% span so the chart stays usable.
    if max_pct - min_pct < 0.25:
        center = (min_pct + max_pct) / 2
        min_pct = max(0, center - 0.125)
        max_pct = min(1, center + 0.125)

    # 25 bins gives good granularity.
    num_bins = 25
    bin_width = (max_pct - min_pct) / num_bins

    bins = []
    for i in range(num_bins):
        bin_start = min_pct + i * bin_width
        bin_end = min_pct + (i + 1) * bin_width

        # Filter the table once per bin, then count the new-dataset subset
        # within it — previously this ran two full-table scans per bin.
        in_bin = all_papers.filter(
            (pl.col("confidence_score") >= bin_start)
            & (pl.col("confidence_score") < bin_end)
        )
        count = in_bin.height
        new_dataset_count = in_bin.filter(pl.col("is_new_dataset")).height

        bins.append({
            "bin_start": round(bin_start, 3),
            "bin_end": round(bin_end, 3),
            "bin_pct": int(bin_start * 100),
            "count": count,
            "new_dataset_count": new_dataset_count,
        })

    # Normalize bar heights so the tallest bin renders at 100%.
    max_count = max(b["count"] for b in bins) if bins else 1
    for b in bins:
        b["height_pct"] = int((b["count"] / max_count) * 100) if max_count > 0 else 0
        b["new_height_pct"] = int((b["new_dataset_count"] / max_count) * 100) if max_count > 0 else 0

    # Cumulative count of papers at or above each bin's start threshold.
    total_so_far = all_papers.height
    for b in bins:
        b["papers_above"] = total_so_far
        total_so_far -= b["count"]

    return {
        "bins": bins,
        "min_pct": round(min_pct, 2),
        "max_pct": round(max_pct, 2),
        "total_papers": all_papers.height,
        "new_dataset_count": all_papers.filter(pl.col("is_new_dataset")).height,
    }
220
+
221
+
222
def parse_since(since: str) -> Optional[date]:
    """Translate a relative-time token into a cutoff date.

    Recognized tokens: "1m" (30 days), "6m" (180 days), "1y" (365 days).
    Empty or unrecognized input means "all time" and yields None.
    """
    days = {"1m": 30, "6m": 180, "1y": 365}.get(since or "")
    if days is None:
        return None
    return date.today() - timedelta(days=days)
234
+
235
+
236
def filter_papers(
    df: pl.DataFrame,
    category: Optional[str] = None,
    search: Optional[str] = None,
    min_confidence: float = 0.5,
    since: Optional[str] = None,
) -> pl.DataFrame:
    """Apply the active filters to the papers dataframe.

    The confidence threshold controls which papers are shown:
    - Papers with is_new_dataset=True have confidence >= 0.5
    - A threshold of 0 shows all papers
    - A threshold >= 0.5 effectively shows only new_dataset papers
    """
    predicates: list[pl.Expr] = []

    if min_confidence > 0:
        predicates.append(pl.col("confidence_score") >= min_confidence)

    if category:
        predicates.append(pl.col("categories").str.contains(category))

    if search:
        needle = search.lower()
        predicates.append(
            pl.col("title").str.to_lowercase().str.contains(needle)
            | pl.col("abstract").str.to_lowercase().str.contains(needle)
        )

    # Relative-date cutoff ("1m"/"6m"/"1y"); None means all time.
    cutoff = parse_since(since)
    if cutoff:
        predicates.append(pl.col("update_date") >= cutoff)

    for predicate in predicates:
        df = df.filter(predicate)
    return df
269
 
270
 
271
def paginate_papers(
    df: pl.DataFrame,
    page: int = 1,
    per_page: int = 20,
    sort: str = "date",
) -> tuple[pl.DataFrame, bool]:
    """Sort and paginate papers; return (page_df, has_more).

    Sort options:
    - "date": update_date desc, then confidence_score desc
    - "relevance": keep the incoming order (semantic search pre-sorts
      by similarity)
    """
    if sort == "date":
        ordered = df.sort(["update_date", "confidence_score"], descending=[True, True])
    else:
        ordered = df  # already ordered by similarity for semantic results

    offset = (page - 1) * per_page
    # Fetch one extra row to detect whether a further page exists.
    window = ordered.slice(offset, per_page + 1)
    return window.head(per_page), len(window) > per_page
296
 
297
 
298
def semantic_search(
    query: str,
    k: int = 100,
    category: Optional[str] = None,
    min_confidence: float = 0.5,
    since: Optional[str] = None,
) -> pl.DataFrame:
    """Vector-similarity search via Lance nearest-neighbor.

    Returns a DataFrame with a similarity_score column (0-1, higher is
    more similar), ordered by similarity.
    """
    embedding = get_embedding_model().encode(query).tolist()
    dataset = get_lance_dataset()

    # Assemble an SQL-style pre-filter (Lance accepts SQL-like syntax).
    clauses = []
    if min_confidence > 0:
        clauses.append(f"confidence_score >= {min_confidence}")
    if category:
        # Double up single quotes so the category is safe inside the literal.
        escaped = category.replace("'", "''")
        clauses.append(f"categories LIKE '%{escaped}%'")
    cutoff = parse_since(since)
    if cutoff:
        # Lance/DataFusion needs an explicit TIMESTAMP literal for dates.
        clauses.append(f"update_date >= TIMESTAMP '{cutoff.isoformat()} 00:00:00'")
    where = " AND ".join(clauses) if clauses else None

    # Nearest-neighbor scan; _distance is needed for similarity conversion.
    table = dataset.scanner(
        nearest={"column": "embedding", "q": embedding, "k": k},
        filter=where,
        columns=["id", "title", "abstract", "categories", "update_date",
                 "authors", "confidence_score", "_distance"]
    ).to_table()

    # BGE embeddings are L2-normalized, so the L2 distance lies in [0, 2];
    # map it onto a 0-1 similarity score.
    return (
        pl.from_arrow(table)
        .with_columns(
            (1 - pl.col("_distance") / 2).clip(0, 1).alias("similarity_score")
        )
        .drop("_distance")
    )
346
+
347
+
348
@app.get("/", response_class=HTMLResponse)
async def home(
    request: Request,
    search: Optional[str] = Query(None),
    search_type: str = Query("keyword"),
    category: Optional[str] = Query(None),
    min_confidence: str = Query("0.5"),  # string so the template sees the exact value
    since: Optional[str] = Query(None),
    sort: str = Query("date"),
):
    """Render the home page, seeding initial filter state from the URL."""
    df = get_dataframe()

    context = {
        "request": request,
        "categories": get_categories(),
        "total_papers": len(df),
        "new_dataset_count": df.filter(pl.col("is_new_dataset")).height,
        "histogram_data": get_histogram_data(),
        # Echo the filters back so a shared URL restores the same view.
        "search": search or "",
        "search_type": search_type,
        "category": category or "",
        "min_confidence": min_confidence,
        "since": since or "",
        "sort": sort,
    }
    return templates.TemplateResponse("index.html", context)
384
+
385
+
386
@app.get("/papers", response_class=HTMLResponse)
async def get_papers(
    request: Request,
    page: int = Query(1, ge=1),
    per_page: int = Query(20, ge=1, le=100),
    category: Optional[str] = Query(None),
    search: Optional[str] = Query(None),
    min_confidence: float = Query(0.5, ge=0, le=1),
    search_type: str = Query("keyword"),  # "keyword" or "semantic"
    sort: str = Query("date"),  # "date" or "relevance"
    since: Optional[str] = Query(None),  # "1m", "6m", "1y", or None for all
):
    """Return a paginated, filtered paper list as an HTMX HTML partial.

    Direct (non-HTMX) browser visits are redirected to "/" with the same
    query string, since this endpoint only renders partials.
    """
    if "HX-Request" not in request.headers:
        qs = str(request.url.query)
        return RedirectResponse(url=f"/?{qs}" if qs else "/", status_code=302)

    if search and search_type == "semantic":
        # Vector search returns results already ordered by similarity.
        filtered_df = semantic_search(
            query=search,
            k=per_page * 5,  # over-fetch so later pages still have data
            category=category,
            min_confidence=min_confidence,
            since=since,
        )
        # Allow explicit date sort; anything else falls back to relevance.
        effective_sort = "date" if sort == "date" else "relevance"
        page_df, has_more = paginate_papers(
            filtered_df, page=page, per_page=per_page, sort=effective_sort
        )
    else:
        # Keyword path: filter the cached dataframe, always sorted by date.
        filtered_df = filter_papers(
            get_dataframe(),
            category=category,
            search=search,
            min_confidence=min_confidence,
            since=since,
        )
        page_df, has_more = paginate_papers(
            filtered_df, page=page, per_page=per_page, sort="date"
        )

    # Build a short canonical URL ("/" not "/papers") for browser history;
    # defaults are omitted to keep it compact.
    params = {}
    if search:
        params["search"] = search
    if search_type != "keyword":
        params["search_type"] = search_type
    if category:
        params["category"] = category
    if min_confidence != 0.5:
        params["min_confidence"] = min_confidence
    if since:
        params["since"] = since
    if sort != "date":
        params["sort"] = sort
    push_url = "/?" + urlencode(params) if params else "/"

    response = templates.TemplateResponse(
        "partials/paper_list.html",
        {
            "request": request,
            "papers": page_df.to_dicts(),
            "page": page,
            "has_more": has_more,
            "category": category or "",
            "search": search or "",
            "min_confidence": min_confidence,
            "search_type": search_type,
            "sort": sort,
            "since": since or "",
            "total_filtered": len(filtered_df),
        },
    )
    # Instruct HTMX to push the clean URL into history.
    response.headers["HX-Push-Url"] = push_url
    return response
477
+
478
+
479
@app.get("/api/stats")
async def get_stats():
    """Expose dataset summary statistics as JSON."""
    df = get_dataframe()
    flagged = df.filter(pl.col("is_new_dataset"))
    dates = df["update_date"]
    return {
        "total_papers": len(df),
        "new_dataset_count": len(flagged),
        "avg_confidence": float(df["confidence_score"].mean()),
        "date_range": {
            "min": str(dates.min()),
            "max": str(dates.max()),
        },
    }
495
+
496
+
497
# Warm caches at boot so the first request doesn't pay the load cost.
# NOTE(review): @app.on_event is deprecated in newer FastAPI in favor of
# lifespan handlers — worth migrating eventually.
@app.on_event("startup")
async def startup_event():
    """Preload dataset and embedding model on startup."""
    print("Preloading dataset...")
    get_dataframe()
    print("Dataset loaded!")
    print("Preloading embedding model...")
    get_embedding_model()
    print("Embedding model loaded!")
507
+
508
+
509
if __name__ == "__main__":
    # Local-dev entry point; in Docker, uvicorn is launched via CMD instead.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
requirements.txt CHANGED
@@ -1,354 +1,11 @@
1
- # This file was autogenerated by uv via the following command:
2
- # uv pip compile requirements.in -o requirements.txt
3
- aiofiles==23.2.1
4
- # via gradio
5
- aiohttp==3.9.5
6
- # via
7
- # datasets
8
- # fsspec
9
- aiosignal==1.3.1
10
- # via aiohttp
11
- altair==5.3.0
12
- # via gradio
13
- annotated-types==0.7.0
14
- # via pydantic
15
- anyio==4.4.0
16
- # via
17
- # httpx
18
- # starlette
19
- # watchfiles
20
- apscheduler==3.10.4
21
- # via -r requirements.in
22
- arxiv==2.1.0
23
- # via -r requirements.in
24
- attrs==23.2.0
25
- # via
26
- # aiohttp
27
- # jsonschema
28
- # referencing
29
- cachetools==5.3.3
30
- # via -r requirements.in
31
- certifi==2024.6.2
32
- # via
33
- # httpcore
34
- # httpx
35
- # requests
36
- charset-normalizer==3.3.2
37
- # via requests
38
- click==8.1.7
39
- # via
40
- # typer
41
- # uvicorn
42
- contourpy==1.2.1
43
- # via matplotlib
44
- cycler==0.12.1
45
- # via matplotlib
46
- datasets==2.14.4
47
- # via
48
- # evaluate
49
- # setfit
50
- dill==0.3.7
51
- # via
52
- # datasets
53
- # evaluate
54
- # multiprocess
55
- dnspython==2.6.1
56
- # via email-validator
57
- email-validator==2.1.1
58
- # via fastapi
59
- evaluate==0.4.2
60
- # via setfit
61
- fastapi==0.111.0
62
- # via gradio
63
- fastapi-cli==0.0.4
64
- # via fastapi
65
- feedparser==6.0.10
66
- # via arxiv
67
- ffmpy==0.3.2
68
- # via gradio
69
- filelock==3.14.0
70
- # via
71
- # huggingface-hub
72
- # torch
73
- # transformers
74
- fonttools==4.53.0
75
- # via matplotlib
76
- frozenlist==1.4.1
77
- # via
78
- # aiohttp
79
- # aiosignal
80
- fsspec==2024.6.0
81
- # via
82
- # datasets
83
- # evaluate
84
- # gradio-client
85
- # huggingface-hub
86
- # torch
87
- gradio==4.36.1
88
- # via -r requirements.in
89
- gradio-client==1.0.1
90
- # via gradio
91
- h11==0.14.0
92
- # via
93
- # httpcore
94
- # uvicorn
95
- hf-transfer==0.1.6
96
- # via -r requirements.in
97
- httpcore==1.0.5
98
- # via httpx
99
- httptools==0.6.1
100
- # via uvicorn
101
- httpx==0.27.0
102
- # via
103
- # fastapi
104
- # gradio
105
- # gradio-client
106
- huggingface-hub==0.23.3
107
- # via
108
- # datasets
109
- # evaluate
110
- # gradio
111
- # gradio-client
112
- # sentence-transformers
113
- # setfit
114
- # tokenizers
115
- # transformers
116
- idna==3.7
117
- # via
118
- # anyio
119
- # email-validator
120
- # httpx
121
- # requests
122
- # yarl
123
- importlib-resources==6.4.0
124
- # via gradio
125
- jinja2==3.1.4
126
- # via
127
- # altair
128
- # fastapi
129
- # gradio
130
- # torch
131
- joblib==1.4.2
132
- # via scikit-learn
133
- jsonschema==4.22.0
134
- # via altair
135
- jsonschema-specifications==2023.12.1
136
- # via jsonschema
137
- kiwisolver==1.4.5
138
- # via matplotlib
139
- markdown-it-py==3.0.0
140
- # via rich
141
- markupsafe==2.1.5
142
- # via
143
- # gradio
144
- # jinja2
145
- matplotlib==3.9.0
146
- # via gradio
147
- mdurl==0.1.2
148
- # via markdown-it-py
149
- mpmath==1.3.0
150
- # via sympy
151
- multidict==6.0.5
152
- # via
153
- # aiohttp
154
- # yarl
155
- multiprocess==0.70.15
156
- # via
157
- # datasets
158
- # evaluate
159
- networkx==3.3
160
- # via torch
161
- numpy==1.26.4
162
- # via
163
- # altair
164
- # contourpy
165
- # datasets
166
- # evaluate
167
- # gradio
168
- # matplotlib
169
- # pandas
170
- # pyarrow
171
- # scikit-learn
172
- # scipy
173
- # sentence-transformers
174
- # transformers
175
- orjson==3.10.3
176
- # via
177
- # fastapi
178
- # gradio
179
- packaging==24.1
180
- # via
181
- # altair
182
- # datasets
183
- # evaluate
184
- # gradio
185
- # gradio-client
186
- # huggingface-hub
187
- # matplotlib
188
- # setfit
189
- # transformers
190
- pandas==2.2.2
191
- # via
192
- # altair
193
- # datasets
194
- # evaluate
195
- # gradio
196
- pillow==10.3.0
197
- # via
198
- # gradio
199
- # matplotlib
200
- # sentence-transformers
201
- pyarrow==16.1.0
202
- # via datasets
203
- pydantic==2.7.3
204
- # via
205
- # fastapi
206
- # gradio
207
- pydantic-core==2.18.4
208
- # via pydantic
209
- pydub==0.25.1
210
- # via gradio
211
- pygments==2.18.0
212
- # via rich
213
- pyparsing==3.1.2
214
- # via matplotlib
215
- python-dateutil==2.9.0.post0
216
- # via
217
- # matplotlib
218
- # pandas
219
- python-dotenv==1.0.1
220
- # via uvicorn
221
- python-multipart==0.0.9
222
- # via
223
- # fastapi
224
- # gradio
225
- pytz==2024.1
226
- # via
227
- # apscheduler
228
- # pandas
229
- pyyaml==6.0.1
230
- # via
231
- # datasets
232
- # gradio
233
- # huggingface-hub
234
- # transformers
235
- # uvicorn
236
- referencing==0.35.1
237
- # via
238
- # jsonschema
239
- # jsonschema-specifications
240
- regex==2024.5.15
241
- # via transformers
242
- requests==2.31.0
243
- # via
244
- # arxiv
245
- # datasets
246
- # evaluate
247
- # huggingface-hub
248
- # transformers
249
- rich==13.7.1
250
- # via typer
251
- rpds-py==0.18.1
252
- # via
253
- # jsonschema
254
- # referencing
255
- ruff==0.4.8
256
- # via gradio
257
- safetensors==0.4.3
258
- # via transformers
259
- scikit-learn==1.2.2
260
- # via
261
- # -r requirements.in
262
- # sentence-transformers
263
- # setfit
264
- scipy==1.13.1
265
- # via
266
- # scikit-learn
267
- # sentence-transformers
268
- semantic-version==2.10.0
269
- # via gradio
270
- sentence-transformers==3.0.1
271
- # via setfit
272
- setfit==1.0.3
273
- # via -r requirements.in
274
- sgmllib3k==1.0.0
275
- # via feedparser
276
- shellingham==1.5.4
277
- # via typer
278
- six==1.16.0
279
- # via
280
- # apscheduler
281
- # python-dateutil
282
- sniffio==1.3.1
283
- # via
284
- # anyio
285
- # httpx
286
- stamina==24.2.0
287
- # via -r requirements.in
288
- starlette==0.37.2
289
- # via fastapi
290
- sympy==1.12.1
291
- # via torch
292
- tenacity==8.3.0
293
- # via stamina
294
- threadpoolctl==3.5.0
295
- # via scikit-learn
296
- tokenizers==0.19.1
297
- # via transformers
298
- tomlkit==0.12.0
299
- # via gradio
300
- toolz==0.12.1
301
- # via altair
302
- torch==2.3.1
303
- # via sentence-transformers
304
- tqdm==4.66.4
305
- # via
306
- # datasets
307
- # evaluate
308
- # huggingface-hub
309
- # sentence-transformers
310
- # transformers
311
- transformers==4.41.2
312
- # via sentence-transformers
313
- typer==0.12.3
314
- # via
315
- # fastapi-cli
316
- # gradio
317
- typing-extensions==4.12.2
318
- # via
319
- # fastapi
320
- # gradio
321
- # gradio-client
322
- # huggingface-hub
323
- # pydantic
324
- # pydantic-core
325
- # torch
326
- # typer
327
- tzdata==2024.1
328
- # via pandas
329
- tzlocal==5.2
330
- # via apscheduler
331
- ujson==5.10.0
332
- # via fastapi
333
- urllib3==2.2.1
334
- # via
335
- # gradio
336
- # requests
337
- uvicorn==0.30.1
338
- # via
339
- # fastapi
340
- # gradio
341
- uvloop==0.19.0
342
- # via uvicorn
343
- watchfiles==0.22.0
344
- # via uvicorn
345
- websockets==11.0.3
346
- # via
347
- # gradio-client
348
- # uvicorn
349
- xxhash==3.4.1
350
- # via
351
- # datasets
352
- # evaluate
353
- yarl==1.9.4
354
- # via aiohttp
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ jinja2
4
+ markupsafe
5
+ polars
6
+ huggingface-hub[hf_transfer]
7
+ python-dotenv
8
+ cachetools
9
+ pyarrow
10
+ pylance>=0.20
11
+ sentence-transformers>=3.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
static/styles.css ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* Custom styles - most styling is via Tailwind CDN */

/* Ensure smooth scrolling (e.g. for in-page anchor jumps) */
html {
    scroll-behavior: smooth;
}

/* Better focus styles: visible keyboard-focus ring on any focusable element */
:focus-visible {
    outline: 2px solid #3b82f6;
    outline-offset: 2px;
}
templates/base.html ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!-- Shared page chrome: Tailwind + HTMX includes, header, content block, footer -->
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{% block title %}Dataset Papers on ArXiv{% endblock %}</title>

    <!-- Tailwind CSS -->
    <script src="https://cdn.tailwindcss.com"></script>

    <!-- HTMX -->
    <script src="https://unpkg.com/htmx.org@1.9.12"></script>

    <style>
        /* Loading indicator - subtle */
        .htmx-indicator { display: none; }
        .htmx-request .htmx-indicator,
        .htmx-request.htmx-indicator { display: inline; }

        /* Content fades during load */
        .htmx-request #paper-list { opacity: 0.5; transition: opacity 0.15s; }
    </style>
</head>
<body class="bg-white min-h-screen text-gray-900">
    <header class="border-b border-gray-200">
        <div class="max-w-3xl mx-auto px-4 py-6">
            <h1 class="text-xl font-semibold">Dataset Papers on ArXiv</h1>
            <p class="text-sm text-gray-500 mt-1">CS papers predicted to introduce new ML datasets by <a href="https://huggingface.co/davanstrien/ModernBERT-base-is-new-arxiv-dataset" class="inline-flex items-center gap-1 text-gray-700 hover:text-blue-600 font-medium">this model <span class="text-base">🤗</span></a></p>
        </div>
    </header>

    <!-- Page-specific content is injected here by child templates -->
    <main class="max-w-3xl mx-auto px-4 py-6">
        {% block content %}{% endblock %}
    </main>

    <footer class="border-t border-gray-100 mt-12">
        <div class="max-w-3xl mx-auto px-4 py-4 text-gray-400 text-xs">
            <a href="https://huggingface.co/datasets/davanstrien/my-classified-papers" class="hover:text-gray-600">Data source</a>
            <span class="mx-2">·</span>
            <a href="https://huggingface.co/davanstrien/ModernBERT-base-is-new-arxiv-dataset" class="hover:text-gray-600">Model</a>
        </div>
    </footer>
</body>
</html>
templates/index.html ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
{# Main page: stats header, sticky filter bar, and the HTMX-driven paper list.
   Each control re-fetches /papers and hx-includes every OTHER control so the
   full filter state is always sent as query params. #}
{% extends "base.html" %}

{% block content %}
<div>
    <!-- Powered by -->
    <div class="text-xs text-gray-400 mb-4">
        Vector search powered by <a href="https://lancedb.github.io/lance/" class="underline hover:text-gray-600">Lance</a>
        &middot; Updated weekly via <a href="https://huggingface.co/docs/hub/en/spaces-run-with-hf-jobs" class="underline hover:text-gray-600">HF Jobs</a>
        &middot; <a href="https://huggingface.co/datasets/librarian-bots/arxiv-cs-papers-lance" class="underline hover:text-gray-600">Dataset</a>
    </div>

    <!-- Stats - minimal -->
    <div class="flex items-baseline gap-2 mb-6">
        <span class="text-3xl font-semibold text-gray-900">{{ "{:,}".format(new_dataset_count) }}</span>
        <span class="text-gray-500">papers with new datasets</span>
        <span class="text-gray-400 text-sm ml-auto">from {{ "{:,}".format(total_papers) }} total</span>
    </div>

    <!-- Filters - sticky on scroll -->
    <div class="sticky top-0 z-10 bg-white flex flex-wrap items-center gap-4 py-4 border-b border-gray-200 mb-6">
        <!-- Search (debounced 500ms; also fires on Enter and on the custom
             'histogramChange' event — NOTE(review): no dispatcher for
             'histogramChange' is visible in this template; confirm it is
             emitted elsewhere) -->
        <input type="search"
               name="search"
               id="search-input"
               placeholder="Search..."
               value="{{ search }}"
               class="flex-1 min-w-48 px-3 py-2 border-b border-gray-300 bg-transparent focus:border-gray-900 focus:outline-none"
               hx-get="/papers"
               hx-trigger="input changed delay:500ms, keyup[key=='Enter'], histogramChange"
               hx-target="#paper-list"
               hx-include="#filter-form, #category-select, #confidence-filter, #since-filter, #sort-select, #search-type-toggle"
               hx-indicator="#loading-indicator"
               hx-push-url="true">

        <!-- Search mode toggle: keyword (substring) vs semantic (vector) -->
        <div id="search-type-toggle" class="flex items-center gap-2 text-xs text-gray-500">
            <label class="flex items-center gap-1 cursor-pointer">
                <input type="radio" name="search_type" value="keyword" {% if search_type == 'keyword' %}checked{% endif %}
                       class="h-3 w-3"
                       hx-get="/papers"
                       hx-trigger="change"
                       hx-target="#paper-list"
                       hx-include="#filter-form, #search-input, #category-select, #confidence-filter, #since-filter, #sort-select"
                       hx-indicator="#loading-indicator"
                       hx-push-url="true">
                <span>Keyword</span>
            </label>
            <label class="flex items-center gap-1 cursor-pointer">
                <input type="radio" name="search_type" value="semantic" {% if search_type == 'semantic' %}checked{% endif %}
                       class="h-3 w-3"
                       hx-get="/papers"
                       hx-trigger="change"
                       hx-target="#paper-list"
                       hx-include="#filter-form, #search-input, #category-select, #confidence-filter, #since-filter, #sort-select"
                       hx-indicator="#loading-indicator"
                       hx-push-url="true">
                <span>Semantic</span>
            </label>
        </div>

        <!-- Category filter -->
        <select name="category"
                id="category-select"
                class="px-3 py-2 border-b border-gray-300 bg-transparent focus:border-gray-900 focus:outline-none text-gray-700"
                hx-get="/papers"
                hx-trigger="change"
                hx-target="#paper-list"
                hx-include="#filter-form, #search-input, #confidence-filter, #since-filter, #sort-select, #search-type-toggle"
                hx-indicator="#loading-indicator"
                hx-push-url="true">
            <option value="">All categories</option>
            {% for cat in categories %}
            <option value="{{ cat }}" {% if category == cat %}selected{% endif %}>{{ cat }}</option>
            {% endfor %}
        </select>

        <!-- Confidence filter dropdown (values compared as strings) -->
        <select name="min_confidence"
                id="confidence-filter"
                class="px-2 py-1 text-xs border-b border-gray-300 bg-transparent focus:border-gray-900 focus:outline-none text-gray-500 ml-auto"
                hx-get="/papers"
                hx-trigger="change"
                hx-target="#paper-list"
                hx-include="#filter-form, #search-input, #category-select, #since-filter, #sort-select, #search-type-toggle"
                hx-indicator="#loading-indicator"
                hx-push-url="true">
            <option value="0.5" {% if min_confidence == '0.5' %}selected{% endif %}>New datasets only</option>
            <option value="0.6" {% if min_confidence == '0.6' %}selected{% endif %}>Higher confidence</option>
            <option value="0" {% if min_confidence == '0' %}selected{% endif %}>All papers</option>
        </select>

        <!-- Since filter dropdown -->
        <select name="since"
                id="since-filter"
                class="px-2 py-1 text-xs border-b border-gray-300 bg-transparent focus:border-gray-900 focus:outline-none text-gray-500"
                hx-get="/papers"
                hx-trigger="change"
                hx-target="#paper-list"
                hx-include="#filter-form, #search-input, #category-select, #confidence-filter, #sort-select, #search-type-toggle"
                hx-indicator="#loading-indicator"
                hx-push-url="true">
            <option value="" {% if not since %}selected{% endif %}>All time</option>
            <option value="1m" {% if since == '1m' %}selected{% endif %}>Past month</option>
            <option value="6m" {% if since == '6m' %}selected{% endif %}>Past 6 months</option>
            <option value="1y" {% if since == '1y' %}selected{% endif %}>Past year</option>
        </select>

        <!-- Sort dropdown -->
        <select name="sort"
                id="sort-select"
                class="px-2 py-1 text-xs border-b border-gray-300 bg-transparent focus:border-gray-900 focus:outline-none text-gray-500"
                hx-get="/papers"
                hx-trigger="change"
                hx-target="#paper-list"
                hx-include="#filter-form, #search-input, #category-select, #confidence-filter, #since-filter, #search-type-toggle"
                hx-indicator="#loading-indicator"
                hx-push-url="true">
            <option value="date" {% if sort == 'date' %}selected{% endif %}>Newest first</option>
            <option value="relevance" {% if sort == 'relevance' %}selected{% endif %}>Relevance</option>
        </select>

        <!-- Loading indicator - subtle -->
        <span id="loading-indicator" class="htmx-indicator text-sm text-gray-400">Loading...</span>

        <!-- Hidden form for hx-include -->
        <form id="filter-form" class="hidden"></form>
    </div>

    <!-- Paper list: initial load fires once on page load with the current
         server-rendered filter state baked into the query string -->
    <div id="paper-list"
         hx-get="/papers?{% if search %}search={{ search|urlencode }}&{% endif %}search_type={{ search_type }}&{% if category %}category={{ category|urlencode }}&{% endif %}min_confidence={{ min_confidence }}&{% if since %}since={{ since }}&{% endif %}sort={{ sort }}"
         hx-trigger="load"
         hx-indicator="#loading-indicator">
        <div class="py-8 text-gray-400 text-sm">Loading papers...</div>
    </div>
</div>
{% endblock %}
templates/partials/paper_card.html ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
{# Renders one paper as a card: linked title, category badge, date,
   similarity/confidence indicators, and a truncated abstract.
   Expects `paper` in context; `search` / `search_type` are optional. #}
<article class="py-5 border-b border-gray-200">
    <!-- Title with paper icon; links to the HF Papers page for this arXiv id -->
    <h3 class="text-lg font-semibold text-gray-900 leading-tight">
        <a href="https://huggingface.co/papers/{{ paper.id }}"
           target="_blank"
           class="hover:text-blue-600 inline-flex items-start gap-2 group">
            <svg class="w-5 h-5 mt-0.5 text-gray-400 group-hover:text-blue-500 flex-shrink-0" fill="none" stroke="currentColor" viewBox="0 0 24 24">
                <path stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5" d="M9 12h6m-6 4h6m2 5H7a2 2 0 01-2-2V5a2 2 0 012-2h5.586a1 1 0 01.707.293l5.414 5.414a1 1 0 01.293.707V19a2 2 0 01-2 2z"></path>
            </svg>
            <span>{% if search %}{{ paper.title|highlight(search) }}{% else %}{{ paper.title }}{% endif %}</span>
        </a>
    </h3>

    <!-- Meta info - inline with category badge.
         Only the FIRST listed arXiv category is shown and colour-coded. -->
    {% set category = paper.categories.split(' ')[0] if paper.categories else '' %}
    {% set cat_colors = {
        'cs.CV': 'bg-purple-100 text-purple-700',
        'cs.AI': 'bg-blue-100 text-blue-700',
        'cs.LG': 'bg-green-100 text-green-700',
        'cs.CL': 'bg-orange-100 text-orange-700',
        'cs.NE': 'bg-pink-100 text-pink-700'
    } %}
    {% set badge_class = cat_colors.get(category, 'bg-gray-100 text-gray-600') %}
    <div class="mt-2 flex items-center gap-3 text-sm text-gray-500">
        <span class="px-2 py-0.5 rounded-full text-xs font-medium {{ badge_class }}">{{ category }}</span>
        <span>{{ paper.update_date.strftime('%Y-%m-%d') if paper.update_date else 'Unknown' }}</span>
        {% if search_type == 'semantic' and paper.similarity_score is defined %}
        <!-- Semantic search: show both vector-similarity and classifier confidence -->
        <span class="text-blue-500 inline-flex items-center gap-1">
            {{ "%.0f"|format(paper.similarity_score * 100) }}% match
            <span class="cursor-help" title="How similar this paper is to your search query">
                <svg class="w-3.5 h-3.5 text-blue-400" fill="none" stroke="currentColor" viewBox="0 0 24 24">
                    <circle cx="12" cy="12" r="10" stroke-width="1.5"></circle>
                    <path stroke-linecap="round" stroke-width="1.5" d="M12 16v-1m0-3a2 2 0 10-2-2"></path>
                    <circle cx="12" cy="16" r="0.5" fill="currentColor"></circle>
                </svg>
            </span>
        </span>
        <span class="text-gray-400 inline-flex items-center gap-1">
            {{ "%.0f"|format(paper.confidence_score * 100) }}% conf.
            <span class="cursor-help" title="Model confidence this paper introduces a new dataset">
                <svg class="w-3.5 h-3.5 text-gray-300" fill="none" stroke="currentColor" viewBox="0 0 24 24">
                    <circle cx="12" cy="12" r="10" stroke-width="1.5"></circle>
                    <path stroke-linecap="round" stroke-width="1.5" d="M12 16v-1m0-3a2 2 0 10-2-2"></path>
                    <circle cx="12" cy="16" r="0.5" fill="currentColor"></circle>
                </svg>
            </span>
        </span>
        {% else %}
        <!-- Keyword/default view: confidence only; low confidence (<80%) is dimmed -->
        <span class="{% if paper.confidence_score < 0.8 %}text-gray-400{% else %}text-gray-500{% endif %} inline-flex items-center gap-1">
            {{ "%.0f"|format(paper.confidence_score * 100) }}% conf.
            <span class="cursor-help" title="Model confidence this paper introduces a new dataset">
                <svg class="w-3.5 h-3.5 {% if paper.confidence_score < 0.8 %}text-gray-300{% else %}text-gray-400{% endif %}" fill="none" stroke="currentColor" viewBox="0 0 24 24">
                    <circle cx="12" cy="12" r="10" stroke-width="1.5"></circle>
                    <path stroke-linecap="round" stroke-width="1.5" d="M12 16v-1m0-3a2 2 0 10-2-2"></path>
                    <circle cx="12" cy="16" r="0.5" fill="currentColor"></circle>
                </svg>
            </span>
        </span>
        {% endif %}
    </div>

    <!-- Abstract (truncated to 400 chars) -->
    <p class="mt-2 text-gray-600 text-sm leading-relaxed">
        {% if search %}
        {{ paper.abstract[:400]|highlight(search) }}{% if paper.abstract|length > 400 %}...{% endif %}
        {% else %}
        {{ paper.abstract[:400] }}{% if paper.abstract|length > 400 %}...{% endif %}
        {% endif %}
    </p>
</article>
templates/partials/paper_list.html ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Paper count - subtle -->
2
+ <div class="text-xs text-gray-400 mb-4">
3
+ {{ "{:,}".format(total_filtered) }} results{% if search %} for "{{ search }}"{% if search_type == 'semantic' %} <span class="text-blue-400">(semantic)</span>{% endif %}{% endif %}{% if category %} in {{ category }}{% endif %}
4
+ </div>
5
+
6
+ <!-- Paper cards -->
7
+ <div>
8
+ {% for paper in papers %}
9
+ {% include "partials/paper_card.html" %}
10
+ {% endfor %}
11
+ </div>
12
+
13
+ {% if papers|length == 0 %}
14
+ <div class="py-12 text-gray-400 text-sm">
15
+ No papers found. Try adjusting your filters.
16
+ </div>
17
+ {% endif %}
18
+
19
+ <!-- Infinite scroll trigger -->
20
+ {% if has_more %}
21
+ <div hx-get="/papers?page={{ page + 1 }}&category={{ category }}&search={{ search }}&min_confidence={{ min_confidence }}&search_type={{ search_type }}&sort={{ sort }}&since={{ since }}"
22
+ hx-trigger="revealed"
23
+ hx-swap="outerHTML"
24
+ class="py-6 text-center text-xs text-gray-400">
25
+ <noscript>
26
+ <a href="/papers?page={{ page + 1 }}" class="hover:text-gray-600">Load more</a>
27
+ </noscript>
28
+ </div>
29
+ {% else %}
30
+ <div class="py-6 text-center text-xs text-gray-300">
31
+ End of results
32
+ </div>
33
+ {% endif %}