hysts HF Staff committed on
Commit 78d77c9 · 1 Parent(s): ae479ca
Files changed (9)
  1. .pre-commit-config.yaml +3 -3
  2. README.md +1 -1
  3. app.py +104 -76
  4. papers.py +0 -153
  5. pyproject.toml +13 -7
  6. requirements.txt +125 -352
  7. search.py +34 -0
  8. table.py +38 -0
  9. uv.lock +0 -0
.pre-commit-config.yaml CHANGED
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v5.0.0
+    rev: v6.0.0
     hooks:
       - id: check-executables-have-shebangs
       - id: check-json
@@ -14,13 +14,13 @@ repos:
       - id: requirements-txt-fixer
       - id: trailing-whitespace
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.8.6
+    rev: v0.14.10
     hooks:
       - id: ruff
        args: ["--fix"]
      - id: ruff-format
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.14.1
+    rev: v1.19.1
     hooks:
       - id: mypy
        args: ["--ignore-missing-imports"]
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 📊
 colorFrom: pink
 colorTo: pink
 sdk: gradio
-sdk_version: 5.39.0
+sdk_version: 6.1.0
 app_file: app.py
 pinned: true
 license: mit
app.py CHANGED
@@ -3,115 +3,143 @@
 import datetime
 
 import gradio as gr
-import pandas as pd
-from apscheduler.schedulers.background import BackgroundScheduler
+import polars as pl
 
-from papers import PaperList, get_df
+from search import search
+from table import df_orig
 
 DESCRIPTION = "# [Daily Papers](https://huggingface.co/papers)"
 
-FOOT_NOTE = """\
-Related useful Spaces:
-- [Semantic Scholar Paper Recommender](https://huggingface.co/spaces/librarian-bots/recommend_similar_papers) by [davanstrien](https://huggingface.co/davanstrien)
-- [ArXiv CS RAG](https://huggingface.co/spaces/bishmoy/Arxiv-CS-RAG) by [bishmoy](https://huggingface.co/bishmoy)
-- [Paper Q&A](https://huggingface.co/spaces/chansung/paper_qa) by [chansung](https://huggingface.co/chansung)
-- [dailypapershackernews](https://huggingface.co/spaces/akhaliq/dailypapershackernews) by [akhaliq](https://huggingface.co/akhaliq)
-"""
-
-
-paper_list = PaperList(get_df())
-
-
-def update_paper_list() -> None:
-    global paper_list  # noqa: PLW0603
-    paper_list = PaperList(get_df())
-
-
-scheduler = BackgroundScheduler()
-scheduler.add_job(func=update_paper_list, trigger="cron", hour="*", timezone="UTC", misfire_grace_time=60)
-scheduler.start()
-
-
-def update_df() -> gr.Dataframe:
-    return gr.Dataframe(value=paper_list.df_prettified)
-
-
-def update_num_papers(df: pd.DataFrame) -> str:
-    return f"{len(df)} / {len(paper_list.df_raw)}"
-
-
-def search(
+df_main = df_orig.select(
+    "date_md",
+    "paper_page_md",
+    "title",
+    "github_md",
+    "upvotes",
+    "num_comments",
+    "arxiv_id",
+    "date",
+)
+df_main = df_main.rename(
+    {
+        "date_md": "Date",
+        "title": "Title",
+        "paper_page_md": "Paper page",
+        "github_md": "GitHub",
+        "upvotes": "👍",
+        "num_comments": "💬",
+    }
+)
+
+COLUMN_INFO = {
+    "Date": ("markdown", "10%"),
+    "Paper page": ("markdown", "10%"),
+    "Title": ("str", "55%"),
+    "GitHub": ("markdown", "5%"),
+    "👍": ("number", "5%"),
+    "💬": ("number", "5%"),
+}
+
+
+def update_num_papers(df: pl.DataFrame) -> str:
+    return f"{len(df)} / {len(df_main)}"
+
+
+def update_df(
+    search_query: str,
+    candidate_pool_size: int,
+    num_results: int,
     start_date: datetime.datetime,
     end_date: datetime.datetime,
-    search_title: str,
-    search_abstract: str,
-    max_num_to_retrieve: int,
-) -> pd.DataFrame:
-    return paper_list.search(start_date, end_date, search_title, search_abstract, max_num_to_retrieve)
+) -> dict:
+    if num_results > candidate_pool_size:
+        raise gr.Error("Number of results must be less than or equal to candidate pool size", print_exception=False)
+
+    df = df_main.clone()
+    if start_date:
+        df = df.filter(pl.col("date") >= start_date)
+    if end_date:
+        df = df.filter(pl.col("date") <= end_date)
+
+    if search_query:
+        results = search(search_query, candidate_pool_size, num_results)
+        if not results:
+            df = df.head(0)
+        else:
+            df = pl.DataFrame(results).join(df, on="arxiv_id", how="inner")
+            df = df.sort("ce_score", descending=True).drop("ce_score")
+
+    columns = list(COLUMN_INFO.keys())
+    df = df.select(columns)
+    return gr.Dataframe(
+        value=df,
+        datatype=[COLUMN_INFO[col][0] for col in columns],
+        column_widths=[COLUMN_INFO[col][1] for col in columns],
+    )
 
 
-with gr.Blocks(css_paths="style.css") as demo:
+with gr.Blocks() as demo:
    gr.Markdown(DESCRIPTION)
-    with gr.Group():
-        search_title = gr.Textbox(label="Search title")
-        with gr.Row():
-            with gr.Column(scale=4):
-                search_abstract = gr.Textbox(
-                    label="Search abstract",
-                    info="The result may not be accurate as the abstract does not contain all the information.",
-                )
-            with gr.Column(scale=1):
-                max_num_to_retrieve = gr.Slider(
-                    label="Max number to retrieve",
-                    info="This is used only for search on abstracts.",
-                    minimum=1,
-                    maximum=len(paper_list.df_raw),
-                    step=1,
-                    value=100,
-                )
+    search_query = gr.Textbox(label="Search", submit_btn=True, show_label=False, placeholder="Search...")
+    with gr.Accordion(label="Search Options", open=True) as advanced_search_options:
         with gr.Row():
-            start_date = gr.DateTime(label="Start date", type="datetime", value="2023-05-05", include_time=False)
-            end_date = gr.DateTime(label="End date", type="datetime", include_time=False)
+            candidate_pool_size = gr.Slider(label="Candidate Pool Size", minimum=1, maximum=600, step=1, value=200)
+            num_results = gr.Slider(label="Number of Results", minimum=1, maximum=400, step=1, value=100)
+        with gr.Row():
+            start_date = gr.DateTime(
+                label="Start Date", value=df_orig.select(pl.col("date").min()).item(), type="datetime", include_time=False
+            )
+            end_date = gr.DateTime(
+                label="End Date", value=df_orig.select(pl.col("date").max()).item(), type="datetime", include_time=False
+            )
 
-    num_papers = gr.Textbox(label="Number of papers", value=update_num_papers(paper_list.df_raw), interactive=False)
+    num_papers = gr.Textbox(label="Number of papers", value=update_num_papers(df_orig), interactive=False)
+
     df = gr.Dataframe(
-        value=paper_list.df_prettified,
-        datatype=paper_list.column_datatype,
-        type="pandas",
+        value=df_main,
+        datatype=list(COLUMN_INFO.values()),
+        type="polars",
+        row_count=(0, "dynamic"),
+        show_row_numbers=True,
         interactive=False,
         max_height=1000,
         elem_id="table",
-        column_widths=["10%", "10%", "60%", "10%", "5%", "5%"],
-        wrap=True,
+        column_widths=[COLUMN_INFO[col][1] for col in COLUMN_INFO],
     )
 
-    gr.Markdown(FOOT_NOTE)
-
+    inputs = [
+        search_query,
+        candidate_pool_size,
+        num_results,
+        start_date,
+        end_date,
+    ]
     gr.on(
-        triggers=[start_date.change, end_date.change, search_title.submit, search_abstract.submit],
-        fn=search,
-        inputs=[start_date, end_date, search_title, search_abstract, max_num_to_retrieve],
+        triggers=[search_query.submit, start_date.change, end_date.change],
+        fn=update_df,
+        inputs=inputs,
         outputs=df,
-        api_name=False,
+        api_visibility="private",
    ).then(
         fn=update_num_papers,
         inputs=df,
         outputs=num_papers,
         queue=False,
-        api_name=False,
+        api_visibility="private",
     )
     demo.load(
         fn=update_df,
+        inputs=inputs,
         outputs=df,
-        queue=False,
-        api_name=False,
+        api_visibility="private",
     ).then(
         fn=update_num_papers,
         inputs=df,
         outputs=num_papers,
         queue=False,
-        api_name=False,
+        api_visibility="private",
     )
 
+
 if __name__ == "__main__":
-    demo.queue(api_open=False).launch(show_api=False)
+    demo.launch(css_paths="style.css", footer_links=["gradio", "settings"])
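Editor's note: the new `update_df` applies the date filters first, then narrows the table to the reranked search hits. A minimal sketch of that join-and-sort step on toy data (the IDs and scores below are illustrative, not from the dataset):

```python
import polars as pl

df = pl.DataFrame({"arxiv_id": ["a1", "a2", "a3"], "Title": ["P1", "P2", "P3"]})
results = [{"arxiv_id": "a3", "ce_score": 0.91}, {"arxiv_id": "a1", "ce_score": 0.42}]

# Inner join keeps only papers returned by search(); the cross-encoder
# score drives the display order and is dropped before rendering.
out = (
    pl.DataFrame(results)
    .join(df, on="arxiv_id", how="inner")
    .sort("ce_score", descending=True)
    .drop("ce_score")
)
print(out["arxiv_id"].to_list())  # ["a3", "a1"]
```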
papers.py DELETED
@@ -1,153 +0,0 @@
-import datetime
-import operator
-
-import datasets
-import pandas as pd
-import tqdm.auto
-from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import HfApi
-from ragatouille import RAGPretrainedModel
-
-api = HfApi()
-
-INDEX_REPO_ID = "hysts-bot-data/daily-papers-abstract-index"
-INDEX_DIR_PATH = ".ragatouille/colbert/indexes/daily-papers-abstract-index/"
-api.snapshot_download(
-    repo_id=INDEX_REPO_ID,
-    repo_type="dataset",
-    local_dir=INDEX_DIR_PATH,
-)
-abstract_retriever = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
-# Run once to initialize the retriever
-abstract_retriever.search("LLM")
-
-
-def update_abstract_index() -> None:
-    global abstract_retriever  # noqa: PLW0603
-
-    api.snapshot_download(
-        repo_id=INDEX_REPO_ID,
-        repo_type="dataset",
-        local_dir=INDEX_DIR_PATH,
-    )
-    abstract_retriever = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
-    abstract_retriever.search("LLM")
-
-
-scheduler = BackgroundScheduler()
-scheduler.add_job(func=update_abstract_index, trigger="cron", hour="*", timezone="UTC", misfire_grace_time=3 * 60)
-scheduler.start()
-
-
-def get_df() -> pd.DataFrame:
-    df = (
-        datasets.load_dataset("hysts-bot-data/daily-papers", split="train")
-        .to_pandas()
-        .merge(
-            datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas(),
-            on="arxiv_id",
-        )
-    )
-    df = df[::-1].reset_index(drop=True)
-    df["date"] = df["date"].dt.strftime("%Y-%m-%d")
-    df = df.drop(columns=["authors", "abstract"])
-
-    paper_info = []
-    for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
-        info = row.copy()
-        info["paper_page"] = f"https://huggingface.co/papers/{row.arxiv_id}"
-        paper_info.append(info)
-    return pd.DataFrame(paper_info)
-
-
-class Prettifier:
-    @staticmethod
-    def get_github_link(link: str) -> str:
-        if not link:
-            return ""
-        return Prettifier.create_link("github", link)
-
-    @staticmethod
-    def create_link(text: str, url: str) -> str:
-        return f'<a href="{url}" target="_blank">{text}</a>'
-
-    @staticmethod
-    def to_div(text: str | None, category_name: str) -> str:
-        if text is None:
-            text = ""
-        class_name = f"{category_name}-{text.lower()}"
-        return f'<div class="{class_name}">{text}</div>'
-
-    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
-        new_rows = []
-        for _, row in df.iterrows():
-            new_row = {
-                "date": Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}"),
-                "paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
-                "title": row["title"],
-                "github": self.get_github_link(row.github),
-                "👍": row["upvotes"],
-                "💬": row["num_comments"],
-            }
-            new_rows.append(new_row)
-        return pd.DataFrame(new_rows)
-
-
-class PaperList:
-    COLUMN_INFO = (
-        ("date", "markdown"),
-        ("paper_page", "markdown"),
-        ("title", "str"),
-        ("github", "markdown"),
-        ("👍", "number"),
-        ("💬", "number"),
-    )
-
-    def __init__(self, df: pd.DataFrame) -> None:
-        self.df_raw = df
-        self._prettifier = Prettifier()
-        self.df_prettified = self._prettifier(df).loc[:, self.column_names]
-
-    @property
-    def column_names(self) -> list[str]:
-        return list(map(operator.itemgetter(0), self.COLUMN_INFO))
-
-    @property
-    def column_datatype(self) -> list[str]:
-        return list(map(operator.itemgetter(1), self.COLUMN_INFO))
-
-    def search(
-        self,
-        start_date: datetime.datetime,
-        end_date: datetime.datetime,
-        title_search_query: str,
-        abstract_search_query: str,
-        max_num_to_retrieve: int,
-    ) -> pd.DataFrame:
-        df = self.df_raw.copy()
-        df["date"] = pd.to_datetime(df["date"])
-
-        # Filter by date
-        df = df[(df["date"] >= start_date) & (df["date"] <= end_date)]
-        df["date"] = df["date"].dt.strftime("%Y-%m-%d")
-
-        # Filter by title
-        df = df[df["title"].str.contains(title_search_query, case=False)]
-
-        # Filter by abstract
-        if abstract_search_query:
-            results = abstract_retriever.search(abstract_search_query, k=max_num_to_retrieve)
-            remaining_ids = set(df["arxiv_id"])
-            found_id_set = set()
-            found_ids = []
-            for x in results:
-                arxiv_id = x["document_id"]
-                if arxiv_id not in remaining_ids:
-                    continue
-                if arxiv_id in found_id_set:
-                    continue
-                found_id_set.add(arxiv_id)
-                found_ids.append(arxiv_id)
-            df = df[df["arxiv_id"].isin(found_ids)].set_index("arxiv_id").reindex(index=found_ids).reset_index()
-
-        return self._prettifier(df).loc[:, self.column_names]
pyproject.toml CHANGED
@@ -5,12 +5,13 @@ description = ""
 readme = "README.md"
 requires-python = ">=3.10"
 dependencies = [
-    "apscheduler>=3.11.0",
-    "datasets>=4.0.0",
-    "gradio>=5.39.0",
-    "hf-transfer>=0.1.9",
-    "ragatouille>=0.0.8.post4",
-    "setuptools>=75.6.0",
+    "datasets>=4.4.1",
+    "faiss-cpu>=1.13.1",
+    "gradio>=6.1.0",
+    "polars>=1.36.1",
+    "sentence-transformers>=5.2.0",
+    "spaces>=0.44.0",
+    "torch==2.8.0",
 ]
 
 [tool.ruff]
@@ -35,7 +36,6 @@ ignore = [
     "EM101", # raw-string-in-exception
     "FBT001", # boolean-type-hint-positional-argument
     "FBT002", # boolean-default-value-positional-argument
-    "PD901", # pandas-df-variable-name
     "PGH003", # blanket-type-ignore
     "PLR0913", # too-many-arguments
     "PLR0915", # too-many-statements
@@ -53,3 +53,9 @@ convention = "google"
 
 [tool.ruff.format]
 docstring-code-format = true
+
+[dependency-groups]
+dev = [
+    "pre-commit>=4.5.1",
+    "ruff>=0.14.10",
+]
requirements.txt CHANGED
@@ -1,141 +1,93 @@
 # This file was autogenerated by uv via the following command:
 #    uv pip compile pyproject.toml -o requirements.txt
-aiofiles==23.2.1
+aiofiles==24.1.0
     # via gradio
-aiohappyeyeballs==2.4.4
+aiohappyeyeballs==2.6.1
     # via aiohttp
-aiohttp==3.11.11
-    # via
-    #   fsspec
-    #   langchain
-    #   llama-index-core
-aiosignal==1.3.2
+aiohttp==3.13.2
+    # via fsspec
+aiosignal==1.4.0
     # via aiohttp
+annotated-doc==0.0.4
+    # via fastapi
 annotated-types==0.7.0
     # via pydantic
-anyio==4.8.0
+anyio==4.12.0
     # via
     #   gradio
     #   httpx
-    #   openai
     #   starlette
-apscheduler==3.11.0
-    # via daily-papers (pyproject.toml)
-async-timeout==4.0.3
-    # via
-    #   aiohttp
-    #   langchain
-attrs==24.3.0
+async-timeout==5.0.1
     # via aiohttp
-beautifulsoup4==4.12.3
-    # via llama-index-readers-file
-bitarray==3.0.0
-    # via colbert-ai
-blinker==1.9.0
-    # via flask
-brotli==1.1.0
+attrs==25.4.0
+    # via aiohttp
+brotli==1.2.0
     # via gradio
-catalogue==2.0.10
-    # via srsly
-certifi==2024.12.14
+certifi==2025.11.12
     # via
     #   httpcore
     #   httpx
-    #   llama-cloud
     #   requests
-charset-normalizer==3.4.1
+charset-normalizer==3.4.4
     # via requests
-click==8.1.8
+click==8.3.1
     # via
-    #   flask
-    #   llama-parse
-    #   nltk
     #   typer
     #   uvicorn
-colbert-ai==0.2.19
-    # via ragatouille
-dataclasses-json==0.6.7
-    # via llama-index-core
-datasets==4.0.0
-    # via
-    #   daily-papers (pyproject.toml)
-    #   colbert-ai
-deprecated==1.2.15
-    # via llama-index-core
-dill==0.3.8
+datasets==4.4.1
+    # via daily-papers (pyproject.toml)
+dill==0.4.0
     # via
     #   datasets
     #   multiprocess
-dirtyjson==1.0.8
-    # via llama-index-core
-distro==1.9.0
-    # via openai
-exceptiongroup==1.2.2
+exceptiongroup==1.3.1
     # via anyio
-faiss-cpu==1.9.0.post1
-    # via ragatouille
-fast-pytorch-kmeans==0.2.0.1
-    # via ragatouille
-fastapi==0.115.6
+faiss-cpu==1.13.1
+    # via daily-papers (pyproject.toml)
+fastapi==0.125.0
     # via gradio
-ffmpy==0.5.0
+ffmpy==1.0.0
     # via gradio
-filelock==3.16.1
+filelock==3.20.1
     # via
     #   datasets
     #   huggingface-hub
     #   torch
     #   transformers
-    #   triton
-filetype==1.2.0
-    # via llama-index-core
-flask==3.1.0
-    # via colbert-ai
-frozenlist==1.5.0
+frozenlist==1.8.0
     # via
     #   aiohttp
     #   aiosignal
-fsspec==2024.9.0
+fsspec==2025.10.0
     # via
     #   datasets
     #   gradio-client
     #   huggingface-hub
-    #   llama-index-core
     #   torch
-git-python==1.0.3
-    # via colbert-ai
-gitdb==4.0.12
-    # via gitpython
-gitpython==3.1.44
-    # via git-python
-gradio==5.39.0
-    # via daily-papers (pyproject.toml)
-gradio-client==1.11.0
+gradio==6.1.0
+    # via
+    #   daily-papers (pyproject.toml)
+    #   spaces
+gradio-client==2.0.1
     # via gradio
-greenlet==3.1.1
-    # via sqlalchemy
 groovy==0.1.2
     # via gradio
-h11==0.14.0
+h11==0.16.0
     # via
     #   httpcore
     #   uvicorn
-hf-transfer==0.1.9
-    # via daily-papers (pyproject.toml)
-hf-xet==1.1.5
+hf-xet==1.2.0
     # via huggingface-hub
-httpcore==1.0.7
+httpcore==1.0.9
     # via httpx
 httpx==0.28.1
     # via
+    #   datasets
     #   gradio
     #   gradio-client
-    #   langsmith
-    #   llama-cloud
-    #   llama-index-core
-    #   openai
     #   safehttpx
-huggingface-hub==0.34.3
+    #   spaces
+huggingface-hub==0.36.0
     # via
     #   datasets
     #   gradio
@@ -143,401 +95,222 @@ huggingface-hub==0.34.3
     #   sentence-transformers
     #   tokenizers
     #   transformers
-idna==3.10
+idna==3.11
     # via
     #   anyio
     #   httpx
     #   requests
     #   yarl
-itsdangerous==2.2.0
-    # via flask
-jinja2==3.1.5
+jinja2==3.1.6
     # via
-    #   flask
     #   gradio
     #   torch
-jiter==0.8.2
-    # via openai
-joblib==1.4.2
-    # via
-    #   nltk
-    #   scikit-learn
-jsonpatch==1.33
-    # via langchain-core
-jsonpointer==3.0.0
-    # via jsonpatch
-langchain==0.3.14
-    # via ragatouille
-langchain-core==0.3.29
-    # via
-    #   langchain
-    #   langchain-text-splitters
-    #   ragatouille
-langchain-text-splitters==0.3.5
-    # via langchain
-langsmith==0.2.10
-    # via
-    #   langchain
-    #   langchain-core
-llama-cloud==0.1.8
-    # via llama-index-indices-managed-llama-cloud
-llama-index==0.12.10
-    # via ragatouille
-llama-index-agent-openai==0.4.1
-    # via
-    #   llama-index
-    #   llama-index-program-openai
-llama-index-cli==0.4.0
-    # via llama-index
-llama-index-core==0.12.10.post1
-    # via
-    #   llama-index
-    #   llama-index-agent-openai
-    #   llama-index-cli
-    #   llama-index-embeddings-openai
-    #   llama-index-indices-managed-llama-cloud
-    #   llama-index-llms-openai
-    #   llama-index-multi-modal-llms-openai
-    #   llama-index-program-openai
-    #   llama-index-question-gen-openai
-    #   llama-index-readers-file
-    #   llama-index-readers-llama-parse
-    #   llama-parse
-llama-index-embeddings-openai==0.3.1
-    # via
-    #   llama-index
-    #   llama-index-cli
-llama-index-indices-managed-llama-cloud==0.6.3
-    # via llama-index
-llama-index-llms-openai==0.3.13
-    # via
-    #   llama-index
-    #   llama-index-agent-openai
-    #   llama-index-cli
-    #   llama-index-multi-modal-llms-openai
-    #   llama-index-program-openai
-    #   llama-index-question-gen-openai
-llama-index-multi-modal-llms-openai==0.4.2
-    # via llama-index
-llama-index-program-openai==0.3.1
-    # via
-    #   llama-index
-    #   llama-index-question-gen-openai
-llama-index-question-gen-openai==0.3.0
-    # via llama-index
-llama-index-readers-file==0.4.3
-    # via llama-index
-llama-index-readers-llama-parse==0.4.0
-    # via llama-index
-llama-parse==0.5.19
-    # via llama-index-readers-llama-parse
-markdown-it-py==3.0.0
+joblib==1.5.3
+    # via scikit-learn
+markdown-it-py==4.0.0
     # via rich
-markupsafe==2.1.5
+markupsafe==3.0.3
     # via
     #   gradio
     #   jinja2
-    #   werkzeug
-marshmallow==3.25.0
-    # via dataclasses-json
 mdurl==0.1.2
     # via markdown-it-py
 mpmath==1.3.0
     # via sympy
-multidict==6.1.0
+multidict==6.7.0
     # via
     #   aiohttp
     #   yarl
-multiprocess==0.70.16
+multiprocess==0.70.18
     # via datasets
-mypy-extensions==1.0.0
-    # via typing-inspect
-nest-asyncio==1.6.0
-    # via llama-index-core
 networkx==3.4.2
-    # via
-    #   llama-index-core
-    #   torch
-ninja==1.11.1.3
-    # via colbert-ai
-nltk==3.9.1
-    # via
-    #   llama-index
-    #   llama-index-core
-numpy==1.26.4
+    # via torch
+numpy==2.2.6
     # via
     #   datasets
     #   faiss-cpu
-    #   fast-pytorch-kmeans
     #   gradio
-    #   langchain
-    #   llama-index-core
-    #   onnx
     #   pandas
     #   scikit-learn
     #   scipy
-    #   sentence-transformers
     #   transformers
-    #   voyager
-nvidia-cublas-cu12==12.4.5.8
+nvidia-cublas-cu12==12.8.4.1
     # via
     #   nvidia-cudnn-cu12
     #   nvidia-cusolver-cu12
     #   torch
-nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-cupti-cu12==12.8.90
     # via torch
-nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.8.93
     # via torch
-nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.8.90
     # via torch
-nvidia-cudnn-cu12==9.1.0.70
+nvidia-cudnn-cu12==9.10.2.21
     # via torch
-nvidia-cufft-cu12==11.2.1.3
+nvidia-cufft-cu12==11.3.3.83
     # via torch
-nvidia-curand-cu12==10.3.5.147
+nvidia-cufile-cu12==1.13.1.3
+    # via torch
+nvidia-curand-cu12==10.3.9.90
     # via torch
-nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusolver-cu12==11.7.3.90
     # via torch
-nvidia-cusparse-cu12==12.3.1.170
+nvidia-cusparse-cu12==12.5.8.93
     # via
     #   nvidia-cusolver-cu12
     #   torch
-nvidia-ml-py==12.560.30
-    # via pynvml
-nvidia-nccl-cu12==2.21.5
+nvidia-cusparselt-cu12==0.7.1
     # via torch
-nvidia-nvjitlink-cu12==12.4.127
+nvidia-nccl-cu12==2.27.3
+    # via torch
+nvidia-nvjitlink-cu12==12.8.93
     # via
+    #   nvidia-cufft-cu12
     #   nvidia-cusolver-cu12
     #   nvidia-cusparse-cu12
     #   torch
-nvidia-nvtx-cu12==12.4.127
+nvidia-nvtx-cu12==12.8.90
     # via torch
-onnx==1.17.0
-    # via ragatouille
-openai==1.59.6
-    # via
-    #   llama-index-agent-openai
-    #   llama-index-embeddings-openai
-    #   llama-index-llms-openai
-orjson==3.10.14
-    # via
-    #   gradio
-    #   langsmith
-packaging==24.2
+orjson==3.11.5
+    # via gradio
+packaging==25.0
     # via
     #   datasets
     #   faiss-cpu
     #   gradio
     #   gradio-client
     #   huggingface-hub
-    #   langchain-core
-    #   marshmallow
+    #   spaces
     #   transformers
-pandas==2.2.3
+pandas==2.3.3
     # via
     #   datasets
     #   gradio
-    #   llama-index-readers-file
-pillow==11.1.0
-    # via
-    #   gradio
-    #   llama-index-core
-    #   sentence-transformers
-propcache==0.2.1
+pillow==12.0.0
+    # via gradio
+polars==1.36.1
+    # via daily-papers (pyproject.toml)
+polars-runtime-32==1.36.1
+    # via polars
+propcache==0.4.1
     # via
     #   aiohttp
     #   yarl
-protobuf==5.29.3
-    # via onnx
-pyarrow==18.1.0
+psutil==5.9.8
+    # via spaces
+pyarrow==22.0.0
     # via datasets
-pydantic==2.10.5
+pydantic==2.12.4
     # via
     #   fastapi
     #   gradio
-    #   langchain
-    #   langchain-core
-    #   langsmith
-    #   llama-cloud
-    #   llama-index-core
-    #   llama-parse
-    #   openai
-pydantic-core==2.27.2
+    #   spaces
+pydantic-core==2.41.5
     # via pydantic
 pydub==0.25.1
     # via gradio
-pygments==2.19.1
+pygments==2.19.2
     # via rich
-pynvml==12.0.0
-    # via fast-pytorch-kmeans
-pypdf==5.1.0
-    # via llama-index-readers-file
 python-dateutil==2.9.0.post0
     # via pandas
-python-dotenv==1.0.1
-    # via colbert-ai
-python-multipart==0.0.20
+python-multipart==0.0.21
     # via gradio
-pytz==2024.2
+pytz==2025.2
     # via pandas
-pyyaml==6.0.2
+pyyaml==6.0.3
     # via
     #   datasets
     #   gradio
     #   huggingface-hub
-    #   langchain
-    #   langchain-core
-    #   llama-index-core
     #   transformers
-ragatouille==0.0.8.post4
-    # via daily-papers (pyproject.toml)
-regex==2024.11.6
-    # via
-    #   nltk
-    #   tiktoken
-    #   transformers
-requests==2.32.3
+regex==2025.11.3
+    # via transformers
+requests==2.32.5
     # via
     #   datasets
     #   huggingface-hub
-    #   langchain
-    #   langsmith
-    #   llama-index-core
-    #   requests-toolbelt
-    #   tiktoken
+    #   spaces
     #   transformers
-requests-toolbelt==1.0.0
-    # via langsmith
-rich==13.9.4
+rich==14.2.0
     # via typer
-ruff==0.12.2
-    # via gradio
-safehttpx==0.1.6
+safehttpx==0.1.7
     # via gradio
-safetensors==0.5.2
+safetensors==0.7.0
     # via transformers
-scikit-learn==1.6.1
+scikit-learn==1.7.2
     # via sentence-transformers
-scipy==1.15.1
+scipy==1.15.3
     # via
-    #   colbert-ai
     #   scikit-learn
     #   sentence-transformers
 semantic-version==2.10.0
     # via gradio
-sentence-transformers==2.7.0
-    # via ragatouille
-setuptools==75.8.0
+sentence-transformers==5.2.0
     # via daily-papers (pyproject.toml)
+setuptools==80.9.0
+    # via triton
 shellingham==1.5.4
     # via typer
 six==1.17.0
     # via python-dateutil
-smmap==5.0.2
-    # via gitdb
-sniffio==1.3.1
-    # via
-    #   anyio
-    #   openai
-soupsieve==2.6
-    # via beautifulsoup4
-sqlalchemy==2.0.37
-    # via
-    #   langchain
-    #   llama-index-core
-srsly==2.4.8
-    # via ragatouille
-starlette==0.41.3
+spaces==0.44.0
+    # via daily-papers (pyproject.toml)
+starlette==0.50.0
     # via
     #   fastapi
     #   gradio
-striprtf==0.0.26
-    # via llama-index-readers-file
-sympy==1.13.1
+sympy==1.14.0
     # via torch
-tenacity==9.0.0
-    # via
-    #   langchain
-    #   langchain-core
-    #   llama-index-core
-threadpoolctl==3.5.0
+threadpoolctl==3.6.0
     # via scikit-learn
-tiktoken==0.8.0
-    # via llama-index-core
-tokenizers==0.21.0
+tokenizers==0.22.1
     # via transformers
-tomlkit==0.13.2
+tomlkit==0.13.3
     # via gradio
-torch==2.5.1
+torch==2.8.0
     # via
-    #   fast-pytorch-kmeans
-    #   ragatouille
+    #   daily-papers (pyproject.toml)
     #   sentence-transformers
 tqdm==4.67.1
     # via
-    #   colbert-ai
     #   datasets
     #   huggingface-hub
-    #   llama-index-core
-    #   nltk
-    #   openai
     #   sentence-transformers
     #   transformers
-transformers==4.48.0
-    # via
-    #   colbert-ai
-    #   ragatouille
-    #   sentence-transformers
-triton==3.1.0
+transformers==4.57.3
+    # via sentence-transformers
+triton==3.4.0
     # via torch
-typer==0.15.1
+typer==0.20.0
     # via gradio
-typing-extensions==4.12.2
+typing-extensions==4.15.0
     # via
+    #   aiosignal
     #   anyio
+    #   exceptiongroup
     #   fastapi
     #   gradio
     #   gradio-client
     #   huggingface-hub
-    #   langchain-core
-    #   llama-index-core
     #   multidict
-    #   openai
     #   pydantic
     #   pydantic-core
-    #   pypdf
-    #   rich
-    #   sqlalchemy
+    #   sentence-transformers
+    #   spaces
+    #   starlette
     #   torch
     #   typer
-    #   typing-inspect
+    #   typing-inspection
     #   uvicorn
-typing-inspect==0.9.0
-    # via
-    #   dataclasses-json
-    #   llama-index-core
-tzdata==2024.2
+typing-inspection==0.4.2
+    # via pydantic
+tzdata==2025.3
     # via pandas
-tzlocal==5.2
-    # via apscheduler
-ujson==5.10.0
-    # via colbert-ai
-urllib3==2.3.0
+urllib3==2.6.2
     # via requests
-uvicorn==0.34.0
+uvicorn==0.38.0
     # via gradio
-voyager==2.1.0
-    # via ragatouille
-websockets==14.1
-    # via gradio-client
-werkzeug==3.1.3
-    # via flask
-wrapt==1.17.1
-    # via
-    #   deprecated
-    #   llama-index-core
-xxhash==3.5.0
+xxhash==3.6.0
     # via datasets
-yarl==1.18.3
+yarl==1.22.0
     # via aiohttp
search.py ADDED
@@ -0,0 +1,34 @@
+import datasets
+import numpy as np
+import polars as pl
+import spaces
+from datasets import Sequence, Value
+from sentence_transformers import CrossEncoder, SentenceTransformer
+
+from table import df_orig
+
+ds = datasets.Dataset.from_polars(
+    df_orig.select(["arxiv_id", "title", "abstract", "embedding"]).filter(pl.col("embedding").is_not_null())
+).cast_column("embedding", Sequence(Value("float64")))
+ds.add_faiss_index(column="embedding")
+
+bi_model = SentenceTransformer("BAAI/bge-base-en-v1.5")
+ce_model = CrossEncoder("BAAI/bge-reranker-base")
+
+
+@spaces.GPU(duration=10)
+def search(query: str, candidate_pool_size: int = 100, retrieval_k: int = 50) -> list[dict]:
+    prefix = "Represent this sentence for searching relevant passages: "
+    q_vec = bi_model.encode(prefix + query, normalize_embeddings=True)
+
+    _, retrieved_ds = ds.get_nearest_examples("embedding", q_vec, k=candidate_pool_size)
+
+    ce_inputs = [
+        (query, f"{retrieved_ds['title'][i]} {retrieved_ds['abstract'][i]}") for i in range(len(retrieved_ds["title"]))
+    ]
+    ce_scores = ce_model.predict(ce_inputs, batch_size=16)
+
+    sorted_idx = np.argsort(ce_scores)[::-1]
+    return [
+        {"arxiv_id": retrieved_ds["arxiv_id"][i], "ce_score": float(ce_scores[i])} for i in sorted_idx[:retrieval_k]
+    ]
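Editor's note: `search` is a two-stage retrieve-then-rerank pipeline — the bi-encoder plus FAISS index fetches a broad candidate pool cheaply, and the cross-encoder rescores only that pool. A hypothetical usage sketch, assuming the Space's datasets and models have loaded (the ID and scores are illustrative):

```python
from search import search

# Broad FAISS retrieval of 200 candidates, reranked down to the top 5
results = search("video diffusion models", candidate_pool_size=200, retrieval_k=5)
# e.g. [{"arxiv_id": "2403.01234", "ce_score": 2.31}, ...], sorted by ce_score descending
for r in results:
    print(r["arxiv_id"], round(r["ce_score"], 3))
```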
table.py ADDED
@@ -0,0 +1,38 @@
+import datasets
+import polars as pl
+
+BASE_REPO_ID = "hysts-bot-data/daily-papers"
+STATS_REPO_ID = "hysts-bot-data/daily-papers-stats"
+EMBEDDING_REPO_ID = "hysts-bot-data/daily-papers-abstract-index"
+
+
+df_orig = datasets.load_dataset(BASE_REPO_ID, split="train").to_polars()
+df_orig = df_orig.join(
+    datasets.load_dataset(STATS_REPO_ID, split="train").to_polars(), on="arxiv_id", how="left"
+).join(datasets.load_dataset(EMBEDDING_REPO_ID, split="train").to_polars(), on="arxiv_id", how="left")
+
+# format date
+df_orig = df_orig.with_columns(
+    pl.format(
+        "[{}](https://huggingface.co/papers/date/{})",
+        pl.col("date").dt.strftime("%Y-%m-%d"),
+        pl.col("date").dt.strftime("%Y-%m-%d"),
+    ).alias("date_md")
+)
+# format links
+df_orig = df_orig.with_columns(
+    [
+        pl.when(pl.col(col).fill_null("") != pl.lit(""))
+        .then(pl.format("[github]({})", pl.col(col)))
+        .otherwise(pl.lit(""))
+        .alias(f"{col}_md")
+        for col in ["github"]
+    ]
+)
+# format paper page link
+df_orig = df_orig.with_columns(
+    (pl.lit("https://huggingface.co/papers/") + pl.col("arxiv_id")).alias("paper_page")
+).with_columns(pl.format("[{}]({})", pl.col("arxiv_id"), pl.col("paper_page")).fill_null("").alias("paper_page_md"))
+
+# sort by date (descending) and arxiv_id (descending)
+df_orig = df_orig.sort(["date", "arxiv_id"], descending=True)
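Editor's note: the `*_md` columns pre-render markdown links so the Gradio table can display them directly with the "markdown" datatype. A small sketch of the same formatting on toy data (values are illustrative):

```python
import datetime

import polars as pl

# Toy frame standing in for df_orig
toy = pl.DataFrame(
    {
        "arxiv_id": ["2403.01234", "2403.05678"],
        "github": ["https://github.com/example/repo", ""],
        "date": [datetime.date(2024, 3, 4), datetime.date(2024, 3, 5)],
    }
)
toy = toy.with_columns(
    pl.format(
        "[{}](https://huggingface.co/papers/date/{})",
        pl.col("date").dt.strftime("%Y-%m-%d"),
        pl.col("date").dt.strftime("%Y-%m-%d"),
    ).alias("date_md"),
    # Empty github fields render as empty strings rather than broken links
    pl.when(pl.col("github") != "")
    .then(pl.format("[github]({})", pl.col("github")))
    .otherwise(pl.lit(""))
    .alias("github_md"),
)
print(toy.select("date_md", "github_md"))
# date_md   -> "[2024-03-04](https://huggingface.co/papers/date/2024-03-04)"
# github_md -> "[github](https://github.com/example/repo)" or ""
```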
uv.lock CHANGED
The diff for this file is too large to render. See raw diff