# daily-papers / table.py -- hysts (HF Staff), commit 78d77c9 ("Update")
import datasets
import polars as pl
# Dataset repository IDs passed to `datasets.load_dataset` below.
BASE_REPO_ID = "hysts-bot-data/daily-papers"
STATS_REPO_ID = "hysts-bot-data/daily-papers-stats"
EMBEDDING_REPO_ID = "hysts-bot-data/daily-papers-abstract-index"


def _load_train_frame(repo_id: str) -> pl.DataFrame:
    """Load the "train" split of *repo_id* and convert it to a polars frame."""
    return datasets.load_dataset(repo_id, split="train").to_polars()


# Start from the base dataset, then left-join the stats and embedding
# datasets on "arxiv_id" (in that order) so every base row is kept.
df_orig = _load_train_frame(BASE_REPO_ID)
for _extra_repo_id in (STATS_REPO_ID, EMBEDDING_REPO_ID):
    df_orig = df_orig.join(_load_train_frame(_extra_repo_id), on="arxiv_id", how="left")
# Add "date_md": a Markdown link whose text is the date formatted as
# YYYY-MM-DD and whose target is the huggingface.co papers page for that date.
_date_ymd = pl.col("date").dt.strftime("%Y-%m-%d")
df_orig = df_orig.with_columns(
    pl.format("[{}](https://huggingface.co/papers/date/{})", _date_ymd, _date_ymd).alias("date_md")
)
# Add a "<col>_md" Markdown link column for each link column listed below.
# A null or empty URL produces an empty string rather than a link.
# The link text is taken from the column name (previously hard-coded to
# "github"), so adding another column to the list labels its links correctly.
df_orig = df_orig.with_columns(
    [
        pl.when(pl.col(col).fill_null("") != pl.lit(""))
        # `{{}}` keeps a literal `{}` placeholder for pl.format to fill with the URL.
        .then(pl.format(f"[{col}]({{}})", pl.col(col)))
        .otherwise(pl.lit(""))
        .alias(f"{col}_md")
        for col in ["github"]
    ]
)
# Add "paper_page" (the huggingface.co papers URL built from "arxiv_id") and
# "paper_page_md" (a Markdown link to that URL labelled with the arXiv ID,
# with nulls replaced by an empty string).
_paper_url = pl.lit("https://huggingface.co/papers/") + pl.col("arxiv_id")
_paper_link = pl.format("[{}]({})", pl.col("arxiv_id"), _paper_url).fill_null("")
df_orig = df_orig.with_columns(
    [
        _paper_url.alias("paper_page"),
        _paper_link.alias("paper_page_md"),
    ]
)
# Order rows latest date first; rows sharing a date are ordered by
# "arxiv_id", also descending.
df_orig = df_orig.sort(["date", "arxiv_id"], descending=[True, True])