Spaces:
Running
on
Zero
Running
on
Zero
| import datasets | |
| import polars as pl | |
# Hugging Face dataset repositories that are combined into one table.
BASE_REPO_ID = "hysts-bot-data/daily-papers"
STATS_REPO_ID = "hysts-bot-data/daily-papers-stats"
EMBEDDING_REPO_ID = "hysts-bot-data/daily-papers-abstract-index"


def _load_train_split(repo_id: str) -> pl.DataFrame:
    """Load the "train" split of the dataset repo *repo_id* as a Polars frame."""
    return datasets.load_dataset(repo_id, split="train").to_polars()


# Start from the base table, then left-join the stats and embedding tables on
# "arxiv_id" so every base row is kept even when the other tables have no match.
df_orig = _load_train_split(BASE_REPO_ID)
df_orig = df_orig.join(_load_train_split(STATS_REPO_ID), on="arxiv_id", how="left")
df_orig = df_orig.join(_load_train_split(EMBEDDING_REPO_ID), on="arxiv_id", how="left")
# format date
# "date_md" is a Markdown link whose text is the YYYY-MM-DD date and whose
# target is https://huggingface.co/papers/date/<YYYY-MM-DD>.
_date_ymd = pl.col("date").dt.strftime("%Y-%m-%d")
df_orig = df_orig.with_columns(
    date_md=pl.format("[{}](https://huggingface.co/papers/date/{})", _date_ymd, _date_ymd)
)
# format links
def _github_link_md(column: str) -> pl.Expr:
    """Build the "<column>_md" expression for a URL column.

    Rows whose URL is null or the empty string yield ""; every other row
    yields the Markdown link "[github](<url>)".
    """
    # fill_null("") folds nulls into the empty-string case, so the condition
    # below is never null and the two branches cover every row.
    url_missing = pl.col(column).fill_null("") == pl.lit("")
    return (
        pl.when(url_missing)
        .then(pl.lit(""))
        .otherwise(pl.format("[github]({})", pl.col(column)))
        .alias(f"{column}_md")
    )


df_orig = df_orig.with_columns([_github_link_md(name) for name in ["github"]])
# format paper page link
# "paper_page" is the plain URL of the paper page built from the arXiv id.
df_orig = df_orig.with_columns(
    paper_page=pl.lit("https://huggingface.co/papers/") + pl.col("arxiv_id"),
)
# "paper_page_md" is the Markdown link "[<arxiv_id>](<paper_page>)"; a null
# result is replaced with "" by fill_null.
df_orig = df_orig.with_columns(
    paper_page_md=pl.format("[{}]({})", pl.col("arxiv_id"), pl.col("paper_page")).fill_null(""),
)
# Order rows newest first: by date descending, then by arxiv_id descending.
df_orig = df_orig.sort(by=["date", "arxiv_id"], descending=[True, True])