# Builds `df_orig`: the daily-papers table joined with its stats and
# abstract-index datasets, plus markdown-formatted link columns, sorted
# newest first.
import datasets
import polars as pl

BASE_REPO_ID = "hysts-bot-data/daily-papers"
STATS_REPO_ID = "hysts-bot-data/daily-papers-stats"
EMBEDDING_REPO_ID = "hysts-bot-data/daily-papers-abstract-index"


def _load_train_frame(repo_id: str) -> pl.DataFrame:
    """Load the "train" split of a dataset repo and convert it to polars."""
    return datasets.load_dataset(repo_id, split="train").to_polars()


def _with_markdown_columns(frame: pl.DataFrame) -> pl.DataFrame:
    """Append the `date_md`, `github_md`, `paper_page` and `paper_page_md` columns."""
    # Both placeholders of the date link use the same YYYY-MM-DD string.
    day = pl.col("date").dt.strftime("%Y-%m-%d")
    date_link = pl.format("[{}](https://huggingface.co/papers/date/{})", day, day)

    # A null or empty github URL yields an empty cell rather than a dead link.
    github_url = pl.col("github")
    has_github = github_url.fill_null("") != pl.lit("")
    github_link = (
        pl.when(has_github)
        .then(pl.format("[github]({})", github_url))
        .otherwise(pl.lit(""))
    )

    # The paper page URL is derived from the arxiv id; the markdown variant
    # falls back to an empty string when the formatted value is null.
    page_url = pl.lit("https://huggingface.co/papers/") + pl.col("arxiv_id")
    page_link = pl.format("[{}]({})", pl.col("arxiv_id"), page_url).fill_null("")

    return frame.with_columns(
        date_link.alias("date_md"),
        github_link.alias("github_md"),
    ).with_columns(
        page_url.alias("paper_page"),
        page_link.alias("paper_page_md"),
    )


# Left-join the stats and embedding datasets onto the base table by arxiv id.
df_orig = (
    _load_train_frame(BASE_REPO_ID)
    .join(_load_train_frame(STATS_REPO_ID), on="arxiv_id", how="left")
    .join(_load_train_frame(EMBEDDING_REPO_ID), on="arxiv_id", how="left")
)

# Add the link columns, then sort by date and arxiv_id, both descending.
df_orig = _with_markdown_columns(df_orig).sort(by=["date", "arxiv_id"], descending=True)