| | import operator |
| |
|
| | import datasets |
| | import pandas as pd |
| | from huggingface_hub import HfApi |
| | from math import isnan |
| |
|
# Module-level Hugging Face Hub client, created once at import time.
# NOTE(review): nothing in the code visible here references `api` — confirm
# whether it is still needed before removing it.
api = HfApi()
| |
|
| |
|
class PaperList:
    """Table of CVPR 2024 papers with links to related Hugging Face resources.

    ``df_raw`` holds the dataset as loaded, plus a ``paper_page`` URL column.
    ``df_prettified`` holds the display version, whose columns (names and
    display datatypes) are listed in ``COLUMN_INFO``.
    """

    # [display name, display datatype] for every column of the prettified table.
    COLUMN_INFO = [
        ["ID", "str"],
        ["Title", "str"],
        ["Authors", "str"],
        ["Paper page", "markdown"],
        ["GitHub", "markdown"],
        ["Spaces", "markdown"],
        ["Models", "markdown"],
        ["Datasets", "markdown"],
    ]

    # Placeholder rendered in a cell that has nothing to link to.
    _EMPTY_CELL = " "

    def __init__(self):
        """Load the paper dataset and build its display version."""
        self.df_raw = self.get_df()
        self.df_prettified = self.prettify(self.df_raw)

    @staticmethod
    def _is_missing(value) -> bool:
        """Return True for None, NaN-like scalars and blank strings.

        Unlike ``math.isnan`` this never raises on strings or ``None``.
        """
        if isinstance(value, str):
            return not value.strip()
        return bool(pd.api.types.is_scalar(value) and pd.isna(value))

    @staticmethod
    def _paper_page_url(arxiv_id) -> str:
        """Return the Hugging Face paper page URL, or "" when there is no id."""
        if PaperList._is_missing(arxiv_id):
            return ""
        # NOTE(review): if the arxiv_id column is stored as a float, trailing
        # zeros of the id are lost when it is formatted here — confirm the
        # column dtype in the dataset.
        return f"https://huggingface.co/papers/{arxiv_id}"

    @staticmethod
    def get_df() -> pd.DataFrame:
        """Load the paper dataset and add a ``paper_page`` URL column."""
        df = datasets.load_dataset("CVPR2024/CVPR2024-papers", split="train").to_pandas()
        df["paper_page"] = df["arxiv_id"].apply(PaperList._paper_page_url)
        return df

    @staticmethod
    def create_link(text: str, url: str) -> str:
        """Return an HTML anchor for ``url`` that opens in a new tab."""
        return f'<a href="{url}" target="_blank">{text}</a>'

    @staticmethod
    def _as_list(value) -> list:
        """Normalise a link-column cell to a plain list of items.

        Accepts a real sequence (list, tuple, array), a string-encoded list
        such as ``"['a/b', 'c/d']"`` or ``"[]"``, a bare string (treated as a
        single item), or a missing value (treated as empty).
        """
        import ast  # function-scope import: only needed for string-encoded lists

        if isinstance(value, str):
            text = value.strip()
            if not text or text == "[]":
                return []
            if text.startswith("[") and text.endswith("]"):
                try:
                    # literal_eval only parses literals; it never executes code.
                    parsed = ast.literal_eval(text)
                except (ValueError, TypeError, SyntaxError):
                    return [text]
                return [item for item in parsed if not PaperList._is_missing(item)]
            return [text]
        if PaperList._is_missing(value):
            return []
        if pd.api.types.is_list_like(value):
            return [item for item in value if not PaperList._is_missing(item)]
        return [value]

    @staticmethod
    def _join_links(links) -> str:
        """Join links one per line, or return the empty-cell placeholder."""
        return "\n".join(links) or PaperList._EMPTY_CELL

    @staticmethod
    def _repo_links(value, url_prefix: str) -> str:
        """Render one link per repo id, labelled with the id itself."""
        return PaperList._join_links(
            PaperList.create_link(repo_id, f"{url_prefix}{repo_id}")
            for repo_id in PaperList._as_list(value)
        )

    @staticmethod
    def prettify(df: pd.DataFrame) -> pd.DataFrame:
        """Build the display table from a raw paper table.

        Args:
            df: Frame with the columns ``id``, ``title``, ``authors``,
                ``arxiv_id``, ``paper_page``, ``GitHub``, ``Space``,
                ``Model`` and ``Dataset``.

        Returns:
            A new frame with the columns named in ``COLUMN_INFO``. Cells with
            nothing to link to contain a single space.
        """
        rows = []
        for _, row in df.iterrows():
            arxiv_id = row["arxiv_id"]
            if PaperList._is_missing(arxiv_id):
                paper_page = PaperList._EMPTY_CELL
            else:
                paper_page = PaperList.create_link(arxiv_id, row["paper_page"])
            rows.append(
                {
                    "ID": row["id"],
                    "Title": row["title"],
                    "Authors": row["authors"],
                    "Paper page": paper_page,
                    "GitHub": PaperList._join_links(
                        PaperList.create_link("GitHub", url)
                        for url in PaperList._as_list(row["GitHub"])
                    ),
                    "Spaces": PaperList._repo_links(row["Space"], "https://huggingface.co/spaces/"),
                    "Models": PaperList._repo_links(row["Model"], "https://huggingface.co/"),
                    "Datasets": PaperList._repo_links(row["Dataset"], "https://huggingface.co/datasets/"),
                }
            )
        return pd.DataFrame(rows, columns=PaperList.get_column_names())

    @staticmethod
    def get_column_names():
        """Return the display column names in ``COLUMN_INFO`` order."""
        return list(map(operator.itemgetter(0), PaperList.COLUMN_INFO))

    def get_column_datatypes(self, column_names: list[str]) -> list[str]:
        """Return the display datatype for each of ``column_names``.

        Raises:
            KeyError: If a name is not listed in ``COLUMN_INFO``.
        """
        mapping = dict(self.COLUMN_INFO)
        return [mapping[name] for name in column_names]

    @staticmethod
    def _contains(series: pd.Series, query: str) -> pd.Series:
        """Case-insensitive match of ``query`` against ``series``.

        ``query`` is tried as a regular expression first. If it is not a
        valid pattern (e.g. a lone "("), it is matched as literal text instead
        of raising. Missing values never match.
        """
        import re  # function-scope import: only needed to name re.error

        try:
            return series.str.contains(query, case=False, na=False)
        except (re.error, ValueError):
            # ValueError also covers the error an Arrow-backed string column
            # raises for a pattern it cannot compile.
            return series.str.contains(query, case=False, na=False, regex=False)

    def search(
        self,
        title_search_query: str,
        author_search_query: str,
    ) -> pd.DataFrame:
        """Return the prettified rows whose title and authors match the queries.

        An empty (or None) query places no restriction on its column, so rows
        with a missing title or author are still listed when nothing is typed.
        """
        df = self.df_raw
        filters = (("title", title_search_query), ("authors", author_search_query))
        for column, query in filters:
            if query:
                df = df[self._contains(df[column], query)]
        return self.prettify(df)
| |
|