File size: 1,445 Bytes
d1855f9
 
 
 
 
 
 
 
 
 
5e67170
d1855f9
 
 
 
 
 
 
 
 
 
 
5e67170
d1855f9
 
 
 
 
 
 
 
5e67170
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from datetime import date
from typing import Optional

from huggingface_hub import hf_hub_download, list_repo_files
import pandas as pd
from pandas import DataFrame


class DatasetData:
    """Lazy loader for the dated parquet snapshots stored in a Hugging Face dataset repo.

    The repo is listed once, every file whose path starts with ``history`` and
    ends with ``data.parquet`` is downloaded, and the resulting frames are kept
    on the instance keyed by the date found in the second path component.
    """

    # Dataset repository and revision that every hub call is made against.
    __repo_id: str = "Ya-Alex/anki-addons"
    __revision: str = "main"

    def __init__(self):
        """Create a loader with an empty cache; no hub request is made here."""
        # Populated by the first get_parquet_dict() call and reused afterwards.
        self.parquet_files: Optional[dict[date, DataFrame]] = None

    def get_parquet_dict(self) -> dict[date, DataFrame]:
        """Return the snapshot frames keyed by date, loading them on first use.

        Returns:
            The cached mapping of snapshot date to DataFrame. The same dict
            object is returned on every call once it has been built.
        """
        if self.parquet_files is None:
            paths_by_day: dict[date, str] = self.__list_history_parquet_files()
            self.parquet_files = {day: self.__read_parquet(path) for day, path in paths_by_day.items()}
        return self.parquet_files

    @staticmethod
    def __snapshot_day(repo_path: str) -> Optional[date]:
        """Extract the snapshot date from a repo path, or None if it is not a snapshot.

        Args:
            repo_path: Path of a file inside the repo, with ``/`` separators.

        Returns:
            The date parsed from the second path component when the path starts
            with ``history``, ends with ``data.parquet`` and that component is
            an ISO date; otherwise None.
        """
        if not (repo_path.startswith("history") and repo_path.endswith("data.parquet")):
            return None
        parts = repo_path.split("/")
        # A root-level file (e.g. "history_data.parquet") has no date component.
        if len(parts) < 2:
            return None
        try:
            return date.fromisoformat(parts[1])
        except ValueError:
            # Folder name is not a date (e.g. "history/latest/..."): not a snapshot.
            return None

    def __list_history_parquet_files(self) -> dict[date, str]:
        """List the repo and map each snapshot date to its parquet file path.

        Paths that match the prefix/suffix filter but do not carry an ISO date
        as their second component are skipped instead of aborting the listing.

        Returns:
            Mapping of snapshot date to repo-relative file path. If several
            paths share a date, the one listed last wins.
        """
        repo_files: list[str] = list_repo_files(self.__repo_id, repo_type="dataset", revision=self.__revision)
        snapshots: dict[date, str] = {}
        for repo_path in repo_files:
            day = self.__snapshot_day(repo_path)
            if day is not None:
                snapshots[day] = repo_path
        return snapshots

    def __read_parquet(self, parquet_file: str) -> DataFrame:
        """Fetch one parquet file from the dataset repo and load it with pandas.

        Args:
            parquet_file: Repo-relative path of the parquet file.

        Returns:
            The file's contents as a DataFrame.
        """
        local_path = hf_hub_download(repo_id=self.__repo_id, filename=parquet_file, repo_type="dataset",
                                     revision=self.__revision)
        return pd.read_parquet(local_path)