import os from pathlib import Path import pandas as pd from datasets import Dataset, DatasetDict from src.utils.constants import DATASET_REPO_ID, EMBEDDING_MODEL_NAME, MODEL_REPO_ID from src.utils.utils import get_timestamp HF_TOKEN = os.environ.get("HF_TOKEN", None) def save_dataset_to_hf_hub(topic_info_df, corpus, docs, filename): raw_df = pd.DataFrame({"text": corpus}) intrim_df = pd.DataFrame({"text": docs}) dataset = DatasetDict( { "input": Dataset.from_pandas(raw_df), "processed": Dataset.from_pandas(intrim_df), "output": Dataset.from_pandas(topic_info_df), } ) dataset.push_to_hub( DATASET_REPO_ID + f"{Path(filename).stem}-{get_timestamp()}", private=True, token=HF_TOKEN, )