File size: 797 Bytes
fe276b5 f63178d fe276b5 a6dee29 fe276b5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 | import os
from pathlib import Path
import pandas as pd
from datasets import Dataset, DatasetDict
from src.utils.constants import DATASET_REPO_ID, EMBEDDING_MODEL_NAME, MODEL_REPO_ID
from src.utils.utils import get_timestamp
# Hugging Face access token read from the environment; None when the variable
# is unset.
HF_TOKEN = os.getenv("HF_TOKEN")
def save_dataset_to_hf_hub(topic_info_df, corpus, docs, filename):
    """Bundle three tables into a DatasetDict and push it to the Hub privately.

    The pushed dataset has three splits:

    * ``"input"``     -- ``corpus`` wrapped in a single ``"text"`` column
    * ``"processed"`` -- ``docs`` wrapped in a single ``"text"`` column
    * ``"output"``    -- ``topic_info_df`` converted as-is

    Args:
        topic_info_df: Table handed straight to ``Dataset.from_pandas``
            (presumably a pandas DataFrame -- confirm against the caller).
        corpus: Values used as the ``"text"`` column of the "input" split.
        docs: Values used as the ``"text"`` column of the "processed" split.
        filename: Path-like name; only its stem (final component without the
            suffix) is used when building the repository id.
    """
    input_split = Dataset.from_pandas(pd.DataFrame({"text": corpus}))
    processed_split = Dataset.from_pandas(pd.DataFrame({"text": docs}))
    output_split = Dataset.from_pandas(topic_info_df)

    splits = DatasetDict(
        {
            "input": input_split,
            "processed": processed_split,
            "output": output_split,
        }
    )

    # The repo id is DATASET_REPO_ID joined directly (no separator added here)
    # to "<file stem>-<timestamp>".
    file_stem = Path(filename).stem
    repo_id = DATASET_REPO_ID + f"{file_stem}-{get_timestamp()}"

    splits.push_to_hub(repo_id, private=True, token=HF_TOKEN)
|