File size: 797 Bytes
fe276b5
 
 
 
 
 
f63178d
fe276b5
 
 
 
 
a6dee29
fe276b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import os
from pathlib import Path

import pandas as pd
from datasets import Dataset, DatasetDict

from src.utils.constants import DATASET_REPO_ID, EMBEDDING_MODEL_NAME, MODEL_REPO_ID
from src.utils.utils import get_timestamp

# Hub access token taken from the environment; stays None when HF_TOKEN is unset.
HF_TOKEN = os.getenv("HF_TOKEN")


def save_dataset_to_hf_hub(topic_info_df, corpus, docs, filename):
    """Bundle the run's data into a three-split dataset and push it to the Hub.

    The pushed ``DatasetDict`` has these splits:

    * ``"input"`` -- a single ``"text"`` column built from ``corpus``.
    * ``"processed"`` -- a single ``"text"`` column built from ``docs``.
    * ``"output"`` -- ``topic_info_df`` converted as-is.

    The target repo id is ``DATASET_REPO_ID`` immediately followed by the stem
    of ``filename``, a hyphen, and the value returned by ``get_timestamp()``.
    No separator is inserted after ``DATASET_REPO_ID``.
    NOTE(review): this presumably relies on the constant already ending with
    the right delimiter -- confirm against ``src.utils.constants``.

    Args:
        topic_info_df: pandas DataFrame stored as the ``"output"`` split.
        corpus: Values for the ``"text"`` column of the ``"input"`` split.
        docs: Values for the ``"text"`` column of the ``"processed"`` split.
        filename: Path-like whose stem becomes part of the repo id.

    Returns:
        None. The upload is the only effect.
    """
    splits = DatasetDict(
        {
            "input": Dataset.from_pandas(pd.DataFrame({"text": corpus})),
            "processed": Dataset.from_pandas(pd.DataFrame({"text": docs})),
            "output": Dataset.from_pandas(topic_info_df),
        }
    )

    # Resolve the repo id only after the splits are built, right before upload.
    source_stem = Path(filename).stem
    repo_id = DATASET_REPO_ID + f"{source_stem}-{get_timestamp()}"

    # Uploaded as a private repo, authenticated with the module-level token.
    splits.push_to_hub(repo_id, private=True, token=HF_TOKEN)