# BERT-training / dataset_handlig.py
# picard.tseng (first commit: 050259a)
# ==============================
# Prepare the dataset
# ==============================
from datasets import Dataset, DatasetDict
from huggingface_hub import HfApi, HfFolder
import pandas as pd
import os
# ==============================
# Parameters
# ==============================
csv_path = "./datasets/intent_classification_200.csv" # path to your CSV file
test_sample_size = 25 # number of rows to allocate to the test split
repo_name = "picard47at/dataset1" # Hugging Face dataset repo name
hf_token = os.environ["HF_TOKEN"] # Hugging Face access token (env var name assumed)
# ==============================
# Read the CSV
# ==============================
df = pd.read_csv(csv_path)
# Keep only the text and intent columns
df = df[['text', 'intent']]
# Total number of rows
num_samples = len(df)
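# Optional cleanup (a sketch, left disabled: whether the CSV contains empty
# or duplicate rows is an assumption, not something the pipeline requires):
'''
df = df.dropna(subset=['text', 'intent'])   # drop rows missing either field
df = df.drop_duplicates(subset=['text'])    # drop repeated texts
num_samples = len(df)                       # recompute after cleanup
'''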
# ==============================
# Split into train / test and convert to a DatasetDict
# ==============================
if num_samples <= test_sample_size:
    # Not enough rows to split; put everything in train
    dataset_dict = DatasetDict({
        "train": Dataset.from_pandas(df)
    })
    print(f"Fewer than {test_sample_size} rows; using all {num_samples} rows for training")
else:
    # Shuffle the rows with a fixed seed for reproducibility
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    # Slice off the test set
    test_df = df.iloc[:test_sample_size]
    train_df = df.iloc[test_sample_size:]
    # Build the DatasetDict; preserve_index=False keeps the pandas index
    # from being carried in as an extra __index_level_0__ column
    dataset_dict = DatasetDict({
        "train": Dataset.from_pandas(train_df, preserve_index=False),
        "test": Dataset.from_pandas(test_df, preserve_index=False)
    })
    print(f"Assigned {len(train_df)} rows to train and {len(test_df)} rows to test")
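# Alternative: a stratified split (a sketch, assuming scikit-learn is
# installed) keeps per-intent proportions similar across splits, which
# matters when some intents are rare. Note it raises if any intent has
# fewer than two examples.
'''
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    df,
    test_size=test_sample_size,   # absolute number of test rows
    stratify=df['intent'],        # preserve per-intent proportions
    random_state=42,
)
'''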
# Log in to Hugging Face
HfFolder.save_token(hf_token)
# Upload to Hugging Face Datasets
api = HfApi()
# Create the dataset repo if it does not exist yet
api.create_repo(repo_name, repo_type="dataset", exist_ok=True)
'''
# Alternative: upload a previously exported folder
# (NOTE: output_dir is not defined anywhere in this script)
api.upload_folder(
    folder_path=output_dir,
    path_in_repo=".",
    repo_id=repo_name,
    repo_type="dataset"
)
'''
# ==============================
# Upload to the Hugging Face Hub (push_to_hub)
# ==============================
# push_to_hub uploads the DatasetDict directly
dataset_dict.push_to_hub(repo_name)
print(f"Dataset upload complete: https://huggingface.co/datasets/{repo_name}")
# Also keep a local copy of the dataset on disk
dataset_dict.save_to_disk("./")
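# Verification (a hedged sketch): reload the pushed dataset to confirm the
# splits arrived intact. load_dataset pulls from the Hub; load_from_disk
# reads the local copy written above.
'''
from datasets import load_dataset, load_from_disk

remote = load_dataset(repo_name)   # DatasetDict with "train" (and "test")
local = load_from_disk("./")
print(remote)
print(local["train"][0])           # e.g. {"text": ..., "intent": ...}
'''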