Spaces:
No application file
No application file
| # ============================== | |
| # 準備資料集 | |
| # ============================== | |
| from datasets import Dataset, DatasetDict | |
| from huggingface_hub import HfApi, HfFolder | |
| import pandas as pd | |
| import os | |
# ==============================
# Configuration
# ==============================
csv_path = "./datasets/intent_classification_200.csv"  # path to the source CSV
test_sample_size = 25  # number of rows to hold out for the test split
repo_name = "picard47at/dataset1"  # target Hugging Face dataset repo id
# Hugging Face access token. Prefer the conventional HF_TOKEN variable and
# fall back to TOGETHER_API_KEY, the name this script originally read, so
# existing deployments keep working. Raises KeyError when neither is set.
hf_token = os.environ.get("HF_TOKEN") or os.environ["TOGETHER_API_KEY"]
# ==============================
# Load the CSV
# ==============================
# Read the file and keep only the `text` and `intent` columns, in that order.
df = pd.read_csv(csv_path).loc[:, ['text', 'intent']]
# Total number of rows available for splitting.
num_samples = df.shape[0]
# ==============================
# Split into train / test and wrap the result in a DatasetDict
# ==============================
if num_samples > test_sample_size:
    # Shuffle every row with a fixed seed so the split is reproducible.
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    # The first `test_sample_size` shuffled rows form the test split;
    # all remaining rows form the train split.
    test_df = df.iloc[:test_sample_size]
    train_df = df.iloc[test_sample_size:]
    # Build the DatasetDict holding both splits.
    dataset_dict = DatasetDict(
        {
            "train": Dataset.from_pandas(train_df),
            "test": Dataset.from_pandas(test_df),
        }
    )
    print(f"分配 {len(train_df)} 筆到 train,{len(test_df)} 筆到 test")
else:
    # Too few rows to hold out a test split: everything goes to train.
    dataset_dict = DatasetDict({"train": Dataset.from_pandas(df)})
    print(f"資料筆數不足 {test_sample_size},全部 {num_samples} 筆作為訓練資料")
# ==============================
# Authenticate and publish to the Hugging Face Hub
# ==============================
# Save the token through HfFolder, exactly as the original script did.
HfFolder.save_token(hf_token)
# Also hand the token to the client explicitly, so the calls below
# authenticate with `hf_token` instead of relying on implicitly discovered
# credentials.
api = HfApi(token=hf_token)
# Create the dataset repo if it does not exist yet.
api.create_repo(repo_name, repo_type="dataset", exist_ok=True)
# push_to_hub uploads the whole DatasetDict (all splits) in one call.
dataset_dict.push_to_hub(repo_name, token=hf_token)
print(f"資料集上傳完成:https://huggingface.co/datasets/{repo_name}")
# Additionally write the DatasetDict to the current working directory.
dataset_dict.save_to_disk("./")