# upload_dataset_to_hf.py
import argparse
import os
import sys
import subprocess
from datasets import load_dataset, DatasetDict, Features, Value
from huggingface_hub import HfApi, HfFolder, login, HfApi
# Added import for HfFolder
# --- Helper Function to Check Git LFS ---
def check_git_lfs_installed():
    """Report whether the ``git lfs`` command can be run on this machine.

    Runs ``git lfs --version`` with its output captured. If the command
    cannot be started or exits with a non-zero status, a three-line warning
    is printed and ``False`` is returned; the script is deliberately not
    aborted, so callers decide whether git-lfs is mandatory.

    Returns:
        bool: ``True`` when ``git lfs --version`` succeeds, ``False`` otherwise.
    """
    probe_command = ["git", "lfs", "--version"]
    try:
        # check=True turns a non-zero exit status into CalledProcessError.
        subprocess.run(probe_command, check=True, capture_output=True)
    except (FileNotFoundError, subprocess.CalledProcessError):
        warning_lines = (
            "Warning: git-lfs command not found or not configured.",
            " Please install git-lfs and run 'git lfs install --system' (or --user).",
            " See: https://git-lfs.com/",
        )
        for warning_line in warning_lines:
            print(warning_line)
        # Only warn: the caller is allowed to continue without git-lfs.
        return False
    return True
# --- Main Script Logic ---
def _build_parser():
    """Build the command-line parser for the upload script.

    Returns:
        argparse.ArgumentParser: parser with the required ``--local_dir``,
        ``--repo_id`` and ``--data_column`` options and the optional
        ``--train_filename``, ``--val_filename``, ``--test_filename``,
        ``--hf_token`` and ``--private`` options.
    """
    parser = argparse.ArgumentParser(
        description="Upload CSV dataset splits from a local directory to the Hugging Face Hub."
    )
    # --- Required Arguments ---
    parser.add_argument(
        "--local_dir",
        type=str,
        required=True,
        help="Path to the local directory containing the dataset CSV files.",
    )
    parser.add_argument(
        "--repo_id",
        type=str,
        required=True,
        help="The Hugging Face Hub repository ID (e.g., 'username/my-equation-dataset').",
    )
    parser.add_argument(
        "--data_column",
        type=str,
        required=True,
        help="Name of the column in the CSV files containing the actual data (e.g., 'text', 'equation').",
    )
    # --- Optional Arguments ---
    parser.add_argument(
        "--train_filename",
        type=str,
        default=None,
        help="Filename of the training CSV within local_dir (e.g., 'train_data.csv').",
    )
    parser.add_argument(
        "--val_filename",
        type=str,
        default=None,
        help="Filename of the validation CSV within local_dir (e.g., 'validation_set.csv').",
    )
    parser.add_argument(
        "--test_filename",
        type=str,
        default=None,
        help="Filename of the test CSV within local_dir (optional, e.g., 'test_examples.csv').",
    )
    parser.add_argument(
        "--hf_token",
        type=str,
        default=None,
        help="Your Hugging Face Hub access token (with write permissions). If not provided, script will try to use cached token or prompt login.",
    )
    parser.add_argument(
        "--private",
        action='store_true',  # Makes the repo private if flag is present
        help="Set the Hugging Face repository to private.",
    )
    return parser


def _resolve_token(cli_token):
    """Return a Hub token from the CLI value, the cached token, or a login.

    Args:
        cli_token: value of ``--hf_token``; may be ``None`` or empty.

    Returns:
        The token to pass to ``push_to_hub``.

    Exits with status 1 if the login attempt fails or leaves no token behind.
    """
    token = cli_token or HfFolder.get_token()  # Fall back to the cached token
    if token:
        print("Using provided/cached Hugging Face token.")
        return token

    print("\nAttempting Hugging Face login...")
    try:
        login()  # Will prompt user if not logged in via CLI
        token = HfFolder.get_token()  # Get token after successful login
        if not token:
            raise RuntimeError("Login seemed successful but token could not be retrieved.")
    except Exception as e:
        print(f"Error during Hugging Face login: {e}")
        print("Please ensure you are logged in via 'huggingface-cli login' or provide a token using --hf_token.")
        sys.exit(1)
    return token


def _resolve_data_files(args):
    """Work out which CSV file backs each split and check that it exists.

    When a filename option is not given, the default is
    ``<split>_<last component of local_dir>.csv`` (``train_``, ``val_``,
    ``test_``). The train and validation files are mandatory; the test file
    is optional and is skipped with a message when it is missing.

    Args:
        args: parsed command-line namespace.

    Returns:
        dict: split name (``"train"``, ``"validation"``, optionally
        ``"test"``) mapped to the CSV path.

    Exits with status 1 if the train or validation file is missing.
    """
    dir_name = os.path.basename(os.path.normpath(args.local_dir))  # Last part of the path
    train_file = args.train_filename or f"train_{dir_name}.csv"
    val_file = args.val_filename or f"val_{dir_name}.csv"  # Using 'val' as abbreviation
    test_file = args.test_filename or f"test_{dir_name}.csv"

    train_path = os.path.join(args.local_dir, train_file)
    val_path = os.path.join(args.local_dir, val_file)
    test_path = os.path.join(args.local_dir, test_file)
    has_test = os.path.exists(test_path)

    print(f"Using directory: {args.local_dir}")
    print(f"Target Hub repo: {args.repo_id}")
    print(f"Expecting data column: '{args.data_column}'")
    print(f"Using train file: '{train_file}'")
    print(f"Using validation file: '{val_file}'")
    # Announce the test file only once we know it is really there, so the
    # user never sees "Using test file" followed by a "not found" warning.
    if has_test:
        print(f"Using test file: '{test_file}'")
    elif args.test_filename:
        print(f"Warning: Specified test file '{args.test_filename}' not found at '{test_path}'. Skipping test split.")
    else:
        print("No test file specified or default test file not found, skipping.")

    data_files = {}
    required_splits = (
        ("train", "Training", train_path),
        ("validation", "Validation", val_path),
    )
    for split, label, path in required_splits:
        if not os.path.exists(path):
            print(f"Error: {label} file not found at '{path}'")
            sys.exit(1)
        data_files[split] = path
    if has_test:
        data_files["test"] = test_path
    return data_files


def _load_splits(data_files, data_column):
    """Load the CSV splits and confirm the data column is present in each.

    Args:
        data_files: mapping of split name to CSV path.
        data_column: name of the column holding the data.

    Returns:
        The object returned by ``load_dataset("csv", ...)``.

    Exits with status 1 if loading fails or a split lacks ``data_column``.
    """
    print("\nLoading local CSV files...")
    try:
        # Define features to ensure the data column is read as string.
        # NOTE(review): only the data column is declared here; how extra CSV
        # columns are treated depends on the installed `datasets` version -
        # confirm against the versions this script must support.
        features = Features({data_column: Value('string')})
        dataset_dict = load_dataset("csv", data_files=data_files, features=features)
        print("Local dataset loaded successfully:")
        print(dataset_dict)
    except Exception as e:
        print(f"Error loading dataset from CSV files: {e}")
        print("Please check file paths, CSV format, and column names.")
        sys.exit(1)

    # Verify the data column exists in every loaded split.
    for split in dataset_dict:
        column_names = dataset_dict[split].column_names
        if data_column not in column_names:
            print(f"Error: Column '{data_column}' not found in loaded '{split}' split.")
            print(f"Available columns: {column_names}")
            sys.exit(1)
    return dataset_dict


def _standardize_text_column(dataset_dict, data_column):
    """Rename ``data_column`` to ``'text'`` so the Hub copy has a standard name.

    The rename is best-effort: on failure the error is printed and the
    dataset is returned with its original column name.
    """
    if data_column == 'text':
        return dataset_dict

    print(f"Renaming column '{data_column}' to 'text'...")
    try:
        dataset_dict = dataset_dict.rename_column(data_column, "text")
        print("Column renamed successfully.")
        print(dataset_dict)
    except Exception as e:
        # Deliberately non-fatal: proceed with the original column name.
        print(f"Error renaming column: {e}")
    return dataset_dict


def _push_dataset(dataset_dict, repo_id, private, token):
    """Push the dataset to the Hub, printing its URL or a troubleshooting list.

    Exits with status 1 if ``push_to_hub`` raises.
    """
    print(f"\nAttempting to push dataset to Hub repository: {repo_id}...")
    try:
        dataset_dict.push_to_hub(
            repo_id=repo_id,
            private=private,
            token=token,  # Pass token explicitly
        )
        print("\n--- Upload Successful! ---")
        hub_url = f"https://huggingface.co/datasets/{repo_id}"
        print(f"Dataset available at: {hub_url}")
    except Exception as e:
        print("\n--- Error During Upload ---")
        print(f"An error occurred: {e}")
        print("Possible causes:")
        print("- Invalid Hugging Face token or insufficient permissions (needs write access).")
        print("- Repository ID format incorrect (should be 'username/dataset_name').")
        print("- Network issues.")
        print("- Git LFS not installed or properly configured.")
        print("- Conflicts if the repository already exists with incompatible content.")
        sys.exit(1)


def main():
    """Upload local CSV dataset splits to a Hugging Face Hub dataset repo.

    Steps: parse the command line, warn if git-lfs is unavailable, obtain a
    Hub token, locate the split files, load them, rename the data column to
    ``'text'`` and push the result. Any fatal problem ends the process with
    exit status 1 (argparse itself exits with status 2 on bad arguments).
    """
    args = _build_parser().parse_args()
    print("--- Starting Dataset Upload Script ---")

    # 1. Check Git LFS (warns only; the return value is intentionally unused)
    print("Checking for git-lfs...")
    check_git_lfs_installed()

    # 2. Handle Authentication
    token = _resolve_token(args.hf_token)

    # 3-4. Determine filenames, build paths and check existence
    data_files = _resolve_data_files(args)

    # 5. Load Dataset Locally
    dataset_dict = _load_splits(data_files, args.data_column)

    # 6. Standardize the data column name to 'text'
    dataset_dict = _standardize_text_column(dataset_dict, args.data_column)

    # 7. Push to Hub
    _push_dataset(dataset_dict, args.repo_id, args.private, token)
# Run the upload only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()