File size: 8,653 Bytes
3742716
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
# upload_dataset_to_hf.py

import argparse
import os
import sys
import subprocess
from datasets import load_dataset, DatasetDict, Features, Value
from huggingface_hub import HfApi, HfFolder, login, HfApi
# Added import for HfFolder

# --- Helper Function to Check Git LFS ---
def check_git_lfs_installed():
    """Return True if the git-lfs CLI is available, otherwise warn and return False.

    Probes by invoking ``git lfs --version``; a missing ``git`` binary or a
    non-zero exit both count as "not installed". Deliberately non-fatal so the
    caller can continue and let the eventual push fail with a clearer error.
    """
    try:
        subprocess.run(["git", "lfs", "--version"], check=True, capture_output=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        # Warn but do not abort — git-lfs may not be strictly required.
        print("Warning: git-lfs command not found or not configured.")
        print("         Please install git-lfs and run 'git lfs install --system' (or --user).")
        print("         See: https://git-lfs.com/")
        return False
    return True

# --- Main Script Logic ---
def main():
    """CLI entry point: load local CSV dataset splits and push them to the HF Hub.

    Steps: parse arguments, warn if git-lfs is missing, resolve an auth token,
    locate and validate the split CSV files, load them as a DatasetDict,
    standardize the data column name to 'text', and upload to the Hub.
    Exits with status 1 on any unrecoverable failure.
    """
    args = _parse_args()

    print("--- Starting Dataset Upload Script ---")

    # 1. Check Git LFS (warn-only; the upload may still succeed without it).
    print("Checking for git-lfs...")
    check_git_lfs_installed()

    # 2. Authentication: explicit token > cached token > interactive login.
    token = _resolve_token(args.hf_token)

    # 3/4. Resolve split filenames and verify the files exist on disk.
    data_files = _collect_data_files(args)

    # 5. Load the CSVs locally and sanity-check the data column.
    dataset_dict = _load_and_validate(data_files, args.data_column)

    # 6. Standardize the main data column name to 'text' on the Hub.
    dataset_dict = _rename_to_text(dataset_dict, args.data_column)

    # 7. Upload.
    _push_to_hub(dataset_dict, args.repo_id, args.private, token)


def _parse_args():
    """Define and parse the command-line arguments for the upload script."""
    parser = argparse.ArgumentParser(
        description="Upload CSV dataset splits from a local directory to the Hugging Face Hub."
    )

    # --- Required Arguments ---
    parser.add_argument(
        "--local_dir",
        type=str,
        required=True,
        help="Path to the local directory containing the dataset CSV files."
    )
    parser.add_argument(
        "--repo_id",
        type=str,
        required=True,
        help="The Hugging Face Hub repository ID (e.g., 'username/my-equation-dataset')."
    )
    parser.add_argument(
        "--data_column",
        type=str,
        required=True,
        help="Name of the column in the CSV files containing the actual data (e.g., 'text', 'equation')."
    )

    # --- Optional Arguments ---
    parser.add_argument(
        "--train_filename",
        type=str,
        default=None,
        help="Filename of the training CSV within local_dir (e.g., 'train_data.csv')."
    )
    parser.add_argument(
        "--val_filename",
        type=str,
        default=None,
        help="Filename of the validation CSV within local_dir (e.g., 'validation_set.csv')."
    )
    parser.add_argument(
        "--test_filename",
        type=str,
        default=None,
        help="Filename of the test CSV within local_dir (optional, e.g., 'test_examples.csv')."
    )
    parser.add_argument(
        "--hf_token",
        type=str,
        default=None,
        help="Your Hugging Face Hub access token (with write permissions). If not provided, script will try to use cached token or prompt login."
    )
    parser.add_argument(
        "--private",
        action='store_true',  # Makes the repo private if flag is present
        help="Set the Hugging Face repository to private."
    )
    return parser.parse_args()


def _resolve_token(cli_token):
    """Return a usable Hugging Face token, or exit(1) if none can be obtained.

    Precedence: the --hf_token argument, then the locally cached token,
    then an interactive ``login()`` prompt.
    """
    token = cli_token or HfFolder.get_token()  # CLI arg wins over cached token
    if token:
        print("Using provided/cached Hugging Face token.")
        # push_to_hub will fail later if the token turns out to be invalid.
        return token

    print("\nAttempting Hugging Face login...")
    try:
        login()  # Will prompt user if not logged in via CLI
        token = HfFolder.get_token()  # Get token after successful login
        if not token:
            # login() returned without error yet no token was cached.
            raise RuntimeError("Login seemed successful but token could not be retrieved.")
    except Exception as e:
        print(f"Error during Hugging Face login: {e}")
        print("Please ensure you are logged in via 'huggingface-cli login' or provide a token using --hf_token.")
        sys.exit(1)
    return token


def _collect_data_files(args):
    """Map split names ('train'/'validation'/'test') to existing CSV paths.

    Filenames default to '<split>_<dirname>.csv' when not given explicitly.
    Missing train or validation files are fatal (exit 1); the test split is
    optional and is skipped — with a warning if the user explicitly named a
    file — when it does not exist.
    """
    dir_name = os.path.basename(os.path.normpath(args.local_dir))  # last path component

    train_file = args.train_filename or f"train_{dir_name}.csv"
    val_file = args.val_filename or f"val_{dir_name}.csv"  # Using 'val' as abbreviation
    test_file = args.test_filename or f"test_{dir_name}.csv"

    print(f"Using directory: {args.local_dir}")
    print(f"Target Hub repo: {args.repo_id}")
    print(f"Expecting data column: '{args.data_column}'")
    print(f"Using train file: '{train_file}'")
    print(f"Using validation file: '{val_file}'")

    data_files = {}

    train_path = os.path.join(args.local_dir, train_file)
    if not os.path.exists(train_path):
        print(f"Error: Training file not found at '{train_path}'")
        sys.exit(1)
    data_files["train"] = train_path

    val_path = os.path.join(args.local_dir, val_file)
    if not os.path.exists(val_path):
        print(f"Error: Validation file not found at '{val_path}'")
        sys.exit(1)
    data_files["validation"] = val_path

    test_path = os.path.join(args.local_dir, test_file)
    if os.path.exists(test_path):
        # Only announce the test file once we know it actually exists
        # (previously it was announced even for a missing user-specified file).
        print(f"Using test file: '{test_file}'")
        data_files["test"] = test_path
    elif args.test_filename:
        # User explicitly asked for a test split but the file is absent.
        print(f"Warning: Specified test file '{args.test_filename}' not found at '{test_path}'. Skipping test split.")
    else:
        print("No test file specified or default test file not found, skipping.")

    return data_files


def _load_and_validate(data_files, data_column):
    """Load the CSV splits as a DatasetDict and verify ``data_column`` exists.

    Exits with status 1 when loading fails or the column is missing.
    """
    print("\nLoading local CSV files...")
    try:
        # Define features to ensure the data column is read as string
        # (otherwise numeric-looking rows could be inferred as numbers).
        features = Features({data_column: Value('string')})
        dataset_dict = load_dataset("csv", data_files=data_files, features=features)
        print("Local dataset loaded successfully:")
        print(dataset_dict)

        # Verify the data column exists in every loaded split.
        for split in dataset_dict:
            if data_column not in dataset_dict[split].column_names:
                print(f"Error: Column '{data_column}' not found in loaded '{split}' split.")
                print(f"Available columns: {dataset_dict[split].column_names}")
                sys.exit(1)
    except Exception as e:
        print(f"Error loading dataset from CSV files: {e}")
        print("Please check file paths, CSV format, and column names.")
        sys.exit(1)
    return dataset_dict


def _rename_to_text(dataset_dict, data_column):
    """Rename ``data_column`` to 'text' (Hub convention); no-op if already 'text'.

    A rename failure is non-fatal: the upload proceeds with the original
    column name.
    """
    if data_column == 'text':
        return dataset_dict

    print(f"Renaming column '{data_column}' to 'text'...")
    try:
        dataset_dict = dataset_dict.rename_column(data_column, "text")
        print("Column renamed successfully.")
        print(dataset_dict)
    except Exception as e:
        # Proceed with the original column name rather than aborting the upload.
        print(f"Error renaming column: {e}")
    return dataset_dict


def _push_to_hub(dataset_dict, repo_id, private, token):
    """Push the DatasetDict to the Hub; exit(1) with diagnostics on failure."""
    print(f"\nAttempting to push dataset to Hub repository: {repo_id}...")
    try:
        dataset_dict.push_to_hub(
            repo_id=repo_id,
            private=private,
            token=token  # Pass token explicitly
            )
        print("\n--- Upload Successful! ---")
        hub_url = f"https://huggingface.co/datasets/{repo_id}"
        print(f"Dataset available at: {hub_url}")

    except Exception as e:
        print(f"\n--- Error During Upload ---")
        print(f"An error occurred: {e}")
        print("Possible causes:")
        print("- Invalid Hugging Face token or insufficient permissions (needs write access).")
        print("- Repository ID format incorrect (should be 'username/dataset_name').")
        print("- Network issues.")
        print("- Git LFS not installed or properly configured.")
        print("- Conflicts if the repository already exists with incompatible content.")
        sys.exit(1)

# Script entry point: run the uploader only when executed directly, not on import.
if __name__ == "__main__":
    main()