|
|
import pandas as pd |
|
|
import random |
|
|
import math |
|
|
import argparse |
|
|
import os |
|
|
|
|
|
def split_dataset(filename, train_ratio): |
|
|
df = pd.read_csv(filename) |
|
|
|
|
|
|
|
|
df = df.sample(frac=1, random_state=42).reset_index(drop=True) |
|
|
|
|
|
train_size = math.floor(len(df) * train_ratio) |
|
|
remaining_data = df.iloc[train_size:] |
|
|
|
|
|
val_size = test_size = math.floor(len(remaining_data) / 2) |
|
|
|
|
|
train_data = df.iloc[:train_size] |
|
|
val_data = remaining_data.iloc[:val_size] |
|
|
test_data = remaining_data.iloc[val_size:] |
|
|
|
|
|
base_filename = os.path.splitext(os.path.basename(filename))[0] |
|
|
output_dir = os.path.dirname(filename) |
|
|
|
|
|
train_filename = os.path.join(output_dir, base_filename + '_train.csv') |
|
|
val_filename = os.path.join(output_dir, base_filename + '_val.csv') |
|
|
test_filename = os.path.join(output_dir, base_filename + '_test.csv') |
|
|
|
|
|
train_data.to_csv(train_filename, index=False) |
|
|
val_data.to_csv(val_filename, index=False) |
|
|
test_data.to_csv(test_filename, index=False) |
|
|
|
|
|
print("Dataset split completed.") |
|
|
print("Train data saved as:", train_filename) |
|
|
print("Validation data saved as:", val_filename) |
|
|
print("Test data saved as:", test_filename) |
|
|
|
|
|
if __name__ == '__main__': |
|
|
parser = argparse.ArgumentParser(description='Split a CSV dataset into train, validation, and test sets.') |
|
|
parser.add_argument('filename', type=str, help='the filename of the CSV dataset') |
|
|
parser.add_argument('train_ratio', type=float, help='the split ratio for the train set') |
|
|
|
|
|
args = parser.parse_args() |
|
|
|
|
|
split_dataset(args.filename, args.train_ratio) |
|
|
|