Spaces:
Running
Running
| import csv | |
| import os | |
| import sys | |
| import glob | |
| import tqdm | |
| def split_csv_files(input_files, output_dir, lines_per_file=100000): | |
| # Ensure output directory exists | |
| os.makedirs(output_dir, exist_ok=True) | |
| # Initialize counters | |
| total_lines = 0 | |
| file_count = 0 | |
| current_line_count = 0 | |
| # Initialize the first output file | |
| output_file = os.path.join(output_dir, f"{str(file_count).zfill(3)}.csv") | |
| output_writer = open(output_file, "w", newline="") | |
| csv_writer = None | |
| try: | |
| for file_path in tqdm.tqdm(input_files, desc="Processing files"): | |
| with open(file_path, "r") as csv_file: | |
| csv_reader = csv.reader(csv_file) | |
| # Initialize writer once we have the header row | |
| if csv_writer is None: | |
| header = next(csv_reader) | |
| csv_writer = csv.writer(output_writer) | |
| csv_writer.writerow(header) | |
| # Process each line in the current file | |
| for row in csv_reader: | |
| if current_line_count >= lines_per_file: | |
| # Close the current file and start a new one | |
| output_writer.close() | |
| file_count += 1 | |
| current_line_count = 0 | |
| output_file = os.path.join( | |
| output_dir, f"{str(file_count).zfill(3)}.csv" | |
| ) | |
| output_writer = open(output_file, "w", newline="") | |
| csv_writer = csv.writer(output_writer) | |
| csv_writer.writerow(header) # Write header to new file | |
| # Write row to the current output file | |
| csv_writer.writerow(row) | |
| current_line_count += 1 | |
| total_lines += 1 | |
| finally: | |
| # Close the last output file | |
| if output_writer: | |
| output_writer.close() | |
| print(f"Total lines processed: {total_lines}") | |
| print(f"Files created: {file_count + 1}") | |
| if __name__ == "__main__": | |
| input_dir = "../datasets/YFCC100M/yfcc100m_dataset_with_gps_train" | |
| output_dir = "../datasets/YFCC100M/yfcc100m_dataset_with_gps_train_balanced" | |
| lines_per_file = 100000 | |
| # Get all CSV files in input directory | |
| input_files = glob.glob(os.path.join(input_dir, "*.csv")) | |
| if not input_files: | |
| print(f"No CSV files found in {input_dir}") | |
| sys.exit(1) | |
| print(f"Found {len(input_files)} CSV files") | |
| split_csv_files(input_files, output_dir, lines_per_file) | |