| import os |
| import pandas as pd |
| from multiprocessing import Pool |
| import time |
| from tqdm import tqdm |
|
|
def process_rows(args):
    """Write one text file per DataFrame row.

    Args:
        args: A ``(rows, output_directory)`` tuple, packed as a single
            argument so the function can be used with ``Pool.map``.
            ``rows`` is a pandas DataFrame slice; ``output_directory``
            is an existing directory path.

    Side effects:
        For each row, writes ``row_<index>.txt`` (index taken from the
        DataFrame index) containing the row's values joined by commas.
    """
    rows, output_directory = args
    for index, row in rows.iterrows():
        text_filename = f"row_{index}.txt"
        text_file_path = os.path.join(output_directory, text_filename)

        # Explicit UTF-8: the default encoding is platform-dependent and
        # can mangle non-ASCII CSV values (e.g. cp1252 on Windows).
        with open(text_file_path, 'w', encoding='utf-8') as text_file:
            text_file.write(','.join(row.astype(str)))
|
|
| |
def main():
    """Extract up to ``target_count`` rows from the CSVs in
    ``csv_directory``, writing one text file per row into
    ``output_directory`` using a worker pool.

    Fixes over the original flat script:
      * Runs under ``if __name__ == "__main__"`` — required by
        multiprocessing's spawn start method (Windows/macOS default),
        where workers re-import this module.
      * One ``Pool`` is created and reused for all CSVs, and each file's
        rows are split into per-worker chunks so the pool actually runs
        in parallel (previously a new pool got a single task per file).
      * Rows are re-indexed with a global offset so ``row_<n>.txt``
        names never collide across CSVs (previously each file's index
        restarted at 0 and later files overwrote earlier output).
    """
    csv_directory = "extracted_csv_files"
    target_count = 50000

    csv_files = [
        os.path.join(csv_directory, name)
        for name in os.listdir(csv_directory)
        if name.endswith(".csv")
    ]

    output_directory = "extracted_text_files_50k"
    os.makedirs(output_directory, exist_ok=True)

    total_count = 0
    start_time = time.time()
    progress_bar = tqdm(total=target_count, unit='files')

    workers = os.cpu_count() or 1

    with Pool() as pool:
        for csv_file_path in csv_files:
            if total_count >= target_count:
                break

            df = pd.read_csv(csv_file_path)
            rows_to_extract = min(target_count - total_count, len(df))
            if rows_to_extract == 0:
                continue  # empty CSV — nothing to hand to the pool

            rows = df.iloc[:rows_to_extract].copy()
            # Globally unique index -> globally unique output filenames.
            rows.index = range(total_count, total_count + rows_to_extract)

            # Ceil-divide so every worker gets roughly equal work.
            chunk_size = -(-rows_to_extract // workers)
            pool.map(
                process_rows,
                [
                    (rows.iloc[start:start + chunk_size], output_directory)
                    for start in range(0, rows_to_extract, chunk_size)
                ],
            )

            total_count += rows_to_extract
            progress_bar.update(rows_to_extract)

    progress_bar.close()

    execution_time = time.time() - start_time

    print(f"\nGenerated {total_count} text files.")
    print(f"Execution time: {execution_time:.2f} seconds.")


if __name__ == "__main__":
    main()
|
|