| import os |
| import pandas as pd |
| from multiprocessing import Pool |
| import time |
| from tqdm import tqdm |
|
|
def process_rows(args):
    """Write one text file per DataFrame row.

    Args:
        args: A ``(rows, output_directory)`` tuple, packed as a single
            argument so the function can be used with ``Pool.map``.
            ``rows`` is a pandas DataFrame slice; ``output_directory``
            is an existing directory path.

    Side effects:
        For each row, writes ``row_<index>.txt`` (index taken from the
        DataFrame index) containing the row's values joined by commas.
    """
    rows, output_directory = args
    for index, row in rows.iterrows():
        text_filename = f"row_{index}.txt"
        text_file_path = os.path.join(output_directory, text_filename)

        # Explicit UTF-8: the default encoding is platform-dependent and
        # can mangle non-ASCII CSV values (e.g. cp1252 on Windows).
        with open(text_file_path, 'w', encoding='utf-8') as text_file:
            text_file.write(','.join(row.astype(str)))
|
|
| |
def main():
    """Extract up to ``target_count`` rows from the CSVs in
    ``csv_directory``, writing one text file per row into
    ``output_directory`` using a worker pool.

    Fixes over the original flat script:
      * Runs under ``if __name__ == "__main__"`` — required by
        multiprocessing's spawn start method (Windows/macOS default),
        where workers re-import this module.
      * One ``Pool`` is created and reused for all CSVs, and each file's
        rows are split into per-worker chunks so the pool actually runs
        in parallel (previously a new pool got a single task per file).
      * Rows are re-indexed with a global offset so ``row_<n>.txt``
        names never collide across CSVs (previously each file's index
        restarted at 0 and later files overwrote earlier output).
    """
    csv_directory = "extracted_csv_files"
    target_count = 50000

    csv_files = [
        os.path.join(csv_directory, name)
        for name in os.listdir(csv_directory)
        if name.endswith(".csv")
    ]

    output_directory = "extracted_text_files_50k"
    os.makedirs(output_directory, exist_ok=True)

    total_count = 0
    start_time = time.time()
    progress_bar = tqdm(total=target_count, unit='files')

    workers = os.cpu_count() or 1

    with Pool() as pool:
        for csv_file_path in csv_files:
            if total_count >= target_count:
                break

            df = pd.read_csv(csv_file_path)
            rows_to_extract = min(target_count - total_count, len(df))
            if rows_to_extract == 0:
                continue  # empty CSV — nothing to hand to the pool

            rows = df.iloc[:rows_to_extract].copy()
            # Globally unique index -> globally unique output filenames.
            rows.index = range(total_count, total_count + rows_to_extract)

            # Ceil-divide so every worker gets roughly equal work.
            chunk_size = -(-rows_to_extract // workers)
            pool.map(
                process_rows,
                [
                    (rows.iloc[start:start + chunk_size], output_directory)
                    for start in range(0, rows_to_extract, chunk_size)
                ],
            )

            total_count += rows_to_extract
            progress_bar.update(rows_to_extract)

    progress_bar.close()

    execution_time = time.time() - start_time

    print(f"\nGenerated {total_count} text files.")
    print(f"Execution time: {execution_time:.2f} seconds.")


if __name__ == "__main__":
    main()
|
|