# Spaces: Sleeping
"""
Text file processing script

This script processes all .txt files in a specified directory applying
the following transformations:

1. Removes notes: Suppresses all text that appears after the "|" symbol
   in each line, including the "|" symbol itself and any spaces or tabs
   immediately before it.
2. Consolidates lines: Replaces line breaks that do NOT follow a period
   with a space, allowing text to flow continuously.
3. Preserves paragraphs: Keeps paragraphs intact by preserving empty lines
   (double line breaks).

Processed files are saved with the same name preceded by an underscore "_".
A file "_name.txt" that sits next to "name.txt" is treated as the output of
a previous run and is not processed again.

Usage: python process.py <directory>
Example: python process.py ./data/raw
"""
import os
import re
import sys

# A note starts at "|" and runs to the end of the line. The spaces/tabs in
# front of the "|" are removed as well: otherwise "Done. | note" would leave
# "Done. " behind, the period would no longer sit directly before the line
# break, and the sentence would be wrongly joined to the next line.
NOTE_PATTERN = re.compile(r'[ \t]*\|[^\n]*')

# A single line break that is not preceded by a period and is not part of an
# empty line (neither preceded nor followed by another line break).
SOFT_BREAK_PATTERN = re.compile(r'(?<!\n)(?<!\.)\n(?!\n)')


def process_text(content):
    """Return *content* with notes removed and soft line breaks joined.

    Args:
        content: Full text of one input file.

    Returns:
        The text with every "|" note stripped and every line break that does
        not follow a period replaced by a single space. Empty lines
        (paragraph separators) are left untouched.
    """
    without_notes = NOTE_PATTERN.sub('', content)
    return SOFT_BREAK_PATTERN.sub(' ', without_notes)


def process_directory(input_dir):
    """Process every .txt file in *input_dir*, writing "_<name>" next to it.

    Args:
        input_dir: Path of an existing directory.
    """
    names = set(os.listdir(input_dir))
    # Sorted only to make the processing order deterministic.
    for filename in sorted(names):
        if not filename.endswith(".txt"):
            continue
        # "_x.txt" alongside "x.txt" is the output slot of "x.txt"; feeding it
        # back in would pile up "__x.txt", "___x.txt", ... on every re-run.
        if filename.startswith("_") and filename[1:] in names:
            continue
        input_path = os.path.join(input_dir, filename)
        # A directory whose name ends in ".txt" cannot be opened as a file.
        if not os.path.isfile(input_path):
            continue
        output_path = os.path.join(input_dir, f"_{filename}")
        with open(input_path, 'r', encoding='utf-8') as infile:
            content = infile.read()
        with open(output_path, 'w', encoding='utf-8') as outfile:
            outfile.write(process_text(content))


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Argument list in the shape of ``sys.argv`` (program name first).
            Defaults to ``sys.argv``.

    Exits with status 1 after printing a message when the argument count is
    wrong or the given directory does not exist.
    """
    args = sys.argv if argv is None else argv
    # Exactly one argument (the directory) is required
    if len(args) != 2:
        print("Usage: python process.py <directory>")
        sys.exit(1)
    input_dir = args[1]
    # Verifies that the directory exists
    if not os.path.isdir(input_dir):
        print(f"The directory {input_dir} does not exist.")
        sys.exit(1)
    process_directory(input_dir)


if __name__ == "__main__":
    main()