""" Text file processing script This script processes all .txt files in a specified directory applying the following transformations: 1. Removes notes: Suppresses all text that appears after the "|" symbol in each line, including the "|" symbol itself. 2. Consolidates lines: Replaces line breaks that do NOT follow a period with a space, allowing text to flow continuously. 3. Preserves paragraphs: Keeps paragraphs intact by preserving empty lines (double line breaks). Processed files are saved with the same name preceded by an underscore "_". Usage: python process.py Example: python process.py ./data/raw """ import sys import re # Raises an error if the required arguments are not provided if len(sys.argv) != 2: print("Usage: python process.py ") sys.exit(1) import os input_dir = sys.argv[1] # Verifies that the directory exists if not os.path.isdir(input_dir): print(f"The directory {input_dir} does not exist.") sys.exit(1) # Processes each .txt file in the directory for filename in os.listdir(input_dir): if filename.endswith(".txt"): input_path = os.path.join(input_dir, filename) output_path = os.path.join(input_dir, f"_{filename}") with open(input_path, 'r', encoding='utf-8') as infile: content = infile.read() # Removes notes (text after |) including the | content = re.sub(r'\|[^\n]*', '', content) # Replaces line breaks that do NOT follow a period # but preserves empty lines (double line breaks) processed_content = re.sub(r'(?