Spaces:
Sleeping
Sleeping
File size: 1,735 Bytes
4e5fc16 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
"""
Text file processing script
This script processes all .txt files in a specified directory applying
the following transformations:
1. Removes notes: Suppresses all text that appears after the "|" symbol
in each line, including the "|" symbol itself.
2. Consolidates lines: Replaces line breaks that do NOT follow a period
with a space, allowing text to flow continuously.
3. Preserves paragraphs: Keeps paragraphs intact by preserving empty lines
(double line breaks).
Processed files are saved with the same name preceded by an underscore "_".
Usage: python process.py <directory>
Example: python process.py ./data/raw
"""
import os
import re
import sys

# Matches a note: the "|" marker, everything after it on the same line, and
# any spaces/tabs immediately before it. Consuming the leading blanks matters:
# otherwise "Sentence. | note" would leave "Sentence. " behind, and the
# trailing space would hide the period from the line-joining rule below.
NOTE_PATTERN = re.compile(r'[ \t]*\|[^\n]*')

# Matches a single line break that is neither preceded by a period or another
# line break nor followed by a line break, i.e. a break in the middle of a
# sentence rather than a sentence end or a paragraph separator.
LINE_BREAK_PATTERN = re.compile(r'(?<!\n)(?<!\.)\n(?!\n)')


def process_text(content: str) -> str:
    """Return *content* with notes removed and mid-sentence breaks joined.

    Args:
        content: Full text of one input file.

    Returns:
        The text with every "|" note (and the blanks before it) removed and
        every line break that does not follow a period replaced by a single
        space. Empty lines between paragraphs are left untouched.
    """
    without_notes = NOTE_PATTERN.sub('', content)
    return LINE_BREAK_PATTERN.sub(' ', without_notes)


def process_directory(input_dir: str) -> list:
    """Process every ``.txt`` file in *input_dir* and write ``_<name>`` copies.

    Args:
        input_dir: Path of an existing directory.

    Returns:
        The list of output paths that were written.
    """
    names = os.listdir(input_dir)
    present = set(names)
    written = []
    for filename in names:
        if not filename.endswith(".txt"):
            continue
        input_path = os.path.join(input_dir, filename)
        # A directory may also be named "something.txt"; opening it would fail.
        if not os.path.isfile(input_path):
            continue
        # "_X.txt" next to "X.txt" is the output of an earlier run; processing
        # it again would pile up "__X.txt", "___X.txt", ... on every rerun.
        if filename.startswith("_") and filename[1:] in present:
            continue
        output_path = os.path.join(input_dir, f"_{filename}")
        with open(input_path, 'r', encoding='utf-8') as infile:
            content = infile.read()
        with open(output_path, 'w', encoding='utf-8') as outfile:
            outfile.write(process_text(content))
        written.append(output_path)
    return written


def main(argv=None) -> int:
    """Run the command-line interface and return the process exit code.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        0 on success, 1 when the arguments are wrong or the directory does
        not exist.
    """
    args = sys.argv if argv is None else argv
    # Exactly one positional argument (the directory) is required.
    if len(args) != 2:
        print("Usage: python process.py <directory>")
        return 1
    input_dir = args[1]
    if not os.path.isdir(input_dir):
        print(f"The directory {input_dir} does not exist.")
        return 1
    process_directory(input_dir)
    return 0


if __name__ == "__main__":
    sys.exit(main())