File size: 1,735 Bytes
4e5fc16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
"""
Text file processing script

This script processes all .txt files in a specified directory applying
the following transformations:

1. Removes notes: Suppresses all text that appears after the "|" symbol 
   in each line, including the "|" symbol itself.

2. Consolidates lines: Replaces line breaks that do NOT follow a period 
   with a space, allowing text to flow continuously.

3. Preserves paragraphs: Keeps paragraphs intact by preserving empty lines 
   (double line breaks).

Processed files are saved with the same name preceded by an underscore "_".

Usage: python process.py <directory>
Example: python process.py ./data/raw
"""

import os
import re
import sys

# A note starts at "|" and runs to the end of the line. The spaces/tabs in
# front of the "|" are removed as well: otherwise "Sentence. | note" would
# leave "Sentence. " behind and the trailing space would hide the period
# from the line-consolidation step below.
_NOTE_RE = re.compile(r'[ \t]*\|[^\n]*')

# A single line break that does not follow a period and is not adjacent to
# another line break (i.e. is not part of an empty-line paragraph separator).
_SOFT_BREAK_RE = re.compile(r'(?<!\n)(?<!\.)\n(?!\n)')


def process_text(content: str) -> str:
    """Return *content* with notes removed and soft line breaks joined.

    Args:
        content: Full text of one input file.

    Returns:
        The text where everything from "|" to the end of each line (plus
        the spaces/tabs directly before the "|") is removed, and every
        line break that neither follows a period nor touches an empty
        line is replaced by a single space.
    """
    without_notes = _NOTE_RE.sub('', content)
    return _SOFT_BREAK_RE.sub(' ', without_notes)


def process_directory(input_dir: str) -> None:
    """Process every .txt file in *input_dir*, writing "_<name>" next to it.

    Files that are themselves the output of another file in the same
    directory ("_x.txt" while "x.txt" is present) are skipped, so running
    the script twice does not produce "__x.txt", "___x.txt", and so on.
    Directory entries that are not regular files are skipped as well.

    Args:
        input_dir: Path of the directory to scan (not recursive).
    """
    filenames = os.listdir(input_dir)
    known_names = set(filenames)
    for filename in filenames:
        if not filename.endswith(".txt"):
            continue
        # Skip the products of a previous run; a file that merely starts
        # with "_" but has no matching source is still processed.
        if filename.startswith("_") and filename[1:] in known_names:
            continue
        input_path = os.path.join(input_dir, filename)
        # A directory named "something.txt" cannot be opened as a file.
        if not os.path.isfile(input_path):
            continue
        output_path = os.path.join(input_dir, f"_{filename}")
        with open(input_path, 'r', encoding='utf-8') as infile:
            content = infile.read()
        with open(output_path, 'w', encoding='utf-8') as outfile:
            outfile.write(process_text(content))


def main() -> None:
    """Validate the command line and process the requested directory.

    Exits with status 1 when the argument count is wrong or when the
    given path is not an existing directory.
    """
    # Exactly one argument (the directory) is required
    if len(sys.argv) != 2:
        print("Usage: python process.py <directory>")
        sys.exit(1)
    input_dir = sys.argv[1]
    # Verifies that the directory exists
    if not os.path.isdir(input_dir):
        print(f"The directory {input_dir} does not exist.")
        sys.exit(1)
    process_directory(input_dir)


if __name__ == "__main__":
    main()