# francis-botcon / src / process.py
# Rojaldo
# Initialize Francis Botcon Gradio Space with model files
# 4e5fc16
"""
Text file processing script
This script processes all .txt files in a specified directory applying
the following transformations:
1. Removes notes: Suppresses all text that appears after the "|" symbol
in each line, including the "|" symbol itself.
2. Consolidates lines: Replaces line breaks that do NOT follow a period
with a space, allowing text to flow continuously.
3. Preserves paragraphs: Keeps paragraphs intact by preserving empty lines
(double line breaks).
Processed files are saved with the same name preceded by an underscore "_".
Usage: python process.py <directory>
Example: python process.py ./data/raw
"""
import sys
import re
import os

# A note: the "|" marker and everything after it on the same line. The
# horizontal whitespace in front of the marker is removed as well, so that a
# sentence ending in "." right before a note still ends in "." afterwards.
_NOTE_RE = re.compile(r'[ \t]*\|[^\n]*')

# A "soft" line break: a single newline that
#   - is not preceded by a newline or followed by one (blank lines delimit
#     paragraphs and must survive),
#   - is not preceded by a period (sentence-final breaks are kept), and
#   - is not the very last character of the text (the file's trailing
#     newline should not turn into a trailing space).
_SOFT_BREAK_RE = re.compile(r'(?<![\n.])\n(?!\n|\Z)')


def process_text(content):
    """Return *content* with notes stripped and soft line breaks joined.

    Args:
        content: Full text of one input file.

    Returns:
        The text with every "|..." note removed (together with the
        whitespace directly before the "|"), and with each single line break
        that does not follow a period replaced by one space. Empty lines and
        a final trailing newline are left untouched.
    """
    without_notes = _NOTE_RE.sub('', content)
    return _SOFT_BREAK_RE.sub(' ', without_notes)


def process_directory(input_dir):
    """Process every source .txt file found directly inside *input_dir*.

    Each result is written next to its source as "_<filename>". Names that
    already start with "_" are skipped: they are outputs of a previous run,
    and picking them up again would create "__<filename>" and so on with
    every re-run. Entries that are not regular files are skipped too.

    Args:
        input_dir: Path of an existing directory.

    Returns:
        List of the output paths that were written, in sorted filename order.
    """
    written = []
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith(".txt") or filename.startswith("_"):
            continue
        input_path = os.path.join(input_dir, filename)
        if not os.path.isfile(input_path):
            # e.g. a sub-directory whose name happens to end in ".txt"
            continue
        output_path = os.path.join(input_dir, f"_{filename}")
        with open(input_path, 'r', encoding='utf-8') as infile:
            content = infile.read()
        with open(output_path, 'w', encoding='utf-8') as outfile:
            outfile.write(process_text(content))
        written.append(output_path)
    return written


def main(argv=None):
    """Command-line entry point: ``python process.py <directory>``.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        Process exit status: 0 on success, 1 when the argument count is wrong
        or the directory does not exist.
    """
    args = sys.argv if argv is None else argv
    # Exactly one positional argument (the directory) is required
    if len(args) != 2:
        print("Usage: python process.py <directory>", file=sys.stderr)
        return 1
    input_dir = args[1]
    # Verifies that the directory exists
    if not os.path.isdir(input_dir):
        print(f"The directory {input_dir} does not exist.", file=sys.stderr)
        return 1
    process_directory(input_dir)
    return 0


if __name__ == "__main__":
    sys.exit(main())