|
|
import re, os, unicodedata |
|
|
import sys |
|
|
from config import PROJECT_ROOT, greekwords |
|
|
|
|
|
class Preprocessor: |
|
|
def __init__(self): |
|
|
pass |
|
|
|
|
|
def process(self, text): |
|
|
meditations = unicodedata.normalize("NFC", text) |
|
|
|
|
|
|
|
|
is_original = "THE FIRST BOOK" in meditations and "APPENDIX" in meditations |
|
|
|
|
|
if is_original: |
|
|
startindex = meditations.index("THE FIRST BOOK") |
|
|
meditations = meditations[startindex:] |
|
|
endindex = meditations.index("APPENDIX") |
|
|
meditations = meditations[:endindex] |
|
|
|
|
|
book_name = r"THE\s+[A-Z]+\s+BOOK\s+[IVXLCDM]+\.\s" |
|
|
section_name = r"\n\n[IVXLCDM]+\. " |
|
|
book_end = r"\n\n\n\n" |
|
|
|
|
|
meditations = re.sub(book_name, "<BEGIN> \n", meditations) |
|
|
meditations = re.sub(book_end, "<END> ", meditations) |
|
|
meditations = re.sub(section_name, "\n <END> \n <BEGIN> \n", meditations) |
|
|
|
|
|
split_pattern = f"{book_name}|{section_name}" |
|
|
else: |
|
|
|
|
|
split_pattern = r"\n\n" |
|
|
|
|
|
underline = r"[_]+" |
|
|
newline_in_sentence = r"(?<!\n)\n(?!\n)" |
|
|
|
|
|
meditations = re.sub(underline, "", meditations) |
|
|
meditations = re.sub(newline_in_sentence, " ", meditations) |
|
|
|
|
|
for key, value in greekwords.items(): |
|
|
meditations = meditations.replace(key, value) |
|
|
|
|
|
raw_sections = re.split(split_pattern, meditations) |
|
|
|
|
|
final_sections = [] |
|
|
for section in raw_sections: |
|
|
if not section.strip(): |
|
|
continue |
|
|
sentences = self.split_into_sentences(section) |
|
|
if sentences: |
|
|
final_sections.extend(sentences) |
|
|
|
|
|
return meditations, final_sections |
|
|
|
|
|
def split_into_sentences(self, text: str) -> list[str]: |
|
|
sentences = re.split(r'(?<=[.!?])\s+', text) |
|
|
return [s.strip() for s in sentences if s.strip()] |
|
|
|
|
|
def test(self, file): |
|
|
processed = None |
|
|
if file: |
|
|
try: |
|
|
processed = self.process(file) |
|
|
except Exception: |
|
|
print("The processed file is not compliant with preprocess' requirements. Falling back to default file.\n") |
|
|
processed = None |
|
|
if not processed: |
|
|
test_file_path = os.path.join(os.path.dirname(__file__), "preprocess_test.txt") |
|
|
with open(test_file_path, "r") as f: |
|
|
processed = self.process(f.read()) |
|
|
|
|
|
output_file_path = os.path.join(os.path.dirname(__file__), "preprocess_test_output.txt") |
|
|
with open(output_file_path, "w") as f: |
|
|
f.write(processed[0]) |
|
|
|
|
|
print(f"Saved to {output_file_path}.") |
|
|
|
|
|
if __name__ == "__main__": |
|
|
file = None |
|
|
if len(sys.argv) > 1: |
|
|
test = sys.argv[1] |
|
|
if test != "test": |
|
|
print("Only permitted argument is 'test'; Please try again.") |
|
|
pass |
|
|
|
|
|
else: |
|
|
print("Preprocessing logic is wrapped into overall training functionality.") |
|
|
pass |
|
|
|
|
|
if len(sys.argv) > 2: |
|
|
filepath = sys.argv[2] |
|
|
try: |
|
|
with open(filepath, "r") as f: |
|
|
file = f.read() |
|
|
except Exception as e: |
|
|
print("Invalid filepath, falling back to original test.") |
|
|
file = None |
|
|
|
|
|
Preprocessor().test(file) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|