File size: 971 Bytes
c2760fe |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
# preprocessing.py
import re
class Preprocessor:
    """Whitespace-tokenizing text normalizer.

    Each enabled option surrounds a class of characters with spaces so that
    a later ``str.split()`` yields them as standalone tokens. Runs of
    whitespace are always collapsed to a single space and the result is
    stripped.
    """

    # Straight, curly and backtick apostrophes.
    _APOSTROPHE_RE = re.compile(r"([’'`])")
    # Anything that is not a word character, whitespace or an apostrophe.
    # ``\w`` is Unicode-aware for ``str`` patterns, so non-ASCII letters such
    # as "é" or "ñ" are kept inside their word instead of being split off as
    # punctuation. ``_`` is part of ``\w`` but is still treated as
    # punctuation, hence the explicit alternative.
    # NOTE: combining marks are not matched by ``\w``; decomposed (NFD) input
    # should be normalized to NFC by the caller if accents must stay attached.
    _PUNCTUATION_RE = re.compile(r"([^\w\s’'`]|_)")
    _DIGIT_RE = re.compile(r"(\d)")
    _WHITESPACE_RE = re.compile(r"\s+")

    def __init__(
        self,
        lowercase: bool = False,
        separate_apostrophes: bool = True,
        separate_digits: bool = True,
        separate_punctuation: bool = True,
    ) -> None:
        """Store the preprocessing options.

        Args:
            lowercase: Lowercase the line before any other step.
            separate_apostrophes: Put spaces around ``’``, ``'`` and `````.
            separate_digits: Put spaces around every individual digit.
            separate_punctuation: Put spaces around every character that is
                not a letter, digit, whitespace or apostrophe.
        """
        self.lowercase = lowercase
        self.separate_apostrophes = separate_apostrophes
        self.separate_punctuation = separate_punctuation
        self.separate_digits = separate_digits

    def preprocess(self, line: str) -> str:
        """Return ``line`` with the configured separations applied.

        Steps run in a fixed order: lowercase, apostrophes, punctuation,
        digits, then whitespace normalization.
        """
        if self.lowercase:
            line = line.lower()
        if self.separate_apostrophes:
            line = self._APOSTROPHE_RE.sub(r" \1 ", line)
        if self.separate_punctuation:
            # Apostrophes are excluded here so they are governed solely by
            # ``separate_apostrophes``.
            line = self._PUNCTUATION_RE.sub(r" \1 ", line)
        if self.separate_digits:
            line = self._DIGIT_RE.sub(r" \1 ", line)
        # The substitutions above may leave doubled or edge spaces.
        line = self._WHITESPACE_RE.sub(" ", line)
        return line.strip()