| # preprocessing.py | |
| import re | |
class Preprocessor:
    """Light-weight text normalizer that pads selected characters with spaces.

    Each option toggles one independent step in ``preprocess``:

    * ``lowercase`` -- lowercase the whole line first.
    * ``separate_apostrophes`` -- put spaces around ``’``, ``'`` and `````.
    * ``separate_punctuation`` -- put spaces around every character that is
      not a letter, digit, whitespace or apostrophe.
    * ``separate_digits`` -- put spaces around every individual digit.
    """

    # Compiled once at class-definition time and shared by all instances.
    _APOSTROPHE_RE = re.compile(r"([’'`])")
    # ``\w`` is Unicode-aware for ``str`` patterns, so accented and non-Latin
    # letters stay inside their word instead of being split off as if they
    # were punctuation. ``\w`` also matches "_", which is still treated as
    # punctuation here, hence the explicit alternative.
    _PUNCTUATION_RE = re.compile(r"([^\w\s’'`]|_)")
    _DIGIT_RE = re.compile(r"(\d)")
    _WHITESPACE_RE = re.compile(r"\s+")

    def __init__(
        self,
        lowercase: bool = False,
        separate_apostrophes: bool = True,
        separate_digits: bool = True,
        separate_punctuation: bool = True,
    ) -> None:
        """Store the flags that select which steps ``preprocess`` applies."""
        self.lowercase = lowercase
        self.separate_apostrophes = separate_apostrophes
        self.separate_punctuation = separate_punctuation
        self.separate_digits = separate_digits

    def preprocess(self, line: str) -> str:
        """Return ``line`` with the enabled normalization steps applied.

        Steps run in a fixed order: lowercase, apostrophes, punctuation,
        digits. Afterwards every run of whitespace is collapsed to a single
        space and leading/trailing whitespace is removed; this final cleanup
        always runs, regardless of the flags.
        """
        if self.lowercase:
            line = line.lower()
        if self.separate_apostrophes:
            line = self._APOSTROPHE_RE.sub(r" \1 ", line)
        # Apostrophes are excluded from the punctuation class so that they
        # are only separated when ``separate_apostrophes`` asks for it.
        if self.separate_punctuation:
            line = self._PUNCTUATION_RE.sub(r" \1 ", line)
        if self.separate_digits:
            line = self._DIGIT_RE.sub(r" \1 ", line)
        # The substitutions above can leave doubled or edge spaces behind.
        return self._WHITESPACE_RE.sub(" ", line).strip()