# preprocessing.py
import re


class Preprocessor:
    """Split a line of text into space-separated pieces using regex rules.

    Each rule is optional and controlled by a constructor flag:

    * ``lowercase`` -- lowercase the line before anything else.
    * ``separate_apostrophes`` -- put spaces around the apostrophe-like
      characters ``’``, ``'`` and the backtick.
    * ``separate_punctuation`` -- put spaces around every character that is
      not a letter, a digit, whitespace, or one of the apostrophe-like
      characters above. The underscore counts as punctuation.
    * ``separate_digits`` -- put spaces around every individual digit.

    After the enabled rules run, whitespace runs are collapsed to a single
    space and the result is stripped.
    """

    # Compiled once at class-definition time instead of on every call.
    _APOSTROPHES = re.compile(r"([’'`])")
    # ``\w`` is Unicode-aware for str patterns, so accented and non-Latin
    # letters stay attached to their word instead of being split off as
    # "punctuation" (the previous ASCII-only class turned "café" into
    # "caf é"). ``\w`` also matches "_", which has always been treated as
    # punctuation here, hence the explicit ``|_`` alternative.
    _PUNCTUATION = re.compile(r"([^\w\s’'`]|_)")
    _DIGIT = re.compile(r"(\d)")
    _WHITESPACE = re.compile(r"\s+")

    def __init__(self, lowercase=False, separate_apostrophes=True,
                 separate_digits=True, separate_punctuation=True):
        self.lowercase = lowercase
        self.separate_apostrophes = separate_apostrophes
        self.separate_punctuation = separate_punctuation
        self.separate_digits = separate_digits

    def preprocess(self, line: str) -> str:
        """Return ``line`` with the enabled separation rules applied.

        Args:
            line: The text to process.

        Returns:
            The processed text with whitespace runs collapsed to single
            spaces and no leading or trailing whitespace. An empty or
            whitespace-only input yields ``""``.
        """
        if self.lowercase:
            line = line.lower()

        if self.separate_apostrophes:
            line = self._APOSTROPHES.sub(r" \1 ", line)

        # Apostrophe-like characters are excluded from the punctuation rule,
        # so they are only ever split by ``separate_apostrophes``.
        if self.separate_punctuation:
            line = self._PUNCTUATION.sub(r" \1 ", line)

        if self.separate_digits:
            line = self._DIGIT.sub(r" \1 ", line)

        # The substitutions above pad liberally; collapse the extra spaces.
        return self._WHITESPACE.sub(" ", line).strip()