import re


class Preprocessor:
    """Clean and normalize raw text extracted from documents.

    The class is stateless: ``basic_preprocess`` is a static method, and an
    instance can be called like a function to run the same pipeline.
    """

    @staticmethod
    def basic_preprocess(text: str) -> str:
        """Return *text* with boilerplate removed and whitespace normalized.

        Steps, in order:

        1. Remove ``Page <digits>`` markers, and everything from
           ``arXiv preprint`` or ``Copyright`` to the end of that line
           (case-insensitive).
        2. Turn single newlines into spaces, leaving blank-line paragraph
           breaks in place for the following steps.
        3. Re-join words hyphenated across a line break by dropping the
           hyphen together with the whitespace that follows it.
        4. Collapse every run of whitespace into a single space.
        5. Strip leading and trailing whitespace.

        Letter case, digits and punctuation are otherwise left untouched.

        Args:
            text: The raw text to clean.

        Returns:
            The cleaned text on a single line.
        """
        # Remove common strings like page numbers, arXiv mentions, etc.
        # Without re.DOTALL, ".*" stops at the end of the current line.
        text = re.sub(
            r'Page \d+|arXiv preprint.*|Copyright.*',
            '',
            text,
            flags=re.IGNORECASE,
        )

        # Merge single newlines within paragraphs, but keep double newlines
        # as paragraph breaks.
        # NOTE(review): this pattern and its ' ' replacement were
        # reconstructed from a truncated source line -- confirm against
        # version control.
        text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)

        # Fix words hyphenated across lines (e.g. "exam-<newline>ple" ->
        # "example"): first remove a hyphen that is still followed by a
        # newline, then a hyphen followed by any other whitespace.
        # NOTE: the second pattern also drops a free-standing dash such as
        # the one in "a - b".
        text = re.sub(r'-\s*\n', '', text)
        text = re.sub(r'-\s+', '', text)

        # Normalize extra spaces; this also flattens the paragraph breaks
        # kept above into single spaces.
        text = re.sub(r'\s+', ' ', text)

        # Strip leading/trailing whitespace.
        return text.strip()

    def __call__(self, *args, **kwds) -> str:
        """Apply ``basic_preprocess`` so an instance can be used as a function.

        All positional and keyword arguments are forwarded unchanged.
        """
        return self.basic_preprocess(*args, **kwds)