Spaces:
Sleeping
Sleeping
| import re | |
class Preprocessor:
    """
    Clean and normalize raw text (e.g. text extracted from papers).

    The cleaning logic lives in :meth:`basic_preprocess`, which is a static
    method, so it can be used either as ``Preprocessor.basic_preprocess(text)``
    or by calling an instance directly: ``Preprocessor()(text)``.
    """

    @staticmethod
    def basic_preprocess(text: str) -> str:
        """
        Return a cleaned, single-line version of ``text``.

        Steps, applied in order:

        - Remove boilerplate: ``Page <n>`` markers, and everything from
          ``arXiv preprint`` or ``Copyright`` to the end of that line
          (case-insensitive).
        - Replace isolated single newlines with a space.
        - Remove hyphens that are followed by whitespace, which re-joins
          words split across lines ("exam-\\nple" -> "example").
        - Collapse every run of whitespace into one space.
        - Strip leading and trailing whitespace.

        Note that letter case, digits and punctuation are otherwise left
        untouched, and that paragraph breaks do not survive the final
        whitespace normalization.
        """
        # Remove common strings like page numbers, arXiv mentions, etc.
        # '.' does not match newlines, so '.*' only eats the rest of the line.
        text = re.sub(
            r'Page \d+|arXiv preprint.*|Copyright.*', '', text,
            flags=re.IGNORECASE,
        )
        # Merge single newlines within paragraphs; runs of two or more
        # newlines are skipped here (they are collapsed further below).
        text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
        # Remove hyphenation that ends directly at a remaining line break.
        text = re.sub(r'-\s*\n', '', text)
        # Remove any hyphen followed by whitespace. This handles line-break
        # hyphenation whose newline was already turned into a space above;
        # it also drops free-standing dashes ("a - b" -> "a b").
        text = re.sub(r'-\s+', '', text)
        # Normalize all remaining whitespace (including blank lines) to
        # single spaces.
        text = re.sub(r'\s+', ' ', text)
        # Strip leading/trailing whitespace
        return text.strip()

    def __call__(self, *args, **kwds):
        """
        Apply :meth:`basic_preprocess` to the given arguments.

        This allows a class instance to be used as a function; positional
        and keyword arguments are forwarded unchanged.
        """
        return self.basic_preprocess(*args, **kwds)