ScholarBot / src /preprocess.py
vinny4's picture
initial commit
9c37331
import re
class Preprocessor:
    """
    A class for preprocessing text data.

    It provides a method to clean and normalize text. The class keeps no
    state; an instance can be called like a function, which forwards to
    :meth:`basic_preprocess`.
    """

    @staticmethod
    def basic_preprocess(text: str) -> str:
        """
        Strip boilerplate, repair line-break hyphenation, flatten whitespace.

        Steps, in order:
        - Removes ``Page <n>`` markers, and everything from ``arXiv preprint``
          or ``Copyright`` up to the end of that line (case-insensitive; the
          match must start at a word boundary).
        - Rejoins words that were split by a hyphen at a line break
          ("exam-\\nple" -> "example").
        - Collapses every run of whitespace, paragraph breaks included, into a
          single space and strips leading/trailing whitespace.

        Letter case, digits and other punctuation are left untouched.

        Args:
            text: The raw text to clean.

        Returns:
            The cleaned text as a single line with single spaces.
        """
        # Drop boilerplate while the original line structure is still intact:
        # `.` does not match a newline, so the `.*` branches stop at the end
        # of the offending line. The leading \b keeps the patterns from firing
        # inside longer words (e.g. "homepage 12").
        text = re.sub(
            r'\b(?:Page \d+|arXiv preprint.*|Copyright.*)',
            '',
            text,
            flags=re.IGNORECASE,
        )
        # Undo hyphenation only where a word is actually broken across a line
        # break. This must run before newlines are flattened; a hyphen that is
        # merely followed by a space ("pre- and post-", "A - B") is kept.
        # Heuristic: a genuine compound split at its hyphen is joined as well.
        text = re.sub(r'(?<=\w)-[^\S\n]*\n\s*(?=\w)', '', text)
        # Flatten all remaining whitespace (newlines, tabs, repeated spaces)
        # to single spaces, then trim the ends.
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def __call__(self, *args, **kwds):
        """
        Apply :meth:`basic_preprocess` to the given arguments.

        This allows a class instance to be used as a function; all positional
        and keyword arguments are forwarded unchanged.
        """
        return self.basic_preprocess(*args, **kwds)