File size: 1,381 Bytes
9c37331
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import re

class Preprocessor:
    """
    A class for preprocessing text data.

    It provides a static method that cleans and normalizes raw text, and
    instances are callable: ``Preprocessor()(text)`` is equivalent to
    ``Preprocessor.basic_preprocess(text)``.
    """

    # Boilerplate fragments: "Page <n>", plus everything from an
    # "arXiv preprint" or "Copyright" mention up to the end of that line
    # ('.' does not match a newline here). Matching is case-insensitive.
    _BOILERPLATE_RE = re.compile(
        r'Page \d+|arXiv preprint.*|Copyright.*', flags=re.IGNORECASE
    )

    # A hyphen that splits a word across a line break ("exam-\nple").
    # Word characters are required on both sides so that dashes and
    # suspended hyphens on a single line ("a - b", "pre- and post-") are
    # left alone. Several newlines are tolerated because removing a
    # "Page <n>" line can leave an empty line between the two halves.
    _LINE_BREAK_HYPHEN_RE = re.compile(r'(?<=\w)-\s*\n\s*(?=\w)')

    # Any run of whitespace, including newlines.
    _WHITESPACE_RE = re.compile(r'\s+')

    @staticmethod
    def basic_preprocess(text: str) -> str:
        """
        Basic preprocessing of text data.

        - Removes page numbers, "arXiv preprint ..." and "Copyright ..."
          fragments
        - Re-joins words that were hyphenated across a line break
        - Collapses every run of whitespace (newlines included) into a
          single space, so paragraph breaks are not preserved
        - Strips leading and trailing whitespace

        Case, digits and punctuation are otherwise left untouched.

        Args:
            text: The raw text to clean.

        Returns:
            The cleaned text as a single line.
        """
        # Drop boilerplate first so a page-number line sitting between the
        # two halves of a hyphenated word does not block the re-join below.
        text = Preprocessor._BOILERPLATE_RE.sub('', text)

        # De-hyphenate while the line breaks are still present; once
        # whitespace is collapsed a line-break hyphen can no longer be told
        # apart from an ordinary hyphen followed by a space.
        text = Preprocessor._LINE_BREAK_HYPHEN_RE.sub('', text)

        # Normalize all remaining whitespace to single spaces.
        text = Preprocessor._WHITESPACE_RE.sub(' ', text)

        return text.strip()

    def __call__(self, *args, **kwds) -> str:
        """
        Apply basic preprocessing.

        This allows a class instance to be used as a function; all
        arguments are forwarded unchanged to ``basic_preprocess``.
        """
        return self.basic_preprocess(*args, **kwds)