PyTorch
gpt2
File size: 971 Bytes
c2760fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# preprocessing.py

import re

class Preprocessor:
    """Insert spaces around apostrophes, punctuation and digits in a line.

    Each rule can be switched on or off independently. After the enabled
    rules have run, every run of whitespace is collapsed to a single space
    and the result is stripped.

    Args:
        lowercase: Lowercase the line before any other rule is applied.
        separate_apostrophes: Surround each apostrophe character
            (typographic, straight or backtick) with spaces.
        separate_digits: Surround every individual digit with spaces.
        separate_punctuation: Surround every character that is not a word
            character, whitespace or apostrophe with spaces. Underscores
            count as punctuation here.
    """

    # The three apostrophe-like characters handled by the apostrophe rule.
    _APOSTROPHES = re.compile(r"([’'`])")
    # Anything that is not a word character, whitespace or an apostrophe.
    # ``\w`` is Unicode-aware for ``str`` patterns, so accented and non-Latin
    # letters stay inside their word instead of being split off as if they
    # were punctuation. ``_`` is listed explicitly because ``\w`` would
    # otherwise shield it from being separated.
    # NOTE: combining marks are not ``\w`` characters, so text written with
    # decomposed accents should be NFC-normalised before preprocessing.
    _PUNCTUATION = re.compile(r"([^\w\s’'`]|_)")
    _DIGIT = re.compile(r"(\d)")
    _WHITESPACE = re.compile(r"\s+")

    def __init__(
        self,
        lowercase: bool = False,
        separate_apostrophes: bool = True,
        separate_digits: bool = True,
        separate_punctuation: bool = True,
    ) -> None:
        self.lowercase = lowercase
        self.separate_apostrophes = separate_apostrophes
        self.separate_punctuation = separate_punctuation
        self.separate_digits = separate_digits

    def preprocess(self, line: str) -> str:
        """Return ``line`` with the enabled separation rules applied.

        Args:
            line: The raw text to process.

        Returns:
            The processed text with single spaces between pieces and no
            leading or trailing whitespace.
        """
        if self.lowercase:
            line = line.lower()
        if self.separate_apostrophes:
            line = self._APOSTROPHES.sub(r" \1 ", line)
        # The punctuation pattern excludes apostrophes, so they are only
        # ever separated by the dedicated apostrophe rule above.
        if self.separate_punctuation:
            line = self._PUNCTUATION.sub(r" \1 ", line)
        if self.separate_digits:
            line = self._DIGIT.sub(r" \1 ", line)

        # The substitutions above pad generously; collapse the surplus.
        return self._WHITESPACE.sub(" ", line).strip()