PyTorch
gpt2
gpt2-10M-parfind-eng / preprocessing.py
achille-fusco's picture
Upload folder using huggingface_hub
c2760fe verified
raw
history blame contribute delete
971 Bytes
# preprocessing.py
import re
class Preprocessor:
    """Normalize a line of text by isolating selected characters with spaces.

    Each stage is optional and controlled by a constructor flag. The stages
    run in a fixed order: lowercasing, apostrophe separation, punctuation
    separation, digit separation. Whitespace is always collapsed and trimmed
    at the end, so the padding added by earlier stages never produces
    double spaces in the result.
    """

    # Patterns are compiled once at class-creation time instead of being
    # looked up on every call to ``preprocess``.
    _APOSTROPHE_RE = re.compile(r"([’'`])")
    # Anything that is not an ASCII letter, an ASCII digit, whitespace or one
    # of the three apostrophe characters. Note that this also matches
    # non-ASCII letters (e.g. "é"), which therefore get isolated as well.
    _PUNCTUATION_RE = re.compile(r"([^A-Za-z0-9\s’'`])")
    _DIGIT_RE = re.compile(r"(\d)")
    _WHITESPACE_RE = re.compile(r"\s+")
    # Replacement that surrounds the captured character with single spaces.
    _SPACED = r" \1 "

    def __init__(
        self,
        lowercase=False,
        separate_apostrophes=True,
        separate_digits=True,
        separate_punctuation=True,
    ):
        """Store the stage switches.

        Args:
            lowercase: Lowercase the line before any other processing.
            separate_apostrophes: Put spaces around ``’``, ``'`` and ``\```.
            separate_digits: Put spaces around every individual digit.
            separate_punctuation: Put spaces around every character that is
                not an ASCII letter, ASCII digit, whitespace or apostrophe.
        """
        self.lowercase = lowercase
        self.separate_apostrophes = separate_apostrophes
        self.separate_punctuation = separate_punctuation
        self.separate_digits = separate_digits

    def preprocess(self, line: str) -> str:
        """Return ``line`` with the enabled normalization stages applied.

        Args:
            line: The text to normalize.

        Returns:
            The processed text with every run of whitespace collapsed to a
            single space and leading/trailing whitespace removed.
        """
        if self.lowercase:
            line = line.lower()

        if self.separate_apostrophes:
            line = self._APOSTROPHE_RE.sub(self._SPACED, line)

        # Apostrophes are excluded from the punctuation pattern, so they are
        # only ever split off by the dedicated stage above.
        if self.separate_punctuation:
            line = self._PUNCTUATION_RE.sub(self._SPACED, line)

        if self.separate_digits:
            line = self._DIGIT_RE.sub(self._SPACED, line)

        # Collapse the padding introduced above (and any original runs of
        # whitespace) into single spaces.
        return self._WHITESPACE_RE.sub(" ", line).strip()