import os
import re
import unicodedata


def strip_accents(text: str) -> str:
    """Removes accents from text."""
    return ''.join(c for c in unicodedata.normalize('NFD', text)
                   if unicodedata.category(c) != 'Mn')
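# Illustrative example (not part of the original module): NFD decomposition
# splits "é" into "e" plus a combining accent (category Mn), which the
# filter above drops:
#   strip_accents("café")  ->  "cafe"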


def load_raw_text(corpus_directory: str, file_names=None) -> str:
    """Loads all the text files in a directory into one large string."""
    corpus = ""
    for file_name in os.listdir(corpus_directory):
        file_path = os.path.join(corpus_directory, file_name)
        # Skip subdirectories.
        if os.path.isdir(file_path):
            continue
        # Skip anything that is not a .txt file.
        if not file_name.endswith(".txt"):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            corpus += file.read() + "\n"
    return corpus
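# Usage sketch (the directory name is hypothetical):
#   corpus = load_raw_text("corpus/")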


def load_single_raw_text_file(file_name: str) -> str:
    """Loads a single text file into one large string."""
    with open(file_name, 'r', encoding='utf-8') as file:
        corpus = file.read() + "\n"
    return corpus


# One or more word characters or apostrophes. Note that "|" acts as a literal
# character, not alternation, inside a character class, so it must not appear here.
word_regex = r"[\w']+"


def tokenize(text: str) -> list:
    return re.findall(word_regex, text)
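# Illustrative example: tokenize("Don't stop!") -> ["Don't", "stop"]
# (punctuation is dropped; apostrophes are kept inside words).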


def preprocess(text: str) -> list:
    """Tokenizes and normalizes text whose words are already separated by spaces. Designed for English punctuation."""
    text = strip_accents(text)
    text = text.lower()

    tokens = text.split(" ")

    # Keep only tokens that are words or common sentence punctuation.
    tokens_filtered = []
    for token in tokens:
        if re.match(r"[\w']+|[.,?!]", token):
            tokens_filtered.append(token)
    return tokens_filtered
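# Illustrative example (punctuation must already be space-separated, as the
# docstring notes):
#   preprocess("The cat sat .")  ->  ["the", "cat", "sat", "."]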


def pad(text: list, num_padding: int) -> list:
    """Pads the given text, as a list of tokens, with <s> markers at the start and after each sentence-ending punctuation mark."""
    padded_text = []

    # Initial padding so the first sentence has left context.
    for _ in range(num_padding):
        padded_text.append("<s>")

    for word in text:
        padded_text.append(word)

        # Pad again after each sentence boundary.
        if word in [".", "?", "!"]:
            for _ in range(num_padding):
                padded_text.append("<s>")

    return padded_text
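

# Minimal end-to-end sketch of the functions above (the sample sentence is
# illustrative only, not from the original module).
if __name__ == "__main__":
    sample = "The cat sat . The dog ran !"
    tokens = preprocess(sample)
    # With num_padding=2, two "<s>" markers open the text and follow each
    # sentence-final punctuation token.
    print(pad(tokens, 2))
    # -> ['<s>', '<s>', 'the', 'cat', 'sat', '.', '<s>', '<s>',
    #     'the', 'dog', 'ran', '!', '<s>', '<s>']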