import re
import textwrap

import six
from nltk.tokenize import sent_tokenize
from nltk.tokenize import wordpunct_tokenize as word_tokenize


# Characters kept by the default whitelist: digits, ASCII letters (matched
# case-insensitively), comma, period, slash, and angle brackets.
_whitelist = r"[0-9a-z\,\.\/\<\>]+"
_regex = r"0-9a-z\,\.\/\<\>"


def filter_by_lang_regex(text, ratio=0.7, regex=r"0-9a-z\,\.\/\<\>"):
    """Keep `text` only if the share of whitelisted characters exceeds `ratio`."""
    candidate_text = re.sub(r"[^" + regex + "]+", " ", six.ensure_str(text), flags=re.IGNORECASE).replace(" ", "")
    text = text.replace(" ", "")

    # Guard against empty input, which would otherwise divide by zero.
    if not text:
        return False

    return (len(candidate_text) / len(text)) > ratio


def filter_by_num_tokens(text, gt=64):
    """Keep `text` only if it has more than `gt` word tokens."""
    return len(word_tokenize(text)) > gt


def filter_by_num_sents(text, gt=2):
    """Keep `text` only if it has more than `gt` sentences."""
    return len(sent_tokenize(text)) > gt


def filter_by_steps(text):
    """Keep `text` only if it mentions step-like instructions ("step" or "mix all")."""
    return re.search(r"(step|mix all)", text, re.IGNORECASE) is not None


def filter_by_length(text, gt=40):
    """Keep `text` only if it is longer than `gt` characters."""
    return len(text) > gt


def filter_by_item(item_list, gt=4):
    """Keep `item_list` only if it has more than `gt` items."""
    return len(item_list) > gt


def chars_to_preserve(sentence, whitelist):
    """Return the runs of whitelisted characters in `sentence`, joined by single spaces."""
    try:
        tokenized = re.findall(whitelist, sentence, re.IGNORECASE)
        return " ".join(tokenized)
    except Exception as error:
        print(
            textwrap.dedent(
                f"""
                Bad characters range {whitelist},
                {error}
                """
            )
        )
        raise


def normalizer(text, whitelist=r"[0-9a-z\,\.\/\<\>]+", do_lowercase=False):
    """Normalize `text`: optionally lowercase, keep only whitelisted characters,
    and collapse runs of whitespace into single spaces."""
    if do_lowercase:
        text = text.lower()

    text = chars_to_preserve(text, whitelist=whitelist)
    # str.split() drops empty strings, so this collapses all whitespace runs.
    text = " ".join(text.split())

    return text
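

# Illustrative usage sketch (not part of the original filters; the sample string
# below is made up). Shows how the normalizer and a few filters might be chained.
if __name__ == "__main__":
    sample = "Step 1. Mix all dry ingredients, then add 2/3 cup of milk and stir."

    cleaned = normalizer(sample, whitelist=_whitelist, do_lowercase=True)
    keep = (
        filter_by_lang_regex(cleaned, ratio=0.7, regex=_regex)
        and filter_by_length(cleaned, gt=40)
        and filter_by_steps(cleaned)
    )
    print(cleaned)
    print("kept" if keep else "dropped")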
|
|