"""Filtering and normalization helpers for cleaning raw text examples."""

from nltk.tokenize import wordpunct_tokenize as word_tokenize
from nltk.tokenize import sent_tokenize

import re
import six
import textwrap

# Whitelisted characters: digits, ASCII letters, and a few punctuation marks (, . / < >).
_whitelist = r"[0-9a-z\,\.\/\<\>]+"
_regex = r"0-9a-z\,\.\/\<\>"


def filter_by_lang_regex(text, ratio=0.7, regex=r"0-9a-z\,\.\/\<\>"):
    # Keep texts whose non-space characters are mostly (more than `ratio`) covered
    # by the whitelisted character class.
    candidate_text = re.sub(r"[^" + regex + "]+", " ", six.ensure_str(text), flags=re.IGNORECASE).replace(" ", "")
    text = text.replace(" ", "")
    if not text:  # avoid a ZeroDivisionError on empty or whitespace-only input
        return False
    return (len(candidate_text) / len(text)) > ratio


def filter_by_num_tokens(text, gt=64):
    # Keep texts with more than `gt` word-punctuation tokens.
    return len(word_tokenize(text)) > gt


def filter_by_num_sents(text, gt=2):
    # Keep texts with more than `gt` sentences.
    return len(sent_tokenize(text)) > gt


def filter_by_steps(text):
    # Keep texts that look like step-by-step instructions ("step ..." / "mix all ...").
    return re.search(r"(step|mix all)", text, re.IGNORECASE) is not None


def filter_by_length(text, gt=40):
    # Keep texts longer than `gt` characters.
    return len(text) > gt


def filter_by_item(item_list, gt=4):
    # Keep lists with more than `gt` items.
    return len(item_list) > gt


def chars_to_preserve(sentence, whitelist):
    # Keep only the characters matched by the `whitelist` pattern, joining the matches with spaces.
    try:
        tokenized = re.findall(whitelist, sentence, re.IGNORECASE)
        return " ".join(tokenized)
    except Exception as error:
        print(
            textwrap.dedent(
                f"""
                Bad characters range {whitelist},
                {error}
                """
            )
        )
        raise


def normalizer(text, whitelist=r"[0-9a-z\,\.\/\<\>]+", do_lowercase=False):
    # Optionally lowercase, drop non-whitelisted characters, and collapse repeated whitespace.
    if do_lowercase:
        text = text.lower()

    text = chars_to_preserve(text, whitelist=whitelist)
    text = " ".join([word.strip() for word in text.split() if word.strip()])
    text = text.strip()

    return text
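

# --- Minimal usage sketch (not part of the original module) ---
# Shows how the filters and the normalizer might be chained when cleaning a dataset.
# The sample string below is made up, and sent_tokenize assumes NLTK's "punkt" model
# has been downloaded beforehand (e.g. via nltk.download("punkt")).
if __name__ == "__main__":
    sample = "Step 1: mix all dry ingredients. Step 2: add water, then stir. Step 3: bake for 40 minutes."

    keep = (
        filter_by_lang_regex(sample)             # mostly whitelisted characters
        and filter_by_num_tokens(sample, gt=10)  # default gt=64 targets longer texts
        and filter_by_num_sents(sample)          # more than 2 sentences
        and filter_by_steps(sample)              # mentions "step" / "mix all"
        and filter_by_length(sample)             # more than 40 characters
    )
    print("keep sample:", keep)
    print("normalized:", normalizer(sample, do_lowercase=True))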