File size: 3,390 Bytes
371296c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
## These functions provide some simple tidying of human and model annotations.
def remove_double_spaces_and_break_characters(given_text: str) -> str:
    """
    Collapse every run of whitespace in a text into a single space.

    Tabs, newlines, carriage returns, form feeds and repeated spaces are all
    replaced by one space, and leading/trailing whitespace is dropped.
    See https://stackoverflow.com/a/1546251/8633026

    This may or may not be desirable and should only be used at the end of
    preprocessing, as it discards structurally important characters such as
    newlines.

    :param given_text: The text to simplify. Non-string values (e.g. None or a
        float NaN standing in for a missing value) are returned unchanged.
    :return: The text with whitespace collapsed, or the input itself if it is
        not a string.
    """
    # A truthiness test is not enough here: NaN is truthy but has no .split(),
    # so guard on the type instead and pass non-strings straight through.
    if not isinstance(given_text, str):
        return given_text
    # split() with no separator splits on runs of any whitespace and ignores
    # leading/trailing whitespace; an empty string yields "" again.
    return " ".join(given_text.split())
def leading_trailing_whitespace(given_str: str):
    """
    Strip whitespace from both ends of a string.

    :param given_str: The input string. None, NaN and any value without a
        ``strip`` method are tolerated.
    :return: The stripped string, or the input unchanged when it is None, NaN
        or has no ``strip`` method.
    """
    try:
        # A value that is not equal to itself is NaN (a missing value).
        is_missing = given_str != given_str or given_str is None
        return given_str if is_missing else given_str.strip()
    except AttributeError:
        # Not string-like (e.g. a number): hand it back untouched.
        return given_str
def leading_trailing_punctuation(given_str: str):
    """
    Strip punctuation characters from both ends of a string.

    Punctuation inside the string is left alone; only the leading and trailing
    runs are removed.

    :param given_str: The input string which may contain leading and trailing punctuation.
    :return: The input string with leading and trailing punctuation removed, or
        the input unchanged when it is None, NaN or has no ``strip`` method.
    """
    # NOTE(review): '+' is not in this set, so a leading/trailing '+' is kept
    # -- confirm that this is intentional.
    strip_chars = '!"#$%&\'()*,-./:;<=>?@[\\]^_`{|}~'
    try:
        # A value that is not equal to itself is NaN (a missing value).
        if given_str != given_str or given_str is None:
            return given_str
        return given_str.strip(strip_chars)
    except AttributeError:
        # Not string-like (e.g. a number): hand it back untouched.
        return given_str
def lowercase(given_str: str):
    """
    Return the lowercase form of a string.

    :param given_str: The string to be converted to lowercase.
    :return: The lowercase version of the given string, or the input unchanged
        when it is None, NaN or has no ``lower`` method.
    """
    try:
        # A value that is not equal to itself is NaN (a missing value).
        is_missing = given_str != given_str or given_str is None
        if not is_missing:
            return given_str.lower()
    except AttributeError:
        # Not string-like (e.g. a number): fall through and return it as-is.
        pass
    return given_str
def clean_taxon_strings(given_str: str):
    """
    Clean the given string by removing leading/trailing whitespace,
    leading/trailing punctuation, and converting all characters to lowercase.
    Also collapse double spaces and break characters, as with files that are passed to LLM models.

    A clean string should be retrievable from the original text when all lower case.

    :param given_str: The string to be cleaned. Non-string values (e.g. None or
        a float NaN standing in for a missing value) are returned unchanged.
    :return: The cleaned string, or the input itself if it is not a string.
    """
    # Non-strings cannot be cleaned. NaN in particular is never equal to
    # itself, so a "repeat until nothing changes" loop would never terminate
    # on it; bail out before reaching the loop.
    if not isinstance(given_str, str):
        return given_str

    low = lowercase(given_str)
    # Whitespace and punctuation can be interleaved at the ends (e.g. " ( x ) "),
    # so keep stripping both until a full pass leaves the string unchanged.
    while True:
        stripped = leading_trailing_punctuation(leading_trailing_whitespace(low))
        if stripped == low:
            break
        low = stripped
    return remove_double_spaces_and_break_characters(low)
def clean_compound_strings(given_str: str):
    """
    Clean the given string by removing leading/trailing whitespace and converting all characters to lowercase.
    Also collapse double spaces and break characters, as with files that are passed to LLM models.

    Unlike the taxon variant, leading/trailing punctuation is kept.
    A clean string should be retrievable from the original text when all lower case.

    :param given_str: The string to be cleaned. Non-string values (e.g. None or
        a float NaN standing in for a missing value) are returned unchanged.
    :return: The cleaned string, or the input itself if it is not a string.
    """
    # Non-strings cannot be cleaned. NaN in particular is never equal to
    # itself, so comparing a "stripped" NaN with the original in a loop
    # condition would spin forever; bail out early instead.
    if not isinstance(given_str, str):
        return given_str

    # Stripping whitespace is idempotent, so one pass is enough.
    low = leading_trailing_whitespace(lowercase(given_str))
    return remove_double_spaces_and_break_characters(low)
|