Spaces:
Runtime error
Runtime error
| import re | |
def clean_text(text: str) -> str:
    """Strip markup and normalize punctuation/whitespace in a text snippet.

    The steps run in this order:

    1. Remove ``<...>`` tags (non-greedy, within a single line).
    2. Remove parenthesized spans ``(...)``.
    3. Insert ". " where a lowercase letter is immediately followed by an
       uppercase one (e.g. two sentences glued together once the tags
       between them are gone).
    4. Strip surrounding whitespace and turn non-breaking spaces into
       ordinary spaces.
    5. Drop every character whose Unicode general category starts with
       "C" (control, format, ...). Note this removes newlines and tabs
       without substituting a space.
    6. Collapse runs of spaces; replace ";" with ", and"; delete ":";
       remove the space before "." and ",".
    7. Strip leading digits, dots, dashes and spaces (list numbering).
    8. Collapse consecutive repeated words ("the the" -> "the").

    Args:
        text: Raw input string, possibly containing HTML tags.

    Returns:
        The cleaned string. Trailing whitespace exposed by later steps
        is not stripped.
    """
    # Imported locally so the function works even though the module-level
    # imports visible here only bring in `re`.
    import unicodedata

    cleaned = re.sub(r'<.*?>', '', text)
    cleaned = re.sub(r'\([^)]*\)', '', cleaned)

    # Keep both captured characters: the previous replacement (a bare '.')
    # discarded them, eating letters and wiping out acronyms like "NASA".
    # NOTE(review): ". " is the presumed intent of the original pattern --
    # confirm against the downstream consumer.
    cleaned = re.sub(r'([a-z])([A-Z])', r'\1. \2', cleaned)

    cleaned = cleaned.strip()

    # Normalize NBSP *before* collapsing spaces and fixing " ." / " ,",
    # otherwise an NBSP next to a space leaves a double space behind.
    cleaned = cleaned.replace("\xa0", " ")

    cleaned = "".join(
        ch for ch in cleaned if unicodedata.category(ch)[0] != "C"
    )
    cleaned = re.sub(' +', ' ', cleaned)

    cleaned = cleaned.replace(";", ", and")
    cleaned = cleaned.replace(":", "")
    cleaned = cleaned.replace(" .", ".")
    cleaned = cleaned.replace(" ,", ",")

    # Remove leading enumeration such as "1. " or "- ".
    cleaned = cleaned.lstrip('0123456789.- ')

    # Remove repeated consecutive words.
    cleaned = re.sub(r'\b(\w+)( \1\b)+', r'\1', cleaned)
    return cleaned