taxdoc-preprocessor / cleaner.py
iamnew123's picture
Upload 5 files
7780d69 verified
raw
history blame contribute delete
486 Bytes
import re
def clean_text(text: str) -> str:
    """Return *text* with boilerplate lines removed and spacing normalized.

    The input is processed line by line. A line is dropped when, after
    stripping surrounding whitespace, it:

    * is empty or shorter than 5 characters;
    * contains a page marker ("page <digits>"), a "www." URL fragment, a
      social-media/subscription keyword, or an "@" sign (case-insensitive);
    * is entirely upper-case and shorter than 40 characters.

    In every surviving line, runs of two or more whitespace characters are
    collapsed to a single space. The kept lines are joined with "\n".

    Args:
        text: Raw text, possibly spanning multiple lines.

    Returns:
        The cleaned text; an empty string if no line survives.
    """
    # Raw strings need single backslashes: r'\d' is a digit, whereas r'\\d'
    # is a literal backslash followed by "d" and would never match real
    # page numbers, URLs or whitespace runs.
    noise_pattern = re.compile(
        r'(page \d+|www\.|linkedin|facebook|youtube|subscribe|@)',
        re.IGNORECASE,
    )
    whitespace_run = re.compile(r'\s{2,}')

    cleaned = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        # Too short to carry meaningful content.
        if len(line) < 5:
            continue
        # Page numbers, links, social handles, e-mail addresses, etc.
        if noise_pattern.search(line):
            continue
        # Short all-caps lines are dropped as well.
        if line.isupper() and len(line) < 40:
            continue
        cleaned.append(whitespace_run.sub(' ', line))
    return "\n".join(cleaned)