Spaces:
Running on Zero
Running on Zero
| import re | |
| import emoji | |
# Case-sensitive prefixes that mark the first non-empty line of a text as an
# assistant-style preamble; remove_ai_header() drops that line when it
# starts with any of these.
BOILERPLATE_STARTS = [
    "Sure",
    "Here",
    "Abstract",
    "Title",
    "I'm happy to help",
    "Certainly",
]
def normalize_whitespace(text):
    """Collapse each run of whitespace in *text* to one space and trim the ends."""
    # Every whitespace run (spaces, tabs, newlines, ...) becomes a single space.
    collapsed = re.sub(r"\s+", " ", text)
    # Collapsing can leave one space at either end; strip removes it.
    return collapsed.strip()
def normalize_emoji(text):
    """Return *text* after passing it through ``emoji.demojize``.

    Per the ``emoji`` package, ``demojize`` swaps emoji characters for their
    textual ``:shortcode:`` names; all other characters are left as they are.
    """
    demojized = emoji.demojize(text)
    return demojized
def remove_think_tag(text):
    """Strip a leading reasoning section terminated by ``</think>``.

    Everything up to and including the first ``</think>`` is discarded and
    the remainder is returned with surrounding whitespace stripped. Text
    that does not contain the tag is returned unchanged (not stripped).
    """
    _, closing_tag, remainder = text.partition("</think>")
    if not closing_tag:
        # No closing tag present: nothing to remove.
        return text
    # partition() keeps *all* text after the first tag. Indexing the result
    # of split() with [1] would silently drop whatever follows a second
    # "</think>" occurrence.
    return remainder.strip()
def remove_ai_header(text):
    """Drop the first non-empty line of *text* when it looks like boilerplate.

    The text is split on newlines and blank lines are ignored. The first
    remaining line is compared, after removing leading non-alphanumeric
    characters and any emoji, against the prefixes in ``BOILERPLATE_STARTS``
    (case-sensitive). On a match, the other non-empty lines are returned
    joined by single newlines. In every other case, including when there is
    no second line to fall back on, *text* is returned unchanged.
    """
    lines = [line for line in text.split("\n") if line.strip()]
    # With zero or one non-empty line there is nothing that could be kept
    # after removing a header, so the input is returned as is.
    if len(lines) < 2:
        return text

    # Normalise the candidate header: strip leading punctuation/symbols,
    # then remove emoji anywhere in the line before the prefix comparison.
    candidate = re.sub(r"^[^a-zA-Z0-9]*", "", lines[0])
    candidate = emoji.replace_emoji(candidate, "")

    if not candidate.startswith(tuple(BOILERPLATE_STARTS)):
        return text
    return "\n".join(lines[1:])
def clean_text(text):
    """Run the full cleaning pipeline on *text* and return the result.

    Steps, in order: emoji normalisation, ``</think>`` section removal,
    boilerplate header removal, lower-casing, whitespace normalisation.
    """
    # NOTE(review): normalize_emoji runs before remove_ai_header, so a header
    # line that begins with an emoji presumably reaches remove_ai_header as
    # ":name: ..." text and may no longer match a boilerplate prefix.
    # Confirm this order is intended before changing it.
    pipeline = (
        normalize_emoji,
        remove_think_tag,
        remove_ai_header,
        str.lower,
        normalize_whitespace,
    )
    for step in pipeline:
        text = step(text)
    return text
def count_words(text):
    """Return how many words *text* contains.

    A word is a maximal run of ``\\w`` characters (letters, digits,
    underscore) delimited by word boundaries.
    """
    word_matches = re.finditer(r"\b\w+\b", text)
    return sum(1 for _ in word_matches)