"""Text-cleaning helpers.

Covers whitespace and emoji normalization, removal of a leading reasoning
block and of a boilerplate opening paragraph, and word counting.
"""

import re

import emoji

# Prefixes that mark the first paragraph as a boilerplate opener.
# Matching is case-sensitive and done with ``str.startswith``.
BOILERPLATE_STARTS = [
    "Sure",
    "Here",
    "Abstract",
    "Title",
    "I'm happy to help",
    "Certainly",
]

# NOTE(review): SOURCE had an empty string literal here, which made
# ``remove_think_tag`` raise ``ValueError: empty separator`` on every call.
# The literal looks like a tag that was stripped as markup; "</think>" is
# inferred from the function name -- confirm against the upstream file.
_THINK_END_TAG = "</think>"

_WHITESPACE_RUN = re.compile(r"\s+")
_LEADING_NON_ALNUM = re.compile(r"^[^a-zA-Z0-9]*")
_WORD = re.compile(r"\b\w+\b")


def normalize_whitespace(text: str) -> str:
    """Collapse every whitespace run to one space and trim both ends."""
    return _WHITESPACE_RUN.sub(" ", text).strip()


def normalize_emoji(text: str) -> str:
    """Return *text* passed through ``emoji.demojize``."""
    return emoji.demojize(text)


def remove_think_tag(text: str) -> str:
    """Drop everything up to and including the first think end tag.

    The remainder is stripped of surrounding whitespace. Text that does not
    contain the tag is returned unchanged (and unstripped).
    """
    _, tag, remainder = text.partition(_THINK_END_TAG)
    if not tag:
        return text
    return remainder.strip()


def remove_ai_header(text: str) -> str:
    """Remove the first paragraph when it starts with a boilerplate phrase.

    Paragraphs are the non-blank lines of *text*. The first one is compared
    against ``BOILERPLATE_STARTS`` after leading non-alphanumeric characters
    and emoji are removed from it. On a match the remaining paragraphs are
    returned joined by ``"\\n"`` (blank lines are dropped as a side effect).

    The text is returned unchanged when nothing matches, or when the
    boilerplate paragraph is the only one (so the result is never emptied).
    """
    paragraphs = [line for line in text.split("\n") if line.strip()]
    # With fewer than two paragraphs there is nothing that could be removed.
    if len(paragraphs) < 2:
        return text

    opener = _LEADING_NON_ALNUM.sub("", paragraphs[0])
    opener = emoji.replace_emoji(opener, "")
    if opener.startswith(tuple(BOILERPLATE_STARTS)):
        return "\n".join(paragraphs[1:])
    return text


def clean_text(text: str) -> str:
    """Run the full cleaning pipeline and return the normalized text.

    Steps, in order: drop the think block, drop a boilerplate first
    paragraph, demojize, lowercase, collapse whitespace.
    """
    text = remove_think_tag(text)
    # Header removal must see the raw emoji: once demojized, a leading emoji
    # becomes ":name:" text, the prefix stripping leaves "name: ..." behind,
    # and the boilerplate check can no longer match.
    text = remove_ai_header(text)
    text = normalize_emoji(text)
    text = text.lower()
    return normalize_whitespace(text)


def count_words(text: str) -> int:
    """Return the number of ``\\w+`` word tokens in *text*."""
    return len(_WORD.findall(text))