Spaces:
Running on Zero
Running on Zero
File size: 1,162 Bytes
a4440ba | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 | import re
import emoji
# Prefixes that mark an opening paragraph as assistant boilerplate.
# remove_ai_header() compares them case-sensitively against the start of the
# first non-empty paragraph.
BOILERPLATE_STARTS = [
    "Sure",
    "Here",
    "Abstract",
    "Title",
    "I'm happy to help",
    "Certainly",
]
def normalize_whitespace(text):
    """Collapse every run of whitespace into one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
def normalize_emoji(text):
    """Return ``text`` with emoji converted by ``emoji.demojize``.

    ``demojize`` substitutes each emoji with its textual shortcode, so the
    result contains only the text form of any emoji present in the input.
    """
    demojized = emoji.demojize(text)
    return demojized
def remove_think_tag(text):
    """Drop a leading reasoning section terminated by ``</think>``.

    When ``text`` contains a ``</think>`` closing tag, everything up to and
    including the first occurrence is discarded and the remainder is returned
    with surrounding whitespace stripped. Text without the tag is returned
    unchanged (and is not stripped).
    """
    # partition() cuts only at the first tag. The previous split(...)[1]
    # returned just the segment between the first and second tags, silently
    # dropping the rest of the answer if "</think>" appeared again.
    _, tag, remainder = text.partition("</think>")
    if not tag:
        return text
    return remainder.strip()
def remove_ai_header(text):
    """Remove a leading boilerplate paragraph such as "Sure, here is ...".

    ``text`` is split on newlines and blank lines are ignored. If the first
    remaining paragraph, once emoji and leading non-alphanumeric characters
    are removed, starts with one of ``BOILERPLATE_STARTS`` (case-sensitive),
    the remaining paragraphs are returned joined by single newlines.

    ``text`` is returned unchanged when there is no such header, or when the
    header is the only paragraph (so the result is never emptied).
    """
    paragraphs = [line for line in text.split("\n") if line.strip()]
    if len(paragraphs) < 2:
        # Either nothing to inspect or nothing would be left after removal.
        return text

    # Remove emoji before trimming leading punctuation/whitespace. Doing it
    # the other way round left a leading space behind when the paragraph
    # began with an emoji whose first code point is alphanumeric (e.g. a
    # digit keycap), which made the prefix check below fail.
    opener = emoji.replace_emoji(paragraphs[0], "")
    opener = re.sub(r"^[^a-zA-Z0-9]*", "", opener)

    if opener.startswith(tuple(BOILERPLATE_STARTS)):
        return "\n".join(paragraphs[1:])
    return text
def clean_text(text):
    """Run the full cleaning pipeline on a piece of generated text.

    Steps, in order:
      1. drop a reasoning section ending in ``</think>``;
      2. drop a boilerplate opening paragraph;
      3. convert emoji to their text form;
      4. lowercase;
      5. collapse whitespace and trim.

    Returns the cleaned string.
    """
    text = remove_think_tag(text)
    # Header removal must see raw emoji: it strips them itself. Demojizing
    # first turned a leading emoji into a ":name:" shortcode, which hid the
    # boilerplate prefix behind the shortcode text and left the header in.
    text = remove_ai_header(text)
    text = normalize_emoji(text)
    text = text.lower()
    return normalize_whitespace(text)
def count_words(text):
    """Return how many word tokens ``text`` contains.

    A word is a maximal run of regex word characters (letters, digits and
    underscore) delimited by word boundaries.
    """
    word_pattern = re.compile(r"\b\w+\b")
    return sum(1 for _ in word_pattern.finditer(text))
|