EditLens / preprocess.py
multimodalart's picture
multimodalart HF Staff
EditLens AI-editing scorer demo (RoBERTa-large, ZeroGPU)
a4440ba verified
Raw
History Blame Contribute Delete
1.16 kB
import re
import emoji
# Opening phrases that mark the first non-blank line as an assistant-style
# header. remove_ai_header compares them with str.startswith, so the match is
# a case-sensitive prefix match.
BOILERPLATE_STARTS = [
    "Sure",
    "Here",
    "Abstract",
    "Title",
    "I'm happy to help",
    "Certainly",
]
def normalize_whitespace(text):
    """Collapse each run of whitespace into one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
def normalize_emoji(text):
    """Return *text* after passing it through ``emoji.demojize``.

    The conversion itself is delegated entirely to the ``emoji`` package.
    """
    demojized = emoji.demojize(text)
    return demojized
def remove_think_tag(text):
    """Drop a leading reasoning section terminated by ``</think>``.

    Everything up to and including the first ``</think>`` is discarded and
    the remainder is returned with surrounding whitespace stripped. Text that
    does not contain the closing tag is returned unchanged.
    """
    # partition cuts at the first tag only, so a later literal "</think>"
    # inside the answer no longer truncates everything that follows it.
    _, closing_tag, remainder = text.partition("</think>")
    if not closing_tag:
        return text
    return remainder.strip()
def remove_ai_header(text):
    """Strip a boilerplate opening line from *text* when one is detected.

    The first non-blank line is inspected after removing its leading
    non-alphanumeric characters and any emoji. If that cleaned line starts
    with one of ``BOILERPLATE_STARTS`` and at least one more non-blank line
    follows, the remaining non-blank lines are returned joined by newlines.
    In every other case *text* is returned untouched.
    """
    non_blank = [line for line in text.split("\n") if line.strip()]
    if not non_blank:
        return text

    # Only a cleaned copy of the first line is used for the prefix check;
    # the lines that end up in the result are never modified.
    head = re.sub(r"^[^a-zA-Z0-9]*", "", non_blank[0])
    head = emoji.replace_emoji(head, "")

    looks_like_header = head.startswith(tuple(BOILERPLATE_STARTS))
    if looks_like_header and len(non_blank) > 1:
        return "\n".join(non_blank[1:])
    return text
def clean_text(text):
    """Run the full preprocessing pipeline on *text*.

    Steps, in order: emoji normalisation, removal of a ``</think>``-terminated
    prefix, removal of a boilerplate header line, lower-casing, and whitespace
    normalisation.
    """
    # The order matters: each step consumes the previous step's output.
    for step in (normalize_emoji, remove_think_tag, remove_ai_header):
        text = step(text)
    return normalize_whitespace(text.lower())
def count_words(text):
    """Return how many word-boundary-delimited runs of word characters *text* has."""
    word_matches = re.finditer(r"\b\w+\b", text)
    return sum(1 for _ in word_matches)