Spaces:

multimodalart
/

EditLens

Running on Zero

EditLens / preprocess.py

EditLens AI-editing scorer demo (RoBERTa-large, ZeroGPU)

a4440ba verified 8 days ago

1.16 kB

	import re

	import emoji


	# Opening phrases treated as assistant boilerplate. remove_ai_header drops
	# the first non-blank line of a text when that line, after its leading
	# non-alphanumeric characters are stripped, starts with one of these.
	# Matching is a case-sensitive prefix test (str.startswith).
	BOILERPLATE_STARTS = [
	"Sure",
	"Here",
	"Abstract",
	"Title",
	"I'm happy to help",
	"Certainly",
	]


	def normalize_whitespace(text):
	    """Collapse each run of whitespace into one space and trim both ends."""
	    # Splitting on whitespace discards leading/trailing runs and empty
	    # fields, so re-joining with single spaces yields the collapsed form.
	    words = text.split()
	    return " ".join(words)


	def normalize_emoji(text):
	    """Return ``text`` after passing it through ``emoji.demojize``.

	    Per the ``emoji`` package documentation, ``demojize`` replaces emoji
	    characters with their textual shortcodes (for example ``:thumbs_up:``).
	    """
	    demojized = emoji.demojize(text)
	    return demojized


	def remove_think_tag(text):
	    """Drop a leading reasoning block, keeping what follows ``</think>``.

	    If ``text`` contains a closing ``</think>`` tag, everything after the
	    first such tag is returned with surrounding whitespace stripped.
	    Otherwise ``text`` is returned unchanged (no stripping).
	    """
	    # partition cuts at the first tag only, so a later literal "</think>"
	    # inside the answer no longer truncates it; split(...)[1] kept just the
	    # segment between the first two tags and discarded the rest.
	    _, tag, remainder = text.partition("</think>")
	    return remainder.strip() if tag else text


	def remove_ai_header(text):
	    """Strip a boilerplate opening line from ``text`` when one is detected.

	    The text is split on newlines and blank lines are ignored. The first
	    remaining line is checked, after its leading non-alphanumeric characters
	    and any emoji are removed, against the prefixes in BOILERPLATE_STARTS.
	    On a match, and only if other non-blank lines exist, the remaining
	    non-blank lines are returned joined by single newlines. In every other
	    case ``text`` is returned unchanged.
	    """
	    non_blank = [line for line in text.split("\n") if line.strip()]
	    if not non_blank:
	        return text

	    # Normalise the opener so bullets, quotes or emoji in front of the
	    # phrase do not hide it from the prefix test.
	    opener = re.sub(r"^[^a-zA-Z0-9]*", "", non_blank[0])
	    opener = emoji.replace_emoji(opener, "")

	    is_boilerplate = opener.startswith(tuple(BOILERPLATE_STARTS))
	    if is_boilerplate and len(non_blank) > 1:
	        return "\n".join(non_blank[1:])
	    return text


	def clean_text(text):
	    """Run the full preprocessing pipeline on ``text`` and return the result.

	    Steps, in order: normalize_emoji, remove_think_tag, remove_ai_header,
	    lowercasing, then normalize_whitespace.
	    """
	    # Header removal runs before lowercasing because the boilerplate
	    # prefixes it matches against are capitalised.
	    without_header = remove_ai_header(remove_think_tag(normalize_emoji(text)))
	    return normalize_whitespace(without_header.lower())


	def count_words(text):
	    """Return how many word tokens ``text`` contains.

	    A token is a maximal run of regex word characters delimited by word
	    boundaries, so punctuation such as apostrophes or hyphens splits tokens.
	    """
	    word_matches = re.finditer(r"\b\w+\b", text)
	    return sum(1 for _ in word_matches)