Spaces:

vineet88
/

context-aware-safety-ml-api

Runtime error

Deploy standalone ML service

16f57d9 verified about 2 months ago

954 Bytes

	import re
	import unicodedata


	class TextNormalizer:
	    """Canonicalize free-form text with a fixed sequence of regex rewrites.

	    The patterns are compiled once as class attributes and shared by all
	    instances; ``normalize`` applies them in a fixed order.
	    """

	    # An http:// or https:// link, or a bare "www." link, each running up to
	    # the next whitespace character. The two forms are alternatives.
	    url_pattern = re.compile(r"https?://\S+|www\.\S+", re.IGNORECASE)
	    # "@" followed by one or more word characters.
	    mention_pattern = re.compile(r"@\w+")
	    # Any run of whitespace (spaces, tabs, newlines, ...).
	    whitespace_pattern = re.compile(r"\s+")
	    # The same ASCII letter three or more times in a row (case-sensitive).
	    repeated_latin_pattern = re.compile(r"([A-Za-z])\1{2,}")
	    # Code points U+200B..U+200F, U+2060 and U+FEFF (zero-width / invisible
	    # formatting characters).
	    zero_width_pattern = re.compile(r"[\u200b-\u200f\u2060\ufeff]")
	    # The same one of ! ? . , three or more times in a row.
	    repeated_punctuation_pattern = re.compile(r"([!?.,])\1{2,}")

	    def normalize(self, text: str) -> str:
	        """Return a normalized copy of *text*.

	        Steps, in order:

	        1. Apply Unicode NFKC normalization.
	        2. Delete the invisible characters matched by ``zero_width_pattern``.
	        3. Replace URLs with ``<URL>`` and @mentions with ``<USER>``.
	        4. Shorten runs of 3+ identical ASCII letters, and of 3+ identical
	           ``! ? . ,`` characters, to exactly two.
	        5. Collapse every whitespace run to a single space and trim the ends.

	        Args:
	            text: The raw input string.

	        Returns:
	            The normalized string (empty if *text* is empty or blank).
	        """
	        normalized = unicodedata.normalize("NFKC", text)
	        normalized = self.zero_width_pattern.sub("", normalized)
	        # URLs must be masked before letter squeezing, otherwise the "www"
	        # prefix itself would be shortened to "ww".
	        normalized = self.url_pattern.sub("<URL>", normalized)
	        normalized = self.mention_pattern.sub("<USER>", normalized)
	        normalized = self.repeated_latin_pattern.sub(r"\1\1", normalized)
	        normalized = self.repeated_punctuation_pattern.sub(r"\1\1", normalized)
	        normalized = self.whitespace_pattern.sub(" ", normalized)
	        # Trim last: zero-width characters are not whitespace for str.strip(),
	        # so trimming before removing them can leave edge spaces behind.
	        return normalized.strip()