Spaces:

imanerd
/

l2-demo

Sleeping

initial deploy

98b332e 11 days ago

1.4 kB

	"""Light text normalization. Intentionally non-destructive: punctuation is
	collapsed, not stripped, so VADER and downstream feature extractors still see
	exclamation marks, question marks, and repeated chars that carry signal.
	"""

	from __future__ import annotations

	import re
	import unicodedata

	_WS = re.compile(r"\s+")
	_REPEAT_PUNCT = re.compile(r"([!?.,])\1{2,}") # !!!! -> !
	_REPEAT_CHARS = re.compile(r"(.)\1{3,}") # sooooo -> soo (keep one duplicate)
	_DASHES = re.compile(r"[‐-―−]") # unicode dashes -> ascii dash
	_QUOTES = re.compile(r"[‘’‚‛]") # curly singles -> '
	_DQUOTES = re.compile(r"[“”„‟]") # curly doubles -> "


	def clean(text: str) -> str:
	if text is None:
	return ""
	t = unicodedata.normalize("NFKC", str(text))
	t = _DASHES.sub("-", t)
	t = _QUOTES.sub("'", t)
	t = _DQUOTES.sub('"', t)
	t = _REPEAT_PUNCT.sub(r"\1", t)
	t = _REPEAT_CHARS.sub(r"\1\1", t)
	t = _WS.sub(" ", t).strip()
	return t.lower()


	def clean_preserve_case(text: str) -> str:
	"""Same as clean() but keeps case. Useful for NER and caps-as-emotion."""
	if text is None:
	return ""
	t = unicodedata.normalize("NFKC", str(text))
	t = _DASHES.sub("-", t)
	t = _QUOTES.sub("'", t)
	t = _DQUOTES.sub('"', t)
	t = _REPEAT_PUNCT.sub(r"\1", t)
	t = _REPEAT_CHARS.sub(r"\1\1", t)
	return _WS.sub(" ", t).strip()