| \ |
| import pandas as pd |
| import re, unicodedata |
| from html import unescape |
|
|
# Tagline length bounds in characters (inclusive), applied after cleaning.
MIN_LEN = 20
MAX_LEN = 60
# When True, drop any tagline containing non-ASCII characters.
KEEP_ASCII_ONLY = False
# Minimum fraction of alphabetic characters a tagline must contain.
MIN_ALPHA_RATIO = 0.60
# When True, drop taglines whose letters are almost entirely uppercase.
DROP_IF_ALL_CAPS = False
|
|
# Marketing buzzwords/phrases. Taglines containing any of these as a
# case-insensitive substring are dropped (see _contains_buzzy).
BUZZY = {
    "synergy", "cutting edge", "cutting-edge", "best in class", "best-in-class",
    "world class", "world-class", "state of the art", "state-of-the-art",
    "revolutionary", "disruptive platform", "next generation", "next-gen",
    "leading provider", "scalable solution"
}
|
|
# http(s):// or www.-prefixed URLs (case-insensitive).
URL_RE = re.compile(r"(https?://|www\.)\S+", re.I)
# E-mail addresses (case-insensitive thanks to re.I despite the A-Z classes).
EMAIL_RE = re.compile(r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", re.I)
# Phone-like runs: a digit, then 6+ digits/dashes/spaces/parens, then a digit.
PHONE_RE = re.compile(r"(\+?\d[\d\-\s()]{6,}\d)")
# Any run of whitespace (used to collapse to single spaces).
WS_RE = re.compile(r"\s+")
# Runs of punctuation/symbols. NOTE(review): not used in the visible code.
PUNCT_RE = re.compile(r"[^\w\s]+")
# NOTE(review): each symbol below is written with an emoji variation selector,
# so this character class matches ®, ©, ™ individually plus stray U+FE0F.
TM_RE = re.compile(r"[®️©️™️]")
|
|
| def _nfkc(s): return unicodedata.normalize("NFKC", s) |
|
|
def _clean_text(s: str) -> str:
    """Normalize one raw value into a single-line, cleaned string.

    Steps: coerce to str (None and float NaN become ""), unescape HTML
    entities, strip trademark symbols, NFKC-normalize, blank out literal
    "\\n"/"\\r" escape sequences, and collapse all whitespace runs.
    """
    # None check alone misses pandas missing values: str(float("nan"))
    # would produce the literal tagline "nan".
    if s is None or (isinstance(s, float) and s != s):
        s = ""
    s = str(s)
    s = unescape(s)
    # Strip ®/©/™ *before* NFKC: NFKC rewrites U+2122 (™) into the plain
    # letters "TM", after which TM_RE can no longer match it.
    s = TM_RE.sub("", s)
    s = _nfkc(s)
    # Upstream exports may contain two-character backslash escapes
    # ("\\n"/"\\r") rather than real newlines; real newlines are handled
    # by the whitespace collapse below. TODO confirm against the source data.
    s = s.replace("\\n", " ").replace("\\r", " ")
    s = WS_RE.sub(" ", s).strip()
    return s
|
|
| def _alpha_ratio(s: str) -> float: |
| if not s: return 0.0 |
| letters = sum(ch.isalpha() for ch in s) |
| return letters / max(1, len(s)) |
|
|
| def _looks_shouty(s: str) -> bool: |
| letters = [ch for ch in s if ch.isalpha()] |
| if not letters: return False |
| uppers = sum(ch.isupper() for ch in letters) |
| return uppers / len(letters) >= 0.85 |
|
|
def _contains_buzzy(s: str) -> bool:
    """Return True if the lowercased text contains any BUZZY phrase as a substring."""
    lowered = s.lower()
    for phrase in BUZZY:
        if phrase in lowered:
            return True
    return False
|
|
def _has_junk(s: str) -> bool:
    """Return True if *s* contains a URL, an e-mail address, or a phone-like number."""
    return any(pattern.search(s) for pattern in (URL_RE, EMAIL_RE, PHONE_RE))
|
|
| def _ascii_only(s: str) -> bool: |
| try: |
| s.encode("ascii"); return True |
| except Exception: |
| return False |
|
|
| def _dupe_key(s: str) -> str: |
| s = s.lower() |
| s = re.sub(r"[^\\w\\s]+", " ", s) |
| s = re.sub(r"\\s+", " ", s).strip() |
| return s |
|
|
def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Clean and filter a DataFrame of taglines.

    Requires a 'tagline' column; a missing 'description' column is
    synthesized from 'tagline'. Works on a copy, so the input DataFrame
    is not mutated. Rows are dropped when the cleaned tagline is empty,
    contains junk (URL/e-mail/phone), fails the alpha-ratio or length
    checks, matches a buzzword, or is a case/punctuation duplicate of an
    earlier row. Optional ASCII-only and all-caps filters are gated by
    module-level flags read at call time.

    Raises:
        ValueError: if the 'tagline' column is absent.
    """
    if "tagline" not in df.columns:
        raise ValueError("Input must contain a 'tagline' column.")
    df = df.copy()
    if "description" not in df.columns:
        df["description"] = df["tagline"]

    # Normalize both text columns (HTML unescape, NFKC, whitespace collapse).
    df["tagline"] = df["tagline"].map(_clean_text)
    df["description"] = df["description"].map(_clean_text)

    # Drop rows whose tagline cleaned down to nothing, then rows where
    # either column contains a URL, e-mail address, or phone-like number.
    df = df[(df["tagline"].str.len() > 0)]
    mask_junk = df["tagline"].map(_has_junk) | df["description"].map(_has_junk)
    df = df[~mask_junk]

    if KEEP_ASCII_ONLY:
        df = df[df["tagline"].map(_ascii_only)]

    # Content quality: enough letters, and length within [MIN_LEN, MAX_LEN]
    # (pandas .between is inclusive on both ends).
    df = df[df["tagline"].map(_alpha_ratio) >= MIN_ALPHA_RATIO]
    df = df[df["tagline"].str.len().between(MIN_LEN, MAX_LEN)]

    if DROP_IF_ALL_CAPS:
        df = df[~df["tagline"].map(_looks_shouty)]

    df = df[~df["tagline"].map(_contains_buzzy)]

    # Deduplicate last so the first *surviving* row wins; duplicated()
    # keeps the first occurrence of each normalized key.
    key = df["tagline"].map(_dupe_key)
    df = df.loc[~key.duplicated()].reset_index(drop=True)

    # Backfill descriptions that cleaned down to empty with the tagline.
    df.loc[df["description"].str.len() == 0, "description"] = df["tagline"]
    return df
|
|