| \ |
| import pandas as pd |
| import re, unicodedata |
| from html import unescape |
|
|
# Tagline length bounds in characters (inclusive), applied after cleaning.
MIN_LEN = 20
MAX_LEN = 60
# When True, drop any tagline containing non-ASCII characters.
KEEP_ASCII_ONLY = False
# Minimum fraction of alphabetic characters a tagline must contain.
MIN_ALPHA_RATIO = 0.60
# When True, drop taglines whose letters are almost entirely uppercase.
DROP_IF_ALL_CAPS = False
|
|
# Marketing buzzwords/phrases. Taglines containing any of these as a
# case-insensitive substring are dropped (see _contains_buzzy).
BUZZY = {
    "synergy", "cutting edge", "cutting-edge", "best in class", "best-in-class",
    "world class", "world-class", "state of the art", "state-of-the-art",
    "revolutionary", "disruptive platform", "next generation", "next-gen",
    "leading provider", "scalable solution"
}
|
|
# http(s):// or www.-prefixed URLs (case-insensitive).
URL_RE = re.compile(r"(https?://|www\.)\S+", re.I)
# E-mail addresses (case-insensitive thanks to re.I despite the A-Z classes).
EMAIL_RE = re.compile(r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", re.I)
# Phone-like runs: a digit, then 6+ digits/dashes/spaces/parens, then a digit.
PHONE_RE = re.compile(r"(\+?\d[\d\-\s()]{6,}\d)")
# Any run of whitespace (used to collapse to single spaces).
WS_RE = re.compile(r"\s+")
# Runs of punctuation/symbols. NOTE(review): not used in the visible code.
PUNCT_RE = re.compile(r"[^\w\s]+")
# NOTE(review): each symbol below is written with an emoji variation selector,
# so this character class matches ®, ©, ™ individually plus stray U+FE0F.
TM_RE = re.compile(r"[®️©️™️]")
|
|
| def _nfkc(s): return unicodedata.normalize("NFKC", s) |
|
|
def _clean_text(s: str) -> str:
    """Normalize one raw value into a single-line, cleaned string.

    Steps: coerce to str (None and float NaN become ""), unescape HTML
    entities, strip trademark symbols, NFKC-normalize, blank out literal
    "\\n"/"\\r" escape sequences, and collapse all whitespace runs.
    """
    # None check alone misses pandas missing values: str(float("nan"))
    # would produce the literal tagline "nan".
    if s is None or (isinstance(s, float) and s != s):
        s = ""
    s = str(s)
    s = unescape(s)
    # Strip ®/©/™ *before* NFKC: NFKC rewrites U+2122 (™) into the plain
    # letters "TM", after which TM_RE can no longer match it.
    s = TM_RE.sub("", s)
    s = _nfkc(s)
    # Upstream exports may contain two-character backslash escapes
    # ("\\n"/"\\r") rather than real newlines; real newlines are handled
    # by the whitespace collapse below. TODO confirm against the source data.
    s = s.replace("\\n", " ").replace("\\r", " ")
    s = WS_RE.sub(" ", s).strip()
    return s
|
|
| def _alpha_ratio(s: str) -> float: |
| if not s: return 0.0 |
| letters = sum(ch.isalpha() for ch in s) |
| return letters / max(1, len(s)) |
|
|
| def _looks_shouty(s: str) -> bool: |
| letters = [ch for ch in s if ch.isalpha()] |
| if not letters: return False |
| uppers = sum(ch.isupper() for ch in letters) |
| return uppers / len(letters) >= 0.85 |
|
|
def _contains_buzzy(s: str) -> bool:
    """Return True if the lowercased text contains any BUZZY phrase as a substring."""
    lowered = s.lower()
    for phrase in BUZZY:
        if phrase in lowered:
            return True
    return False
|
|
def _has_junk(s: str) -> bool:
    """Return True if *s* contains a URL, an e-mail address, or a phone-like number."""
    return any(pattern.search(s) for pattern in (URL_RE, EMAIL_RE, PHONE_RE))
|
|
| def _ascii_only(s: str) -> bool: |
| try: |
| s.encode("ascii"); return True |
| except Exception: |
| return False |
|
|
| def _dupe_key(s: str) -> str: |
| s = s.lower() |
| s = re.sub(r"[^\\w\\s]+", " ", s) |
| s = re.sub(r"\\s+", " ", s).strip() |
| return s |
|
|
def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Clean and filter a DataFrame of taglines.

    Requires a 'tagline' column; a missing 'description' column is
    synthesized from 'tagline'. Works on a copy, so the input DataFrame
    is not mutated. Rows are dropped when the cleaned tagline is empty,
    contains junk (URL/e-mail/phone), fails the alpha-ratio or length
    checks, matches a buzzword, or is a case/punctuation duplicate of an
    earlier row. Optional ASCII-only and all-caps filters are gated by
    module-level flags read at call time.

    Raises:
        ValueError: if the 'tagline' column is absent.
    """
    if "tagline" not in df.columns:
        raise ValueError("Input must contain a 'tagline' column.")
    df = df.copy()
    if "description" not in df.columns:
        df["description"] = df["tagline"]

    # Normalize both text columns (HTML unescape, NFKC, whitespace collapse).
    df["tagline"] = df["tagline"].map(_clean_text)
    df["description"] = df["description"].map(_clean_text)

    # Drop rows whose tagline cleaned down to nothing, then rows where
    # either column contains a URL, e-mail address, or phone-like number.
    df = df[(df["tagline"].str.len() > 0)]
    mask_junk = df["tagline"].map(_has_junk) | df["description"].map(_has_junk)
    df = df[~mask_junk]

    if KEEP_ASCII_ONLY:
        df = df[df["tagline"].map(_ascii_only)]

    # Content quality: enough letters, and length within [MIN_LEN, MAX_LEN]
    # (pandas .between is inclusive on both ends).
    df = df[df["tagline"].map(_alpha_ratio) >= MIN_ALPHA_RATIO]
    df = df[df["tagline"].str.len().between(MIN_LEN, MAX_LEN)]

    if DROP_IF_ALL_CAPS:
        df = df[~df["tagline"].map(_looks_shouty)]

    df = df[~df["tagline"].map(_contains_buzzy)]

    # Deduplicate last so the first *surviving* row wins; duplicated()
    # keeps the first occurrence of each normalized key.
    key = df["tagline"].map(_dupe_key)
    df = df.loc[~key.duplicated()].reset_index(drop=True)

    # Backfill descriptions that cleaned down to empty with the tagline.
    df.loc[df["description"].str.len() == 0, "description"] = df["tagline"]
    return df
|
|