Spaces:

apps1
/

finbertteacher_v1

Sleeping

File size: 7,642 Bytes

89c54bf

"""SMS/text preprocessing for the BERT classifier (stdlib only)."""

from __future__ import annotations

import re
import unicodedata
from typing import Optional

_CHAR_REPLACEMENTS: dict[str, str] = {}


def _add(chars: str, replacement: str) -> None:
    for ch in chars:
        _CHAR_REPLACEMENTS[ch] = replacement


_add("—–‐‑‒―−", "-")
_add("„‟«»\u201c\u201d", '"')
_CHAR_REPLACEMENTS["\u201c"] = '"'
_CHAR_REPLACEMENTS["\u201d"] = '"'
_add("‚‛′‵\u2018\u2019", "'")
_CHAR_REPLACEMENTS["\u2018"] = "'"
_CHAR_REPLACEMENTS["\u2019"] = "'"
_CHAR_REPLACEMENTS["…"] = "..."
_CHAR_REPLACEMENTS["‥"] = ".."
_CHAR_REPLACEMENTS["․"] = "."
_add("•◦●○▪▫∙⁌⁍⁃", "*")
_add("\u00a0\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u200c\u200d\u202f\u205f\u3000", " ")
_CHAR_REPLACEMENTS["\u200b"] = ""
_CHAR_REPLACEMENTS["\u200c"] = ""
_CHAR_REPLACEMENTS["\u200d"] = ""
_CHAR_REPLACEMENTS["×"] = "*"
_CHAR_REPLACEMENTS["÷"] = "/"
_CHAR_REPLACEMENTS["±"] = "+/-"
_CHAR_REPLACEMENTS["≈"] = "~"
_CHAR_REPLACEMENTS["≠"] = "!="
_CHAR_REPLACEMENTS["≤"] = "<="
_CHAR_REPLACEMENTS["≥"] = ">="
_CHAR_REPLACEMENTS["∞"] = "infinite"
_CHAR_REPLACEMENTS["→"] = "->"
_CHAR_REPLACEMENTS["←"] = "<-"
_CHAR_REPLACEMENTS["↑"] = "^"
_CHAR_REPLACEMENTS["↓"] = "v"
_CHAR_REPLACEMENTS["↔"] = "<->"
_CHAR_REPLACEMENTS["©"] = "(c)"
_CHAR_REPLACEMENTS["®"] = "(R)"
_CHAR_REPLACEMENTS["™"] = "TM"
_CHAR_REPLACEMENTS["°"] = "degree"
_CHAR_REPLACEMENTS["‰"] = "0/00"
_CHAR_REPLACEMENTS["‱"] = "0/000"
_CHAR_REPLACEMENTS["†"] = "+"
_CHAR_REPLACEMENTS["‡"] = "++"
_CHAR_REPLACEMENTS["§"] = "S"
_CHAR_REPLACEMENTS["¶"] = "P"
_CHAR_REPLACEMENTS["‹"] = "<"
_CHAR_REPLACEMENTS["›"] = ">"
_CHAR_REPLACEMENTS.update(
    {
        "½": "1/2",
        "¼": "1/4",
        "¾": "3/4",
        "⅓": "1/3",
        "⅔": "2/3",
        "⅕": "1/5",
        "⅖": "2/5",
        "⅗": "3/5",
        "⅘": "4/5",
        "⅙": "1/6",
        "⅚": "5/6",
        "⅛": "1/8",
        "⅜": "3/8",
        "⅝": "5/8",
        "⅞": "7/8",
        "⅐": "1/7",
        "⅑": "1/9",
        "⅒": "1/10",
    }
)
_CHAR_REPLACEMENTS.update(
    {
        "¹": "^1",
        "²": "^2",
        "³": "^3",
        "⁴": "^4",
        "⁵": "^5",
        "⁶": "^6",
        "⁷": "^7",
        "⁸": "^8",
        "⁹": "^9",
        "⁰": "^0",
        "⁺": "^+",
        "⁻": "^-",
        "⁼": "^=",
        "⁽": "^(",
        "⁾": "^)",
    }
)
_CHAR_REPLACEMENTS.update(
    {
        "₁": "_1",
        "₂": "_2",
        "₃": "_3",
        "₄": "_4",
        "₅": "_5",
        "₆": "_6",
        "₇": "_7",
        "₈": "_8",
        "₉": "_9",
        "₀": "_0",
        "₊": "_+",
        "₋": "_-",
        "₌": "_=",
        "₍": "_(",
        "₎": "_)",
    }
)
for _i in range(10):
    _CHAR_REPLACEMENTS[chr(0xFF10 + _i)] = str(_i)
for _i in range(26):
    _CHAR_REPLACEMENTS[chr(0xFF21 + _i)] = chr(ord("A") + _i)
    _CHAR_REPLACEMENTS[chr(0xFF41 + _i)] = chr(ord("a") + _i)
_CHAR_REPLACEMENTS.update(
    {
        "！": "!",
        "＂": '"',
        "＃": "#",
        "＄": "$",
        "％": "%",
        "＆": "&",
        "＇": "'",
        "（": "(",
        "）": ")",
        "＊": "*",
        "＋": "+",
        "，": ",",
        "－": "-",
        "．": ".",
        "／": "/",
        "：": ":",
        "；": ";",
        "＜": "<",
        "＝": "=",
        "＞": ">",
        "？": "?",
        "＠": "@",
        "［": "[",
        "＼": "\\",
        "］": "]",
        "＾": "^",
        "＿": "_",
        "｀": "`",
        "｛": "{",
        "｜": "|",
        "｝": "}",
        "～": "~",
    }
)
for _i in range(12):
    n = str(_i + 1)
    _CHAR_REPLACEMENTS[chr(0x2160 + _i)] = n
    _CHAR_REPLACEMENTS[chr(0x2170 + _i)] = n
_CHAR_REPLACEMENTS["Ⅼ"] = "50"
_CHAR_REPLACEMENTS["Ⅽ"] = "100"
_CHAR_REPLACEMENTS["Ⅾ"] = "500"
_CHAR_REPLACEMENTS["Ⅿ"] = "1000"
_CHAR_REPLACEMENTS["ⅼ"] = "50"
_CHAR_REPLACEMENTS["ⅽ"] = "100"
_CHAR_REPLACEMENTS["ⅾ"] = "500"
_CHAR_REPLACEMENTS["ⅿ"] = "1000"
for _i in range(26):
    _CHAR_REPLACEMENTS[chr(0x1D00C + _i)] = chr(ord("a") + _i)

_DESLOPIFY_TABLE = str.maketrans(_CHAR_REPLACEMENTS)

STATIC_EMAIL = "a@b.com"
STATIC_URL = "www.a.com"
_EMAIL_RE = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
_URL_RE = re.compile(r"https?://[^\s]+|www\.[^\s]+", flags=re.IGNORECASE)
ORG_CANDIDATE_SEP = " [SEP] "

_CURRENCY_ISO_CODES: tuple[str, ...] = (
    "AED",
    "BDT",
    "CNY",
    "EUR",
    "GBP",
    "INR",
    "JPY",
    "KRW",
    "LKR",
    "NPR",
    "PKR",
    "SAR",
    "USD",
)
_CURRENCY_CODE_ALT = "|".join(sorted(_CURRENCY_ISO_CODES, key=len, reverse=True))
_CURRENCY_CODE_RE = re.compile(rf"(?i)(?:{_CURRENCY_CODE_ALT})(?![A-Za-z])")
_CURRENCY_RUPEE_WORDS_RE = re.compile(r"(?i)(?<![A-Za-z])(?:rupees|rupee)(?![A-Za-z])")
_CURRENCY_RS_RE = re.compile(r"(?i)(?<![A-Za-z])rs(?![A-Za-z])")
_CURRENCY_SYM_RE = re.compile(r"[€£¢¥₹₽₿₩](?![A-Za-z])")
_CURRENCY_DUP_DOLLAR_RE = re.compile(r"\${2,}")


def normalize_currency_markers(text: str) -> str:
    """Map common ISO codes and currency symbols to '$' when not followed by [A-Za-z]."""
    if not text:
        return text
    text = _CURRENCY_CODE_RE.sub("$", text)
    text = _CURRENCY_RUPEE_WORDS_RE.sub("$", text)
    text = _CURRENCY_RS_RE.sub("$", text)
    text = _CURRENCY_SYM_RE.sub("$", text)
    text = _CURRENCY_DUP_DOLLAR_RE.sub("$", text)
    return text


def deslopify(text: str) -> str:
    if not text:
        return text
    return text.translate(_DESLOPIFY_TABLE)


def filter_ascii(text: str) -> str:
    if not text:
        return text
    return "".join(ch for ch in text if (32 <= ord(ch) <= 126) or ch in "\t\n\f\r")


def normalize_terminal_punctuation(text: str) -> str:
    if not text:
        return ""
    stripped = text.strip()
    if not stripped:
        return ""
    last = stripped[-1]
    if unicodedata.category(last)[0] != "P":
        return f"{stripped}."
    return stripped


def digits_to_ones(text: str) -> str:
    return "".join("1" if c.isdigit() else c for c in text)


def mask_emails(text: str) -> str:
    return _EMAIL_RE.sub(STATIC_EMAIL, text)


def mask_urls(text: str) -> str:
    return _URL_RE.sub(STATIC_URL, text)


def normalize_freeform_text(text: str) -> str:
    text = mask_emails(text)
    text = mask_urls(text)
    return digits_to_ones(text)


def preprocess_text(raw_text: str) -> str:
    if not raw_text:
        return ""
    text = deslopify(raw_text)
    text = normalize_currency_markers(text)
    text = filter_ascii(text)
    text = normalize_terminal_punctuation(text)
    return normalize_freeform_text(text)


def preprocess_batch(raw_texts: list[str]) -> list[str]:
    return [preprocess_text(t) for t in raw_texts]


def preprocess_for_model(
    raw_text: str,
    org_candidates: Optional[list[str]] = None,
    *,
    org_sep: str = ORG_CANDIDATE_SEP,
    tokenizer_sep: str = " [SEP] ",
) -> str:
    text = preprocess_text(raw_text)
    if not org_candidates:
        return text
    return f"{org_sep.join(org_candidates)}{tokenizer_sep}{text}"


if __name__ == "__main__":
    _samples = (
        "Rs 100",
        "RS.500",
        "inr100",
        "Rs1",
        "Rsuper",
        "INRing",
        "₹500",
        "₩1000",
        "99 Rs",
        "50 rupees",
        "pay 1 rupee",
    )
    for s in _samples:
        out = preprocess_text(s)
        print(ascii(s), "->", ascii(out))