# Spaces: j-js / TradingGameAI (Running)
# File size: 8,267 Bytes
# e7c7270

from __future__ import annotations

import re
import unicodedata
from typing import Dict


# Maps math / typography symbols to plain-ASCII tokens.
#
# Operators are padded with spaces on both sides so that downstream regexes
# can rely on whitespace-delimited tokens; word-like tokens (" approx ",
# " sqrt ", ...) follow the same convention.
SYMBOL_REPLACEMENTS: Dict[str, str] = {
    # Equality / comparison
    "=": "=",
    # Padded on both sides like the other comparison operators and like
    # TEXT_REPLACEMENTS["not equal to"], so "a≠b" yields "a != b", not "a !=b".
    "≠": " != ",
    "≈": " approx ",
    "~": " approx ",
    "≡": " equivalent ",
    "≜": " = ",
    ":=": " = ",
    ">": " > ",
    "<": " < ",
    "≥": " >= ",
    "≤": " <= ",
    "≪": " << ",
    "≫": " >> ",

    # Arithmetic operators (several dash look-alikes all mean "minus")
    "+": " + ",
    "−": " - ",
    "–": " - ",
    "—": " - ",
    "-": " - ",
    "‒": " - ",
    "±": " plus_minus ",
    "∓": " minus_plus ",
    "*": " * ",
    "×": " * ",
    "⋅": " * ",
    "·": " * ",
    "÷": " / ",
    "/": " / ",
    "∕": " / ",
    "⁄": " / ",

    # Brackets / grouping
    "[": "(",
    "]": ")",
    "{": " { ",
    "}": " } ",
    "⌊": " floor(",
    "⌋": ")",
    "⌈": " ceil(",
    "⌉": ")",

    # Powers / roots (the superscript digits are also listed in SUPERSCRIPT_MAP)
    "^": "^",
    "²": "^2",
    "³": "^3",
    "⁴": "^4",
    "⁵": "^5",
    "⁶": "^6",
    "⁷": "^7",
    "⁸": "^8",
    "⁹": "^9",
    "⁰": "^0",
    "¹": "^1",
    "√": " sqrt ",
    "∛": " cbrt ",
    "∜": " fourth_root ",

    # Percent / rates
    "%": " percent ",
    "‰": " permille ",
    "‱": " permyriad ",

    # Geometry
    "∠": " angle ",
    "∟": " right_angle ",
    "°": " degrees ",
    "′": " prime ",
    "″": " double_prime ",
    "⊥": " perpendicular ",
    "∥": " parallel ",
    "≅": " congruent ",
    "Δ": " triangle ",
    "△": " triangle ",
    "π": " pi ",

    # Algebra / calculus-ish
    "∞": " infinity ",
    "∝": " proportional_to ",
    "∆": " delta ",
    "∑": " sum ",
    "∏": " product ",
    "∫": " integral ",

    # Probability / sets
    "∩": " intersection ",
    "∪": " union ",
    "⊆": " subseteq ",
    "⊂": " subset ",
    "∈": " in ",
    "∉": " not_in ",
    "∅": " empty_set ",
    "|": " | ",

    # Common OCR / typography junk
    "“": '"',
    "”": '"',
    "‘": "'",
    "’": "'",
    "…": "...",
    "\u00a0": " ",   # non-breaking space
}


# Maps verbal math phrases to operator / keyword tokens.
#
# `_replace_text_phrases` applies these as whole-word, case-insensitive
# substitutions, longest phrase first, so overlapping entries such as
# "greater than or equal to" and "greater than" can coexist here.
# Entries whose value is the key padded with spaces (e.g. "mean") change no
# wording; they only force whitespace around the keyword.
TEXT_REPLACEMENTS: Dict[str, str] = {
    # Verbal math phrases -> more parseable forms
    "divided by": " / ",
    "multiplied by": " * ",
    "times": " * ",
    "plus": " + ",
    "minus": " - ",
    "equals": " = ",
    "is equal to": " = ",
    # Comparisons, with and without a leading "is"
    "is greater than or equal to": " >= ",
    "is less than or equal to": " <= ",
    "greater than or equal to": " >= ",
    "less than or equal to": " <= ",
    "greater than": " > ",
    "less than": " < ",
    "not equal to": " != ",
    "approximately equal to": " approx ",
    "approx equal to": " approx ",
    # Powers and roots
    "squared": "^2",
    "cubed": "^3",
    "square root of": " sqrt ",
    "cube root of": " cbrt ",
    "to the power of": "^",
    "raised to the power of": "^",
    # Percentages
    "percent": " percent ",
    "per cent": " percent ",
    "percentage": " percent ",
    # Number-theory vocabulary
    "remainder when": " remainder ",
    "is divisible by": " divisible_by ",
    "divisible by": " divisible_by ",
    "is a multiple of": " multiple_of ",
    "multiple of": " multiple_of ",
    "factor of": " factor_of ",
    "prime number": " prime ",
    "consecutive integers": " consecutive_integers ",
    "positive integer": " positive_integer ",
    "negative integer": " negative_integer ",
    # Inequality wording
    "at least": " >= ",
    "at most": " <= ",
    "no more than": " <= ",
    "no less than": " >= ",
    "more than": " > ",
    "fewer than": " < ",
    # Probability / statistics keywords
    "probability of": " probability ",
    "mean": " mean ",
    "average": " average ",
    "median": " median ",
    "mode": " mode ",
    "standard deviation": " standard_deviation ",
    "variance": " variance ",
    # Geometry keywords
    "perimeter": " perimeter ",
    "area": " area ",
    "volume": " volume ",
    "circumference": " circumference ",
    "radius": " radius ",
    "diameter": " diameter ",
    # Ratios and aggregate operations
    "ratio of": " ratio ",
    "ratio": " ratio ",
    "proportion": " proportion ",
    "sum of": " sum ",
    "difference between": " difference ",
    "product of": " product ",
    "quotient of": " quotient ",
}


# Single-glyph vulgar fractions and their ASCII "numerator/denominator" form.
# Listed as (glyph, numerator, denominator) so the ASCII text is generated
# rather than typed by hand.
UNICODE_FRACTIONS: Dict[str, str] = {
    glyph: f"{numerator}/{denominator}"
    for glyph, numerator, denominator in (
        ("½", 1, 2),
        ("⅓", 1, 3),
        ("⅔", 2, 3),
        ("¼", 1, 4),
        ("¾", 3, 4),
        ("⅕", 1, 5),
        ("⅖", 2, 5),
        ("⅗", 3, 5),
        ("⅘", 4, 5),
        ("⅙", 1, 6),
        ("⅚", 5, 6),
        ("⅐", 1, 7),
        ("⅛", 1, 8),
        ("⅜", 3, 8),
        ("⅝", 5, 8),
        ("⅞", 7, 8),
        ("⅑", 1, 9),
        ("⅒", 1, 10),
    )
}


# Superscript digits and signs paired position-by-position with their ASCII
# equivalents.
SUPERSCRIPT_MAP: Dict[str, str] = dict(
    zip(
        "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻",
        "0123456789+-",
    )
)

# Subscript digits and signs paired position-by-position with their ASCII
# equivalents.
SUBSCRIPT_MAP: Dict[str, str] = dict(
    zip(
        "₀₁₂₃₄₅₆₇₈₉₊₋",
        "0123456789+-",
    )
)


def _replace_unicode_fractions(text: str) -> str:
    """Expand single-glyph fractions from UNICODE_FRACTIONS into ASCII.

    A glyph on its own becomes ``n/d`` ("½" -> "1/2").  A glyph written
    directly after an integer is a mixed number, so it is emitted as an
    explicit sum: "2½" -> "(2+1/2)".  Plain concatenation would produce
    "21/2", which reads as twenty-one halves.

    Args:
        text: Input that may contain fraction glyphs.

    Returns:
        The text with every known fraction glyph expanded.
    """
    if not UNICODE_FRACTIONS:
        return text

    glyph_alternatives = "|".join(re.escape(glyph) for glyph in UNICODE_FRACTIONS)
    # Optional run of digits immediately before the glyph = whole part.
    pattern = re.compile(rf"(\d+)?({glyph_alternatives})")

    def _expand(match: re.Match) -> str:
        whole, glyph = match.group(1), match.group(2)
        fraction = UNICODE_FRACTIONS[glyph]
        if whole is None:
            return fraction
        # Parenthesised so the mixed number keeps its value inside a larger
        # expression such as "3 × 2½".
        return f"({whole}+{fraction})"

    return pattern.sub(_expand, text)


def _replace_superscripts_and_subscripts(text: str) -> str:
    """Rewrite runs of superscript/subscript characters as ``^...`` / ``_...``.

    Each maximal run of characters found in SUPERSCRIPT_MAP becomes "^"
    followed by the mapped ASCII characters; each maximal run found in
    SUBSCRIPT_MAP becomes "_" followed by the mapped characters.  All other
    characters are copied through unchanged.
    """
    pieces = []
    current_run = None  # "^" or "_" while inside a run, None otherwise

    for ch in text:
        if ch in SUPERSCRIPT_MAP:
            marker, plain = "^", SUPERSCRIPT_MAP[ch]
        elif ch in SUBSCRIPT_MAP:
            marker, plain = "_", SUBSCRIPT_MAP[ch]
        else:
            marker, plain = None, ch

        # Emit the marker only once, at the start of a new run.
        if marker is not None and marker != current_run:
            pieces.append(marker)
        pieces.append(plain)
        current_run = marker

    return "".join(pieces)


# ASCII operators made of characters that are also table keys on their own.
# They must be matched as a unit; otherwise ">=" would be rewritten to
# " > =" while the equivalent "≥" yields " >= ".
_ASCII_COMPOUND_OPERATORS = {
    ">=": " >= ",
    "<=": " <= ",
    "<<": " << ",
    ">>": " >> ",
}


def _replace_symbol_chars(text: str) -> str:
    """Replace every symbol listed in SYMBOL_REPLACEMENTS with its ASCII token.

    The substitution is a single left-to-right pass that prefers the longest
    key at each position.  Compared with chained ``str.replace`` calls this:

    * keeps multi-character operators (":=", ">=", "<=", "<<", ">>") intact
      instead of splitting them into their single-character parts, and
    * never re-scans text that was itself produced by a replacement.

    Args:
        text: Input text.

    Returns:
        The text with all known symbols replaced.
    """
    # Entries in SYMBOL_REPLACEMENTS take precedence if they define the
    # same key as a built-in compound operator.
    table = {**_ASCII_COMPOUND_OPERATORS, **SYMBOL_REPLACEMENTS}
    longest_first = sorted(table, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(symbol) for symbol in longest_first))
    return pattern.sub(lambda match: table[match.group(0)], text)


def _replace_text_phrases(text: str) -> str:
    """Substitute the verbal phrases in TEXT_REPLACEMENTS with their tokens.

    Matching is whole-word and case-insensitive.  Phrases are tried from
    longest to shortest so that a phrase such as "greater than or equal to"
    is consumed before its prefix "greater than".
    """
    phrases_longest_first = sorted(TEXT_REPLACEMENTS, key=len, reverse=True)
    for phrase in phrases_longest_first:
        whole_word = r"\b" + re.escape(phrase) + r"\b"
        text = re.sub(
            whole_word,
            TEXT_REPLACEMENTS[phrase],
            text,
            flags=re.IGNORECASE,
        )
    return text


def _normalize_roots(text: str) -> str:
    # "sqrt 9" -> "sqrt(9)"
    text = re.sub(r"\bsqrt\s+([a-z0-9\(\)\/\+\-\*\.]+)", r"sqrt(\1)", text, flags=re.I)
    text = re.sub(r"\bcbrt\s+([a-z0-9\(\)\/\+\-\*\.]+)", r"cbrt(\1)", text, flags=re.I)
    return text


def _normalize_percent_expressions(text: str) -> str:
    # "25 percent of 80" -> "(25/100) * 80"
    text = re.sub(
        r"(\d+(?:\.\d+)?)\s*percent\s+of\s+(\d+(?:\.\d+)?)",
        r"(\1/100) * \2",
        text,
        flags=re.I,
    )

    # "x percent" -> "(x/100)"
    text = re.sub(
        r"(\d+(?:\.\d+)?)\s*percent\b",
        r"(\1/100)",
        text,
        flags=re.I,
    )

    # per-mille
    text = re.sub(
        r"(\d+(?:\.\d+)?)\s*permille\b",
        r"(\1/1000)",
        text,
        flags=re.I,
    )
    return text


def _normalize_multiplication_spacing(text: str) -> str:
    # 5x -> 5*x
    text = re.sub(r"(\d)([a-zA-Z])", r"\1*\2", text)
    # )x -> )*x
    text = re.sub(r"(\))([a-zA-Z0-9])", r"\1*\2", text)
    # x( -> x*(
    text = re.sub(r"([a-zA-Z0-9])(\()", r"\1*\2", text)
    return text


def normalize_math_text(text: str) -> str:
    """Normalize free-form math text into a plain, space-delimited form.

    Steps, in order: expand fraction glyphs, rewrite superscript/subscript
    runs, apply Unicode NFKC folding, replace symbols, replace verbal
    phrases, normalize roots and percent expressions, make implicit
    multiplication explicit, and finally collapse whitespace.

    Args:
        text: Raw input; ``None`` or an empty string yields ``""``.

    Returns:
        The normalized text with single spaces and no leading/trailing
        whitespace.
    """
    if not text:
        return ""

    # These two passes must see the original glyphs: NFKC folds superscript
    # and subscript digits into ordinary digits ("x²" -> "x2"), which would
    # silently drop the exponent/index before it could be rewritten.
    text = _replace_unicode_fractions(text)
    text = _replace_superscripts_and_subscripts(text)

    # Fold the remaining compatibility characters to their canonical forms.
    text = unicodedata.normalize("NFKC", text)

    text = _replace_symbol_chars(text)
    text = _replace_text_phrases(text)
    text = _normalize_roots(text)
    text = _normalize_percent_expressions(text)
    text = _normalize_multiplication_spacing(text)

    # normalize repeated spaces
    text = re.sub(r"\s+", " ", text).strip()

    return text


def normalize_for_solver(text: str) -> str:
    """Normalize text and apply solver-oriented aliases.

    After `normalize_math_text`, the standalone token ``pi`` is replaced by
    its numeric value and the standalone token ``approx`` by ``~``.

    Args:
        text: Raw input text.

    Returns:
        The normalized text with the aliases applied.
    """
    text = normalize_math_text(text)

    # Whole-token matches only: a plain substring replace would also rewrite
    # the "pi" inside words such as "spin" or "capital", and the "approx"
    # inside "approximately".
    text = re.sub(r"\bpi\b", "3.141592653589793", text)
    text = re.sub(r"\bapprox\b", "~", text)

    return text


def normalize_for_parser(text: str) -> str:
    """Normalize text for the router/parser.

    This is `normalize_math_text` with nothing added: semantic tokens are
    left in place rather than being aliased.
    """
    return normalize_math_text(text)