# TradingGameAI / math_normalizer.py
# Hosting-page residue kept for provenance: "j-js's picture",
# commit "Create math_normalizer.py" (e7c7270, verified).
from __future__ import annotations
import re
import unicodedata
from typing import Dict
# Symbol -> ASCII/token replacements applied by _replace_symbol_chars.
#
# Non-ASCII keys are written as \u escapes (with the glyph in a trailing
# comment) so the table cannot be corrupted again by saving or copying the
# file through a wrongly detected encoding.  Every key except ":=" is a
# single code point.
SYMBOL_REPLACEMENTS: Dict[str, str] = {
    # Equality / comparison
    "=": "=",
    "\u2260": " != ",  # ≠  padded on both sides like every other operator
    "\u2248": " approx ",  # ≈
    "~": " approx ",
    "\u2261": " equivalent ",  # ≡
    "\u225c": " = ",  # ≜  (defined as)
    ":=": " = ",
    ">": " > ",
    "<": " < ",
    "\u2265": " >= ",  # ≥
    "\u2264": " <= ",  # ≤
    "\u226a": " << ",  # ≪
    "\u226b": " >> ",  # ≫
    # Arithmetic operators
    "+": " + ",
    "\u2212": " - ",  # −  minus sign
    "\u2013": " - ",  # –  en dash
    "\u2014": " - ",  # —  em dash
    "-": " - ",
    "\u2012": " - ",  # ‒  figure dash
    "\u00b1": " plus_minus ",  # ±
    "\u2213": " minus_plus ",  # ∓
    "*": " * ",
    "\u00d7": " * ",  # ×
    "\u22c5": " * ",  # ⋅  dot operator
    "\u00b7": " * ",  # ·  middle dot
    "\u00f7": " / ",  # ÷
    "/": " / ",
    "\u2215": " / ",  # ∕  division slash
    "\u2044": " / ",  # ⁄  fraction slash
    # Brackets / grouping
    "[": "(",
    "]": ")",
    "{": " { ",
    "}": " } ",
    "\u230a": " floor(",  # ⌊
    "\u230b": ")",  # ⌋
    "\u2308": " ceil(",  # ⌈
    "\u2309": ")",  # ⌉
    # Powers / roots
    "^": "^",
    "\u00b2": "^2",  # ²
    "\u00b3": "^3",  # ³
    "\u2074": "^4",  # ⁴
    "\u2075": "^5",  # ⁵
    "\u2076": "^6",  # ⁶
    "\u2077": "^7",  # ⁷
    "\u2078": "^8",  # ⁸
    "\u2079": "^9",  # ⁹
    "\u2070": "^0",  # ⁰
    "\u00b9": "^1",  # ¹
    "\u221a": " sqrt ",  # √
    "\u221b": " cbrt ",  # ∛
    "\u221c": " fourth_root ",  # ∜
    # Percent / rates
    "%": " percent ",
    "\u2030": " permille ",  # ‰
    "\u2031": " permyriad ",  # ‱
    # Geometry
    "\u2220": " angle ",  # ∠
    "\u221f": " right_angle ",  # ∟
    "\u00b0": " degrees ",  # °
    "\u2032": " prime ",  # ′
    "\u2033": " double_prime ",  # ″
    "\u22a5": " perpendicular ",  # ⊥
    "\u2225": " parallel ",  # ∥
    "\u2245": " congruent ",  # ≅
    "\u0394": " triangle ",  # Δ  Greek capital delta
    "\u25b3": " triangle ",  # △
    "\u03c0": " pi ",  # π
    # Algebra / calculus-ish
    "\u221e": " infinity ",  # ∞
    "\u221d": " proportional_to ",  # ∝
    "\u2206": " delta ",  # ∆  increment sign
    "\u2211": " sum ",  # ∑
    "\u220f": " product ",  # ∏
    "\u222b": " integral ",  # ∫
    # Probability / sets
    "\u2229": " intersection ",  # ∩
    "\u222a": " union ",  # ∪
    "\u2286": " subseteq ",  # ⊆
    "\u2282": " subset ",  # ⊂
    "\u2208": " in ",  # ∈
    "\u2209": " not_in ",  # ∉
    "\u2205": " empty_set ",  # ∅
    "|": " | ",
    # Common OCR / typography junk
    "\u201c": '"',  # left double quotation mark
    "\u201d": '"',  # right double quotation mark
    "\u2018": "'",  # left single quotation mark
    "\u2019": "'",  # right single quotation mark
    "\u2026": "...",  # horizontal ellipsis
    "\u00a0": " ",  # non-breaking space
}
# Verbal math phrases -> more parseable forms.
#
# _replace_text_phrases applies these as whole-word, case-insensitive matches,
# longest phrase first, so a phrase that contains a shorter one (for example
# "greater than or equal to" vs "greater than") is consumed before the shorter
# entry is tried.  Phrases of equal length are tried in the order listed here.
TEXT_REPLACEMENTS: Dict[str, str] = {
    # Arithmetic and comparison wording -> operator tokens
    "divided by": " / ",
    "multiplied by": " * ",
    "times": " * ",
    "plus": " + ",
    "minus": " - ",
    "equals": " = ",
    "is equal to": " = ",
    "is greater than or equal to": " >= ",
    "is less than or equal to": " <= ",
    "greater than or equal to": " >= ",
    "less than or equal to": " <= ",
    "greater than": " > ",
    "less than": " < ",
    "not equal to": " != ",
    "approximately equal to": " approx ",
    "approx equal to": " approx ",
    # Powers and roots
    "squared": "^2",
    "cubed": "^3",
    "square root of": " sqrt ",
    "cube root of": " cbrt ",
    "to the power of": "^",
    "raised to the power of": "^",
    # Percent spellings all collapse to the single token "percent"
    "percent": " percent ",
    "per cent": " percent ",
    "percentage": " percent ",
    # Number-theory wording -> underscore-joined tokens
    "remainder when": " remainder ",
    "is divisible by": " divisible_by ",
    "divisible by": " divisible_by ",
    "is a multiple of": " multiple_of ",
    "multiple of": " multiple_of ",
    "factor of": " factor_of ",
    "prime number": " prime ",
    "consecutive integers": " consecutive_integers ",
    "positive integer": " positive_integer ",
    "negative integer": " negative_integer ",
    # Bound wording -> inequality operators
    "at least": " >= ",
    "at most": " <= ",
    "no more than": " <= ",
    "no less than": " >= ",
    "more than": " > ",
    "fewer than": " < ",
    # Statistics / geometry keywords: the value is the keyword itself padded
    # with spaces (multi-word names are joined with underscores).
    "probability of": " probability ",
    "mean": " mean ",
    "average": " average ",
    "median": " median ",
    "mode": " mode ",
    "standard deviation": " standard_deviation ",
    "variance": " variance ",
    "perimeter": " perimeter ",
    "area": " area ",
    "volume": " volume ",
    "circumference": " circumference ",
    "radius": " radius ",
    "diameter": " diameter ",
    # Aggregate wording: the trailing "of" / "between" is dropped
    "ratio of": " ratio ",
    "ratio": " ratio ",
    "proportion": " proportion ",
    "sum of": " sum ",
    "difference between": " difference ",
    "product of": " product ",
    "quotient of": " quotient ",
}
# Precomposed "vulgar fraction" characters -> ASCII "numerator/denominator".
#
# Keys are \u escapes (glyph in the trailing comment) so each one is exactly
# one code point regardless of how the file is re-encoded.
UNICODE_FRACTIONS: Dict[str, str] = {
    "\u00bd": "1/2",  # ½
    "\u2153": "1/3",  # ⅓
    "\u2154": "2/3",  # ⅔
    "\u00bc": "1/4",  # ¼
    "\u00be": "3/4",  # ¾
    "\u2155": "1/5",  # ⅕
    "\u2156": "2/5",  # ⅖
    "\u2157": "3/5",  # ⅗
    "\u2158": "4/5",  # ⅘
    "\u2159": "1/6",  # ⅙
    "\u215a": "5/6",  # ⅚
    "\u2150": "1/7",  # ⅐
    "\u215b": "1/8",  # ⅛
    "\u215c": "3/8",  # ⅜
    "\u215d": "5/8",  # ⅝
    "\u215e": "7/8",  # ⅞
    "\u2151": "1/9",  # ⅑
    "\u2152": "1/10",  # ⅒
}
# Superscript digits and signs -> their plain ASCII counterparts.
#
# _replace_superscripts_and_subscripts tests membership one character at a
# time, so every key must be exactly one code point; the keys are written as
# \u escapes to keep that true however the file is re-encoded.
SUPERSCRIPT_MAP: Dict[str, str] = {
    "\u2070": "0",  # ⁰
    "\u00b9": "1",  # ¹
    "\u00b2": "2",  # ²
    "\u00b3": "3",  # ³
    "\u2074": "4",  # ⁴
    "\u2075": "5",  # ⁵
    "\u2076": "6",  # ⁶
    "\u2077": "7",  # ⁷
    "\u2078": "8",  # ⁸
    "\u2079": "9",  # ⁹
    "\u207a": "+",  # ⁺
    "\u207b": "-",  # ⁻
}
# Subscript digits and signs -> their plain ASCII counterparts.
#
# _replace_superscripts_and_subscripts tests membership one character at a
# time, so every key must be exactly one code point; the keys are written as
# \u escapes to keep that true however the file is re-encoded.
SUBSCRIPT_MAP: Dict[str, str] = {
    "\u2080": "0",  # ₀
    "\u2081": "1",  # ₁
    "\u2082": "2",  # ₂
    "\u2083": "3",  # ₃
    "\u2084": "4",  # ₄
    "\u2085": "5",  # ₅
    "\u2086": "6",  # ₆
    "\u2087": "7",  # ₇
    "\u2088": "8",  # ₈
    "\u2089": "9",  # ₉
    "\u208a": "+",  # ₊
    "\u208b": "-",  # ₋
}
def _replace_unicode_fractions(text: str) -> str:
    """Replace every key of ``UNICODE_FRACTIONS`` in *text* with its ASCII form.

    A fraction glyph written directly after a digit is a mixed number
    ("2" followed by the one-half glyph).  Substituting it blindly would fuse
    the whole part with the numerator ("21/2"), so in that position a single
    space is inserted before the ASCII fraction ("2 1/2").

    Args:
        text: Input string, possibly containing precomposed fraction glyphs.

    Returns:
        The string with all known fraction glyphs spelled out.
    """
    for glyph, ascii_fraction in UNICODE_FRACTIONS.items():
        if glyph not in text:
            continue
        # Mixed numbers first: keep the integer part separate from the numerator.
        # A callable replacement avoids re's template-escape processing.
        text = re.sub(
            rf"(?<=\d){re.escape(glyph)}",
            lambda _match, spaced=" " + ascii_fraction: spaced,
            text,
        )
        # Whatever is left is a stand-alone fraction.
        text = text.replace(glyph, ascii_fraction)
    return text
def _replace_superscripts_and_subscripts(text: str) -> str:
    """Spell out runs of superscript/subscript characters in ASCII.

    Each maximal run of characters found in ``SUPERSCRIPT_MAP`` becomes ``^``
    followed by the mapped characters; each maximal run found in
    ``SUBSCRIPT_MAP`` becomes ``_`` followed by the mapped characters.  All
    other characters are copied through unchanged.
    """
    pieces = []
    # Marker of the run the previous character belonged to ("^", "_" or None).
    previous_marker = None
    for char in text:
        if char in SUPERSCRIPT_MAP:
            marker, emitted = "^", SUPERSCRIPT_MAP[char]
        elif char in SUBSCRIPT_MAP:
            marker, emitted = "_", SUBSCRIPT_MAP[char]
        else:
            marker, emitted = None, char
        # A new run starts whenever the marker changes to "^" or "_".
        if marker is not None and marker != previous_marker:
            pieces.append(marker)
        pieces.append(emitted)
        previous_marker = marker
    return "".join(pieces)
# ASCII operators made of two characters that each have their own entry in
# SYMBOL_REPLACEMENTS.  Replaced one character at a time, ">=" would come out
# as " > =" and no longer read as one operator, so they are matched as a unit.
_ASCII_COMPOUND_OPERATORS: Dict[str, str] = {
    ">=": " >= ",
    "<=": " <= ",
    "!=": " != ",
    "<<": " << ",
    ">>": " >> ",
}


def _replace_symbol_chars(text: str) -> str:
    """Apply ``SYMBOL_REPLACEMENTS`` to *text* in a single left-to-right pass.

    At each position the longest matching key wins, and replacement text is
    never re-scanned, so the result does not depend on the order of the
    table's entries.  The two-character ASCII operators listed in
    ``_ASCII_COMPOUND_OPERATORS`` are kept intact instead of being split
    into their single-character parts.

    Args:
        text: Input string.

    Returns:
        The string with every known symbol replaced.
    """
    # Entries in SYMBOL_REPLACEMENTS take precedence over the built-in
    # compound operators if both define the same key.
    table = {**_ASCII_COMPOUND_OPERATORS, **SYMBOL_REPLACEMENTS}
    alternation = "|".join(
        re.escape(key) for key in sorted(table, key=len, reverse=True)
    )
    # Callable replacement: values are inserted literally, with no
    # backslash/group-reference processing.
    return re.sub(alternation, lambda match: table[match.group(0)], text)
def _replace_text_phrases(text: str) -> str:
    """Substitute the verbal phrases of ``TEXT_REPLACEMENTS`` in *text*.

    Phrases are matched case-insensitively on word boundaries and tried from
    longest to shortest, so "greater than or equal to" is rewritten before
    "greater than" gets a chance to match inside it.
    """
    # Sorting on negated length is stable, so equally long phrases keep the
    # order in which the table lists them.
    phrases_longest_first = sorted(TEXT_REPLACEMENTS.keys(), key=lambda phrase: -len(phrase))
    result = text
    for phrase in phrases_longest_first:
        whole_phrase = re.compile(rf"\b{re.escape(phrase)}\b", flags=re.I)
        result = whole_phrase.sub(TEXT_REPLACEMENTS[phrase], result)
    return result
def _normalize_roots(text: str) -> str:
# "sqrt 9" -> "sqrt(9)"
text = re.sub(r"\bsqrt\s+([a-z0-9\(\)\/\+\-\*\.]+)", r"sqrt(\1)", text, flags=re.I)
text = re.sub(r"\bcbrt\s+([a-z0-9\(\)\/\+\-\*\.]+)", r"cbrt(\1)", text, flags=re.I)
return text
def _normalize_percent_expressions(text: str) -> str:
# "25 percent of 80" -> "(25/100) * 80"
text = re.sub(
r"(\d+(?:\.\d+)?)\s*percent\s+of\s+(\d+(?:\.\d+)?)",
r"(\1/100) * \2",
text,
flags=re.I,
)
# "x percent" -> "(x/100)"
text = re.sub(
r"(\d+(?:\.\d+)?)\s*percent\b",
r"(\1/100)",
text,
flags=re.I,
)
# per-mille
text = re.sub(
r"(\d+(?:\.\d+)?)\s*permille\b",
r"(\1/1000)",
text,
flags=re.I,
)
return text
def _normalize_multiplication_spacing(text: str) -> str:
# 5x -> 5*x
text = re.sub(r"(\d)([a-zA-Z])", r"\1*\2", text)
# )x -> )*x
text = re.sub(r"(\))([a-zA-Z0-9])", r"\1*\2", text)
# x( -> x*(
text = re.sub(r"([a-zA-Z0-9])(\()", r"\1*\2", text)
return text
def normalize_math_text(text: str) -> str:
    """Normalize free-form math text into a compact ASCII-oriented form.

    The passes run in this order: precomposed fractions, superscripts and
    subscripts, Unicode NFKC normalization, symbol replacement, verbal-phrase
    replacement, root call form, percent expressions, explicit
    multiplication, and finally whitespace collapsing.

    Args:
        text: Raw input text.  Any falsy value (``""`` or ``None``) yields ``""``.

    Returns:
        The normalized text with runs of whitespace collapsed to single
        spaces and no leading or trailing whitespace.
    """
    if not text:
        return ""
    # These two passes must see the original characters.  NFKC is a
    # compatibility normalization: it folds a superscript two into a plain
    # "2" and a one-half glyph into "1", fraction slash, "2".  Running it first
    # would turn "x" + superscript two into "x2" and lose the exponent.
    text = _replace_unicode_fractions(text)
    text = _replace_superscripts_and_subscripts(text)
    # Fold the remaining compatibility forms (full-width characters,
    # ligatures, ...) before the table-driven passes.
    text = unicodedata.normalize("NFKC", text)
    text = _replace_symbol_chars(text)
    text = _replace_text_phrases(text)
    text = _normalize_roots(text)
    text = _normalize_percent_expressions(text)
    text = _normalize_multiplication_spacing(text)
    # normalize repeated spaces
    text = re.sub(r"\s+", " ", text).strip()
    return text
def normalize_for_solver(text: str) -> str:
    """Normalize *text* and apply solver-oriented aliases.

    After ``normalize_math_text``, the stand-alone token ``pi`` is replaced
    with its numeric value and the stand-alone token ``approx`` with ``~``.
    Both are matched as whole words, so words that merely contain those
    letters ("spin", "approximately") are left untouched.

    Args:
        text: Raw input text.

    Returns:
        The normalized text with the aliases applied.
    """
    text = normalize_math_text(text)
    # make some solver-oriented aliases (whole-word matches only)
    text = re.sub(r"\bpi\b", "3.141592653589793", text)
    text = re.sub(r"\bapprox\b", "~", text)
    return text
def normalize_for_parser(text: str) -> str:
    """Return ``normalize_math_text(text)`` unchanged.

    No solver aliases are applied here, so semantic tokens stay in place for
    the router/parser.
    """
    return normalize_math_text(text)