from __future__ import annotations import re import unicodedata from typing import Dict SYMBOL_REPLACEMENTS: Dict[str, str] = { # Equality / comparison "=": "=", "≠": " !=", # keep spaced form easy to regex "≈": " approx ", "~": " approx ", "≡": " equivalent ", "≜": " = ", ":=": " = ", ">": " > ", "<": " < ", "≥": " >= ", "≤": " <= ", "≪": " << ", "≫": " >> ", # Arithmetic operators "+": " + ", "−": " - ", "–": " - ", "—": " - ", "-": " - ", "‒": " - ", "±": " plus_minus ", "∓": " minus_plus ", "*": " * ", "×": " * ", "⋅": " * ", "·": " * ", "÷": " / ", "/": " / ", "∕": " / ", "⁄": " / ", # Brackets / grouping "[": "(", "]": ")", "{": " { ", "}": " } ", "⌊": " floor(", "⌋": ")", "⌈": " ceil(", "⌉": ")", # Powers / roots "^": "^", "²": "^2", "³": "^3", "⁴": "^4", "⁵": "^5", "⁶": "^6", "⁷": "^7", "⁸": "^8", "⁹": "^9", "⁰": "^0", "¹": "^1", "√": " sqrt ", "∛": " cbrt ", "∜": " fourth_root ", # Percent / rates "%": " percent ", "‰": " permille ", "‱": " permyriad ", # Geometry "∠": " angle ", "∟": " right_angle ", "°": " degrees ", "′": " prime ", "″": " double_prime ", "⊥": " perpendicular ", "∥": " parallel ", "≅": " congruent ", "Δ": " triangle ", "△": " triangle ", "π": " pi ", # Algebra / calculus-ish "∞": " infinity ", "∝": " proportional_to ", "∆": " delta ", "∑": " sum ", "∏": " product ", "∫": " integral ", # Probability / sets "∩": " intersection ", "∪": " union ", "⊆": " subseteq ", "⊂": " subset ", "∈": " in ", "∉": " not_in ", "∅": " empty_set ", "|": " | ", # Common OCR / typography junk "“": '"', "”": '"', "‘": "'", "’": "'", "…": "...", "\u00a0": " ", # non-breaking space } TEXT_REPLACEMENTS: Dict[str, str] = { # Verbal math phrases -> more parseable forms "divided by": " / ", "multiplied by": " * ", "times": " * ", "plus": " + ", "minus": " - ", "equals": " = ", "is equal to": " = ", "is greater than or equal to": " >= ", "is less than or equal to": " <= ", "greater than or equal to": " >= ", "less than or equal to": " <= ", "greater than": " > ", "less than": " < ", "not equal to": " != ", "approximately equal to": " approx ", "approx equal to": " approx ", "squared": "^2", "cubed": "^3", "square root of": " sqrt ", "cube root of": " cbrt ", "to the power of": "^", "raised to the power of": "^", "percent": " percent ", "per cent": " percent ", "percentage": " percent ", "remainder when": " remainder ", "is divisible by": " divisible_by ", "divisible by": " divisible_by ", "is a multiple of": " multiple_of ", "multiple of": " multiple_of ", "factor of": " factor_of ", "prime number": " prime ", "consecutive integers": " consecutive_integers ", "positive integer": " positive_integer ", "negative integer": " negative_integer ", "at least": " >= ", "at most": " <= ", "no more than": " <= ", "no less than": " >= ", "more than": " > ", "fewer than": " < ", "probability of": " probability ", "mean": " mean ", "average": " average ", "median": " median ", "mode": " mode ", "standard deviation": " standard_deviation ", "variance": " variance ", "perimeter": " perimeter ", "area": " area ", "volume": " volume ", "circumference": " circumference ", "radius": " radius ", "diameter": " diameter ", "ratio of": " ratio ", "ratio": " ratio ", "proportion": " proportion ", "sum of": " sum ", "difference between": " difference ", "product of": " product ", "quotient of": " quotient ", } UNICODE_FRACTIONS: Dict[str, str] = { "½": "1/2", "⅓": "1/3", "⅔": "2/3", "¼": "1/4", "¾": "3/4", "⅕": "1/5", "⅖": "2/5", "⅗": "3/5", "⅘": "4/5", "⅙": "1/6", "⅚": "5/6", "⅐": "1/7", "⅛": "1/8", "⅜": "3/8", "⅝": "5/8", "⅞": "7/8", "⅑": "1/9", "⅒": "1/10", } SUPERSCRIPT_MAP: Dict[str, str] = { "⁰": "0", "¹": "1", "²": "2", "³": "3", "⁴": "4", "⁵": "5", "⁶": "6", "⁷": "7", "⁸": "8", "⁹": "9", "⁺": "+", "⁻": "-", } SUBSCRIPT_MAP: Dict[str, str] = { "₀": "0", "₁": "1", "₂": "2", "₃": "3", "₄": "4", "₅": "5", "₆": "6", "₇": "7", "₈": "8", "₉": "9", "₊": "+", "₋": "-", } def _replace_unicode_fractions(text: str) -> str: for k, v in UNICODE_FRACTIONS.items(): text = text.replace(k, v) return text def _replace_superscripts_and_subscripts(text: str) -> str: out = [] i = 0 while i < len(text): ch = text[i] if ch in SUPERSCRIPT_MAP: digits = [] while i < len(text) and text[i] in SUPERSCRIPT_MAP: digits.append(SUPERSCRIPT_MAP[text[i]]) i += 1 out.append("^" + "".join(digits)) continue if ch in SUBSCRIPT_MAP: digits = [] while i < len(text) and text[i] in SUBSCRIPT_MAP: digits.append(SUBSCRIPT_MAP[text[i]]) i += 1 out.append("_" + "".join(digits)) continue out.append(ch) i += 1 return "".join(out) def _replace_symbol_chars(text: str) -> str: for k, v in SYMBOL_REPLACEMENTS.items(): text = text.replace(k, v) return text def _replace_text_phrases(text: str) -> str: # longest first so "greater than or equal to" is replaced before "greater than" for k in sorted(TEXT_REPLACEMENTS.keys(), key=len, reverse=True): text = re.sub(rf"\b{re.escape(k)}\b", TEXT_REPLACEMENTS[k], text, flags=re.I) return text def _normalize_roots(text: str) -> str: # "sqrt 9" -> "sqrt(9)" text = re.sub(r"\bsqrt\s+([a-z0-9\(\)\/\+\-\*\.]+)", r"sqrt(\1)", text, flags=re.I) text = re.sub(r"\bcbrt\s+([a-z0-9\(\)\/\+\-\*\.]+)", r"cbrt(\1)", text, flags=re.I) return text def _normalize_percent_expressions(text: str) -> str: # "25 percent of 80" -> "(25/100) * 80" text = re.sub( r"(\d+(?:\.\d+)?)\s*percent\s+of\s+(\d+(?:\.\d+)?)", r"(\1/100) * \2", text, flags=re.I, ) # "x percent" -> "(x/100)" text = re.sub( r"(\d+(?:\.\d+)?)\s*percent\b", r"(\1/100)", text, flags=re.I, ) # per-mille text = re.sub( r"(\d+(?:\.\d+)?)\s*permille\b", r"(\1/1000)", text, flags=re.I, ) return text def _normalize_multiplication_spacing(text: str) -> str: # 5x -> 5*x text = re.sub(r"(\d)([a-zA-Z])", r"\1*\2", text) # )x -> )*x text = re.sub(r"(\))([a-zA-Z0-9])", r"\1*\2", text) # x( -> x*( text = re.sub(r"([a-zA-Z0-9])(\()", r"\1*\2", text) return text def normalize_math_text(text: str) -> str: if not text: return "" text = unicodedata.normalize("NFKC", text) text = _replace_unicode_fractions(text) text = _replace_superscripts_and_subscripts(text) text = _replace_symbol_chars(text) text = _replace_text_phrases(text) text = _normalize_roots(text) text = _normalize_percent_expressions(text) text = _normalize_multiplication_spacing(text) # normalize repeated spaces text = re.sub(r"\s+", " ", text).strip() return text def normalize_for_solver(text: str) -> str: text = normalize_math_text(text) # make some solver-oriented aliases text = text.replace("pi", "3.141592653589793") text = text.replace("approx", "~") return text def normalize_for_parser(text: str) -> str: text = normalize_math_text(text) # keep semantic tokens for router/parser return text