Spaces:
Running
Running
| from __future__ import annotations | |
| import re | |
| import unicodedata | |
| from typing import Dict | |
# Maps a math symbol (or ASCII operator) to the token the normalizer emits for it.
#
# Non-ASCII keys are written as \u escapes so the table survives any re-encoding
# of this file: the previous literal glyphs had been decoded with the wrong
# codec, which made distinct symbols collapse into identical garbled keys (later
# entries silently overwrote earlier ones).
#
# Entry order is significant for callers that apply the table sequentially:
# ASCII "<" / ">" come before the symbols whose replacements contain them, so
# the inserted " >= " / " << " text is not re-spaced afterwards.
SYMBOL_REPLACEMENTS: Dict[str, str] = {
    # Equality / comparison
    "=": "=",
    "\u2260": " !=",  # NOT EQUAL TO; keep spaced form easy to regex
    "\u2248": " approx ",  # ALMOST EQUAL TO
    "~": " approx ",
    "\u2261": " equivalent ",  # IDENTICAL TO
    # NOTE(review): the original glyph for this slot was unrecoverable;
    # COLON EQUALS is the best match for its position next to ":=" - confirm.
    "\u2254": " = ",  # COLON EQUALS
    ":=": " = ",
    ">": " > ",
    "<": " < ",
    "\u2265": " >= ",  # GREATER-THAN OR EQUAL TO
    "\u2264": " <= ",  # LESS-THAN OR EQUAL TO
    "\u226a": " << ",  # MUCH LESS-THAN
    "\u226b": " >> ",  # MUCH GREATER-THAN
    # Arithmetic operators
    "+": " + ",
    "\u2212": " - ",  # MINUS SIGN
    "\u2013": " - ",  # EN DASH
    "\u2014": " - ",  # EM DASH
    "-": " - ",
    # NOTE(review): the original had one more dash-like key here that could not
    # be identified; every remaining dash-like code point is covered instead.
    "\u2010": " - ",  # HYPHEN
    "\u2011": " - ",  # NON-BREAKING HYPHEN
    "\u2012": " - ",  # FIGURE DASH
    "\u2015": " - ",  # HORIZONTAL BAR
    "\u00b1": " plus_minus ",  # PLUS-MINUS SIGN
    "\u2213": " minus_plus ",  # MINUS-OR-PLUS SIGN
    "*": " * ",
    "\u00d7": " * ",  # MULTIPLICATION SIGN
    "\u22c5": " * ",  # DOT OPERATOR
    "\u00b7": " * ",  # MIDDLE DOT
    "\u00f7": " / ",  # DIVISION SIGN
    "/": " / ",
    "\u2044": " / ",  # FRACTION SLASH
    "\u2215": " / ",  # DIVISION SLASH
    # Brackets / grouping
    "[": "(",
    "]": ")",
    "{": " { ",
    "}": " } ",
    "\u230a": " floor(",  # LEFT FLOOR
    "\u230b": ")",  # RIGHT FLOOR
    "\u2308": " ceil(",  # LEFT CEILING
    "\u2309": ")",  # RIGHT CEILING
    # Powers / roots
    "^": "^",
    "\u00b2": "^2",  # SUPERSCRIPT TWO
    "\u00b3": "^3",  # SUPERSCRIPT THREE
    "\u2074": "^4",  # SUPERSCRIPT FOUR
    "\u2075": "^5",  # SUPERSCRIPT FIVE
    "\u2076": "^6",  # SUPERSCRIPT SIX
    "\u2077": "^7",  # SUPERSCRIPT SEVEN
    "\u2078": "^8",  # SUPERSCRIPT EIGHT
    "\u2079": "^9",  # SUPERSCRIPT NINE
    "\u2070": "^0",  # SUPERSCRIPT ZERO
    "\u00b9": "^1",  # SUPERSCRIPT ONE
    "\u221a": " sqrt ",  # SQUARE ROOT
    "\u221b": " cbrt ",  # CUBE ROOT
    "\u221c": " fourth_root ",  # FOURTH ROOT
    # Percent / rates
    "%": " percent ",
    "\u2030": " permille ",  # PER MILLE SIGN
    "\u2031": " permyriad ",  # PER TEN THOUSAND SIGN
    # Geometry
    "\u2220": " angle ",  # ANGLE
    "\u221f": " right_angle ",  # RIGHT ANGLE
    "\u00b0": " degrees ",  # DEGREE SIGN
    "\u2032": " prime ",  # PRIME
    "\u2033": " double_prime ",  # DOUBLE PRIME
    "\u22a5": " perpendicular ",  # UP TACK
    "\u2225": " parallel ",  # PARALLEL TO
    "\u2245": " congruent ",  # APPROXIMATELY EQUAL TO
    "\u0394": " triangle ",  # GREEK CAPITAL LETTER DELTA
    "\u25b3": " triangle ",  # WHITE UP-POINTING TRIANGLE
    "\u03c0": " pi ",  # GREEK SMALL LETTER PI
    # Algebra / calculus-ish
    "\u221e": " infinity ",  # INFINITY
    "\u221d": " proportional_to ",  # PROPORTIONAL TO
    "\u2206": " delta ",  # INCREMENT
    "\u2211": " sum ",  # N-ARY SUMMATION
    "\u220f": " product ",  # N-ARY PRODUCT
    "\u222b": " integral ",  # INTEGRAL
    # Probability / sets
    "\u2229": " intersection ",  # INTERSECTION
    "\u222a": " union ",  # UNION
    "\u2286": " subseteq ",  # SUBSET OF OR EQUAL TO
    "\u2282": " subset ",  # SUBSET OF
    "\u2208": " in ",  # ELEMENT OF
    "\u2209": " not_in ",  # NOT AN ELEMENT OF
    "\u2205": " empty_set ",  # EMPTY SET
    "|": " | ",
    # Common OCR / typography junk
    "\u201c": '"',  # LEFT DOUBLE QUOTATION MARK
    "\u201d": '"',  # RIGHT DOUBLE QUOTATION MARK
    "\u2018": "'",  # LEFT SINGLE QUOTATION MARK
    "\u2019": "'",  # RIGHT SINGLE QUOTATION MARK
    "\u2026": "...",  # HORIZONTAL ELLIPSIS
    "\u00a0": " ",  # non-breaking space
}
# Maps an English math phrase to the token the normalizer emits for it.
#
# Keys are plain lower-case phrases. Where both a long and a short form of a
# phrase exist ("is greater than or equal to" / "greater than"), the consumer
# is expected to try longer keys first.
#
# The "is greater than", "is less than" and "is not equal to" variants exist
# for the same reason as "is equal to": without them the comparison is
# rewritten but a stray "is" is left in front of the operator.
TEXT_REPLACEMENTS: Dict[str, str] = {
    # Verbal math phrases -> more parseable forms
    "divided by": " / ",
    "multiplied by": " * ",
    "times": " * ",
    "plus": " + ",
    "minus": " - ",
    "equals": " = ",
    "is equal to": " = ",
    "is greater than or equal to": " >= ",
    "is less than or equal to": " <= ",
    "greater than or equal to": " >= ",
    "less than or equal to": " <= ",
    "is greater than": " > ",
    "is less than": " < ",
    "greater than": " > ",
    "less than": " < ",
    "is not equal to": " != ",
    "not equal to": " != ",
    "approximately equal to": " approx ",
    "approx equal to": " approx ",
    "squared": "^2",
    "cubed": "^3",
    "square root of": " sqrt ",
    "cube root of": " cbrt ",
    "to the power of": "^",
    "raised to the power of": "^",
    "percent": " percent ",
    "per cent": " percent ",
    "percentage": " percent ",
    "remainder when": " remainder ",
    "is divisible by": " divisible_by ",
    "divisible by": " divisible_by ",
    "is a multiple of": " multiple_of ",
    "multiple of": " multiple_of ",
    "factor of": " factor_of ",
    "prime number": " prime ",
    "consecutive integers": " consecutive_integers ",
    "positive integer": " positive_integer ",
    "negative integer": " negative_integer ",
    "at least": " >= ",
    "at most": " <= ",
    "no more than": " <= ",
    "no less than": " >= ",
    "more than": " > ",
    "fewer than": " < ",
    "probability of": " probability ",
    # Single keywords map to themselves; the padding only isolates the token.
    "mean": " mean ",
    "average": " average ",
    "median": " median ",
    "mode": " mode ",
    "standard deviation": " standard_deviation ",
    "variance": " variance ",
    "perimeter": " perimeter ",
    "area": " area ",
    "volume": " volume ",
    "circumference": " circumference ",
    "radius": " radius ",
    "diameter": " diameter ",
    "ratio of": " ratio ",
    "ratio": " ratio ",
    "proportion": " proportion ",
    "sum of": " sum ",
    "difference between": " difference ",
    "product of": " product ",
    "quotient of": " quotient ",
}
# Maps each single-character vulgar fraction to its ASCII "numerator/denominator"
# form. Keys are \u escapes so they cannot be damaged by re-encoding the file;
# the previous literal glyphs had collapsed into a handful of identical garbled
# keys, so most entries overwrote one another.
UNICODE_FRACTIONS: Dict[str, str] = {
    "\u00bd": "1/2",  # VULGAR FRACTION ONE HALF
    "\u2153": "1/3",  # VULGAR FRACTION ONE THIRD
    "\u2154": "2/3",  # VULGAR FRACTION TWO THIRDS
    "\u00bc": "1/4",  # VULGAR FRACTION ONE QUARTER
    "\u00be": "3/4",  # VULGAR FRACTION THREE QUARTERS
    "\u2155": "1/5",  # VULGAR FRACTION ONE FIFTH
    "\u2156": "2/5",  # VULGAR FRACTION TWO FIFTHS
    "\u2157": "3/5",  # VULGAR FRACTION THREE FIFTHS
    "\u2158": "4/5",  # VULGAR FRACTION FOUR FIFTHS
    "\u2159": "1/6",  # VULGAR FRACTION ONE SIXTH
    "\u215a": "5/6",  # VULGAR FRACTION FIVE SIXTHS
    "\u2150": "1/7",  # VULGAR FRACTION ONE SEVENTH
    "\u215b": "1/8",  # VULGAR FRACTION ONE EIGHTH
    "\u215c": "3/8",  # VULGAR FRACTION THREE EIGHTHS
    "\u215d": "5/8",  # VULGAR FRACTION FIVE EIGHTHS
    "\u215e": "7/8",  # VULGAR FRACTION SEVEN EIGHTHS
    "\u2151": "1/9",  # VULGAR FRACTION ONE NINTH
    "\u2152": "1/10",  # VULGAR FRACTION ONE TENTH
}
# Maps each superscript character to its plain ASCII counterpart. Keys are \u
# escapes and exactly one code point long, which the per-character membership
# test in the superscript/subscript pass relies on (the previous garbled keys
# were multi-character strings and therefore never matched).
SUPERSCRIPT_MAP: Dict[str, str] = {
    "\u2070": "0",  # SUPERSCRIPT ZERO
    "\u00b9": "1",  # SUPERSCRIPT ONE
    "\u00b2": "2",  # SUPERSCRIPT TWO
    "\u00b3": "3",  # SUPERSCRIPT THREE
    "\u2074": "4",  # SUPERSCRIPT FOUR
    "\u2075": "5",  # SUPERSCRIPT FIVE
    "\u2076": "6",  # SUPERSCRIPT SIX
    "\u2077": "7",  # SUPERSCRIPT SEVEN
    "\u2078": "8",  # SUPERSCRIPT EIGHT
    "\u2079": "9",  # SUPERSCRIPT NINE
    "\u207a": "+",  # SUPERSCRIPT PLUS SIGN
    "\u207b": "-",  # SUPERSCRIPT MINUS
}
# Maps each subscript character to its plain ASCII counterpart. Keys are \u
# escapes and exactly one code point long; the previous garbled keys were
# almost all the same string, so only the last few entries survived.
SUBSCRIPT_MAP: Dict[str, str] = {
    "\u2080": "0",  # SUBSCRIPT ZERO
    "\u2081": "1",  # SUBSCRIPT ONE
    "\u2082": "2",  # SUBSCRIPT TWO
    "\u2083": "3",  # SUBSCRIPT THREE
    "\u2084": "4",  # SUBSCRIPT FOUR
    "\u2085": "5",  # SUBSCRIPT FIVE
    "\u2086": "6",  # SUBSCRIPT SIX
    "\u2087": "7",  # SUBSCRIPT SEVEN
    "\u2088": "8",  # SUBSCRIPT EIGHT
    "\u2089": "9",  # SUBSCRIPT NINE
    "\u208a": "+",  # SUBSCRIPT PLUS SIGN
    "\u208b": "-",  # SUBSCRIPT MINUS
}
def _replace_unicode_fractions(text: str) -> str:
    """Expand every key of ``UNICODE_FRACTIONS`` found in *text*.

    A fraction glyph standing on its own is replaced by its mapped value
    (e.g. ``"1/2"``). A glyph written directly after an integer is a mixed
    number, so the pair is emitted as ``"(whole + n/d)"``; plain substitution
    would glue the digits together and turn "3 and a half" into ``"31/2"``.

    Args:
        text: Input string; returned unchanged when it is empty or the
            table has no usable keys.

    Returns:
        The text with all fraction glyphs expanded.
    """
    glyphs = sorted((g for g in UNICODE_FRACTIONS if g), key=len, reverse=True)
    if not text or not glyphs:
        return text

    # Optional leading integer + one glyph; longest glyph first so that a key
    # which is a prefix of another key cannot shadow it.
    finder = re.compile(
        r"(\d+)?(" + "|".join(re.escape(g) for g in glyphs) + r")"
    )

    def _expand(found) -> str:
        whole = found.group(1)
        fraction = UNICODE_FRACTIONS[found.group(2)]
        if whole is None:
            return fraction
        # Parenthesised so the sum keeps its value next to * or / operators.
        return f"({whole} + {fraction})"

    return finder.sub(_expand, text)
def _replace_superscripts_and_subscripts(text: str) -> str:
    """Rewrite runs of superscript / subscript characters in ASCII form.

    Each maximal run of characters found in ``SUPERSCRIPT_MAP`` becomes
    ``"^"`` followed by the mapped characters; each maximal run found in
    ``SUBSCRIPT_MAP`` becomes ``"_"`` followed by the mapped characters.
    Every other character is copied through unchanged.
    """
    # Superscripts are listed first, so they win if a character is in both maps.
    tables = (("^", SUPERSCRIPT_MAP), ("_", SUBSCRIPT_MAP))
    pieces = []
    pos = 0
    end = len(text)
    while pos < end:
        for marker, table in tables:
            if text[pos] not in table:
                continue
            run_start = pos
            # Swallow the whole run so it shares a single marker.
            while pos < end and text[pos] in table:
                pos += 1
            pieces.append(marker + "".join(table[c] for c in text[run_start:pos]))
            break
        else:
            pieces.append(text[pos])
            pos += 1
    return "".join(pieces)
# ASCII comparison digraphs that must be treated as one operator. Their parts
# ("<", ">") are keys of SYMBOL_REPLACEMENTS that get padded with spaces, so
# replacing character by character would split ">=" into "> =". The values
# are the same spaced tokens the symbol table emits for the single-glyph forms.
_ASCII_COMPARISON_DIGRAPHS: Dict[str, str] = {
    ">=": " >= ",
    "<=": " <= ",
    "<<": " << ",
    ">>": " >> ",
}


def _replace_symbol_chars(text: str) -> str:
    """Replace every key of ``SYMBOL_REPLACEMENTS`` found in *text*.

    The substitution is done in a single left-to-right pass that prefers the
    longest key at each position. Compared with calling ``str.replace`` once
    per key this keeps multi-character ASCII operators (``>=``, ``<=``,
    ``<<``, ``>>``, ``:=``) intact instead of padding their individual
    characters, and it never re-scans text that a replacement produced.

    Entries in ``SYMBOL_REPLACEMENTS`` take precedence over the built-in
    digraph defaults when both define the same key.

    Args:
        text: Input string; returned unchanged when empty.

    Returns:
        The text with all known symbols replaced by their tokens.
    """
    table = {**_ASCII_COMPARISON_DIGRAPHS, **SYMBOL_REPLACEMENTS}
    # Longest first: regex alternation takes the first branch that matches.
    keys = sorted((key for key in table if key), key=len, reverse=True)
    if not text or not keys:
        return text
    matcher = re.compile("|".join(re.escape(key) for key in keys))
    return matcher.sub(lambda found: table[found.group(0)], text)
def _replace_text_phrases(text: str) -> str:
    """Substitute every phrase of ``TEXT_REPLACEMENTS`` that occurs in *text*.

    Phrases are matched case-insensitively and only on word boundaries.
    """
    # Longest phrases go first so that "greater than or equal to" is consumed
    # before its shorter prefix "greater than" gets a chance to match.
    phrases_by_length = sorted(TEXT_REPLACEMENTS, key=len, reverse=True)
    for phrase in phrases_by_length:
        whole_phrase = re.compile(r"\b" + re.escape(phrase) + r"\b", re.IGNORECASE)
        text = whole_phrase.sub(TEXT_REPLACEMENTS[phrase], text)
    return text
| def _normalize_roots(text: str) -> str: | |
| # "sqrt 9" -> "sqrt(9)" | |
| text = re.sub(r"\bsqrt\s+([a-z0-9\(\)\/\+\-\*\.]+)", r"sqrt(\1)", text, flags=re.I) | |
| text = re.sub(r"\bcbrt\s+([a-z0-9\(\)\/\+\-\*\.]+)", r"cbrt(\1)", text, flags=re.I) | |
| return text | |
| def _normalize_percent_expressions(text: str) -> str: | |
| # "25 percent of 80" -> "(25/100) * 80" | |
| text = re.sub( | |
| r"(\d+(?:\.\d+)?)\s*percent\s+of\s+(\d+(?:\.\d+)?)", | |
| r"(\1/100) * \2", | |
| text, | |
| flags=re.I, | |
| ) | |
| # "x percent" -> "(x/100)" | |
| text = re.sub( | |
| r"(\d+(?:\.\d+)?)\s*percent\b", | |
| r"(\1/100)", | |
| text, | |
| flags=re.I, | |
| ) | |
| # per-mille | |
| text = re.sub( | |
| r"(\d+(?:\.\d+)?)\s*permille\b", | |
| r"(\1/1000)", | |
| text, | |
| flags=re.I, | |
| ) | |
| return text | |
| def _normalize_multiplication_spacing(text: str) -> str: | |
| # 5x -> 5*x | |
| text = re.sub(r"(\d)([a-zA-Z])", r"\1*\2", text) | |
| # )x -> )*x | |
| text = re.sub(r"(\))([a-zA-Z0-9])", r"\1*\2", text) | |
| # x( -> x*( | |
| text = re.sub(r"([a-zA-Z0-9])(\()", r"\1*\2", text) | |
| return text | |
def normalize_math_text(text: str) -> str:
    """Normalize free-form math text into a regular, mostly-ASCII form.

    The passes run in this order:

    1. vulgar fractions, then superscript / subscript runs,
    2. Unicode NFKC normalization,
    3. symbol replacement, then verbal phrase replacement,
    4. root, percent and implicit-multiplication clean-up,
    5. whitespace collapsing and trimming.

    Args:
        text: Raw input; a falsy value (``""`` or ``None``) yields ``""``.

    Returns:
        The normalized string, with runs of whitespace collapsed to single
        spaces and no leading or trailing whitespace.
    """
    if not text:
        return ""
    # These two passes must see the original characters. NFKC is a
    # compatibility normalization: it folds U+00B2 SUPERSCRIPT TWO to a plain
    # "2", subscripts to plain digits and U+00BD to "1" + FRACTION SLASH + "2".
    # Running it first would make "x" + SUPERSCRIPT TWO indistinguishable from
    # "x2" and the exponent would be lost.
    text = _replace_unicode_fractions(text)
    text = _replace_superscripts_and_subscripts(text)
    # Now fold the remaining compatibility forms (full-width digits, ligatures,
    # non-breaking spaces, ...) so the later passes see canonical characters.
    text = unicodedata.normalize("NFKC", text)
    text = _replace_symbol_chars(text)
    text = _replace_text_phrases(text)
    text = _normalize_roots(text)
    text = _normalize_percent_expressions(text)
    text = _normalize_multiplication_spacing(text)
    # normalize repeated spaces
    text = re.sub(r"\s+", " ", text).strip()
    return text
def normalize_for_solver(text: str) -> str:
    """Normalize *text* and apply solver-oriented aliases.

    After ``normalize_math_text`` the stand-alone token ``pi`` is replaced
    by its decimal value and the stand-alone token ``approx`` by ``~``.

    Only whole words are replaced. A plain substring replacement would also
    rewrite the letters "pi" inside ordinary words such as "pieces" or
    "recipe", and "approx" inside "approximately".

    Args:
        text: Raw input text.

    Returns:
        The normalized text with the aliases applied.
    """
    text = normalize_math_text(text)
    # make some solver-oriented aliases
    text = re.sub(r"\bpi\b", "3.141592653589793", text)
    text = re.sub(r"\bapprox\b", "~", text)
    return text
def normalize_for_parser(text: str) -> str:
    """Return *text* as normalized by ``normalize_math_text``.

    No further aliasing is applied, so the semantic tokens are kept as they
    are for the router/parser.
    """
    return normalize_math_text(text)