Spaces:

j-js
/

TradingGameAI

Running

App Files Files Community

TradingGameAI / math_normalizer.py

j-js

Create math_normalizer.py

e7c7270 verified about 24 hours ago

raw

history blame contribute delete

8.27 kB

	from __future__ import annotations

	import re
	import unicodedata
	from typing import Dict


	# Single symbols (plus the ":=" digraph) mapped to ASCII-friendly tokens.
	# The table is applied entry by entry with ``str.replace``, in the order
	# written here, so ordering is significant: e.g. ">" must stay ahead of "≥",
	# otherwise the ">" inside the " >= " replacement would be rewritten again.
	SYMBOL_REPLACEMENTS: Dict[str, str] = {
	    # Equality / comparison
	    "=": "=",
	    "≠": " != ",  # spaced on both sides, matching "not equal to" in the phrase table
	    "≈": " approx ",
	    "~": " approx ",
	    "≡": " equivalent ",
	    "≜": " = ",
	    ":=": " = ",
	    ">": " > ",
	    "<": " < ",
	    "≥": " >= ",
	    "≤": " <= ",
	    "≪": " << ",
	    "≫": " >> ",

	    # Arithmetic operators (several look-alike dash glyphs all mean minus)
	    "+": " + ",
	    "−": " - ",
	    "–": " - ",
	    "—": " - ",
	    "-": " - ",
	    "‒": " - ",
	    "±": " plus_minus ",
	    "∓": " minus_plus ",
	    # NOTE(review): an entry with an empty-string key ("" -> " ") used to sit
	    # here. str.replace("", " ") inserts a space between every character, so
	    # it has been dropped. Presumably it was an invisible operator character
	    # that got lost; re-add it with an explicit "\uXXXX" escape once the
	    # intended code point is confirmed.
	    "×": " * ",
	    "⋅": " * ",
	    "·": " * ",
	    "÷": " / ",
	    "/": " / ",
	    "∕": " / ",
	    "⁄": " / ",

	    # Brackets / grouping
	    "[": "(",
	    "]": ")",
	    "{": " { ",
	    "}": " } ",
	    "⌊": " floor(",
	    "⌋": ")",
	    "⌈": " ceil(",
	    "⌉": ")",

	    # Powers / roots
	    "^": "^",
	    "²": "^2",
	    "³": "^3",
	    "⁴": "^4",
	    "⁵": "^5",
	    "⁶": "^6",
	    "⁷": "^7",
	    "⁸": "^8",
	    "⁹": "^9",
	    "⁰": "^0",
	    "¹": "^1",
	    "√": " sqrt ",
	    "∛": " cbrt ",
	    "∜": " fourth_root ",

	    # Percent / rates
	    "%": " percent ",
	    "‰": " permille ",
	    "‱": " permyriad ",

	    # Geometry
	    "∠": " angle ",
	    "∟": " right_angle ",
	    "°": " degrees ",
	    "′": " prime ",
	    "″": " double_prime ",
	    "⊥": " perpendicular ",
	    "∥": " parallel ",
	    "≅": " congruent ",
	    "Δ": " triangle ",
	    "△": " triangle ",
	    "π": " pi ",

	    # Algebra / calculus-ish
	    "∞": " infinity ",
	    "∝": " proportional_to ",
	    "∆": " delta ",
	    "∑": " sum ",
	    "∏": " product ",
	    "∫": " integral ",

	    # Probability / sets
	    "∩": " intersection ",
	    "∪": " union ",
	    "⊆": " subseteq ",
	    "⊂": " subset ",
	    "∈": " in ",
	    "∉": " not_in ",
	    "∅": " empty_set ",
	    # A plain pipe; the former "\|" spelling was an invalid escape sequence
	    # whose key was backslash + pipe and therefore never matched a bare "|".
	    "|": " | ",

	    # Common OCR / typography junk
	    "“": '"',
	    "”": '"',
	    "‘": "'",
	    "’": "'",
	    "…": "...",
	    "\u00a0": " ",  # non-breaking space
	}


	# English math phrases and the token each one is rewritten to.
	# Keys are matched case-insensitively as whole phrases by
	# _replace_text_phrases, which tries longer phrases before shorter ones.
	TEXT_REPLACEMENTS: Dict[str, str] = dict(
	    (
	        # Arithmetic verbs
	        ("divided by", " / "),
	        ("multiplied by", " * "),
	        ("times", " * "),
	        ("plus", " + "),
	        ("minus", " - "),
	        # Equality and ordering
	        ("equals", " = "),
	        ("is equal to", " = "),
	        ("is greater than or equal to", " >= "),
	        ("is less than or equal to", " <= "),
	        ("greater than or equal to", " >= "),
	        ("less than or equal to", " <= "),
	        ("greater than", " > "),
	        ("less than", " < "),
	        ("not equal to", " != "),
	        ("approximately equal to", " approx "),
	        ("approx equal to", " approx "),
	        # Powers and roots
	        ("squared", "^2"),
	        ("cubed", "^3"),
	        ("square root of", " sqrt "),
	        ("cube root of", " cbrt "),
	        ("to the power of", "^"),
	        ("raised to the power of", "^"),
	        # Percentages
	        ("percent", " percent "),
	        ("per cent", " percent "),
	        ("percentage", " percent "),
	        # Number theory vocabulary
	        ("remainder when", " remainder "),
	        ("is divisible by", " divisible_by "),
	        ("divisible by", " divisible_by "),
	        ("is a multiple of", " multiple_of "),
	        ("multiple of", " multiple_of "),
	        ("factor of", " factor_of "),
	        ("prime number", " prime "),
	        ("consecutive integers", " consecutive_integers "),
	        ("positive integer", " positive_integer "),
	        ("negative integer", " negative_integer "),
	        # Verbal bounds
	        ("at least", " >= "),
	        ("at most", " <= "),
	        ("no more than", " <= "),
	        ("no less than", " >= "),
	        ("more than", " > "),
	        ("fewer than", " < "),
	        # Statistics
	        ("probability of", " probability "),
	        ("mean", " mean "),
	        ("average", " average "),
	        ("median", " median "),
	        ("mode", " mode "),
	        ("standard deviation", " standard_deviation "),
	        ("variance", " variance "),
	        # Geometry measurements
	        ("perimeter", " perimeter "),
	        ("area", " area "),
	        ("volume", " volume "),
	        ("circumference", " circumference "),
	        ("radius", " radius "),
	        ("diameter", " diameter "),
	        # Ratios and aggregate operations
	        ("ratio of", " ratio "),
	        ("ratio", " ratio "),
	        ("proportion", " proportion "),
	        ("sum of", " sum "),
	        ("difference between", " difference "),
	        ("product of", " product "),
	        ("quotient of", " quotient "),
	    )
	)


	# Precomposed vulgar-fraction glyphs and their "numerator/denominator" spelling.
	# Written as whitespace-separated "glyph=value" items to keep each pair adjacent.
	UNICODE_FRACTIONS: Dict[str, str] = dict(
	    item.split("=", 1)
	    for item in (
	        "½=1/2 ⅓=1/3 ⅔=2/3 ¼=1/4 ¾=3/4 "
	        "⅕=1/5 ⅖=2/5 ⅗=3/5 ⅘=4/5 "
	        "⅙=1/6 ⅚=5/6 ⅐=1/7 "
	        "⅛=1/8 ⅜=3/8 ⅝=5/8 ⅞=7/8 "
	        "⅑=1/9 ⅒=1/10"
	    ).split()
	)


	# Superscript digit and sign glyphs paired positionally with their plain
	# ASCII counterparts.
	SUPERSCRIPT_MAP: Dict[str, str] = dict(
	    zip("⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻", "0123456789+-")
	)

	# Subscript digit and sign glyphs paired positionally with their plain
	# ASCII counterparts.
	SUBSCRIPT_MAP: Dict[str, str] = dict(
	    zip("₀₁₂₃₄₅₆₇₈₉₊₋", "0123456789+-")
	)


	def _replace_unicode_fractions(text: str) -> str:
	    """Spell out every glyph listed in UNICODE_FRACTIONS as ``n/d``.

	    A glyph that directly follows an ASCII digit is treated as the fractional
	    part of a mixed number and is separated from that digit by one space, so
	    "1½" becomes "1 1/2" rather than fusing into "11/2".

	    Args:
	        text: Input string, possibly containing vulgar-fraction glyphs.

	    Returns:
	        The string with each known glyph replaced by its ASCII spelling.
	    """
	    for glyph, spelled in UNICODE_FRACTIONS.items():
	        if glyph not in text:
	            continue
	        # Handle the digit-adjacent case first; a callable replacement keeps
	        # the spelled-out form from being read as a regex template.
	        text = re.sub(
	            rf"(?<=[0-9]){re.escape(glyph)}",
	            lambda _match, spelled=spelled: " " + spelled,
	            text,
	        )
	        text = text.replace(glyph, spelled)
	    return text


	def _replace_superscripts_and_subscripts(text: str) -> str:
	    """Rewrite runs of superscript/subscript glyphs as ``^...`` / ``_...``.

	    Each maximal run of characters found in SUPERSCRIPT_MAP becomes "^"
	    followed by their mapped characters; each maximal run of characters found
	    in SUBSCRIPT_MAP becomes "_" followed by theirs. All other characters are
	    copied through unchanged.
	    """
	    pieces = []
	    pos = 0
	    end = len(text)
	    # Superscripts are tested before subscripts, one run at a time.
	    tables = (("^", SUPERSCRIPT_MAP), ("_", SUBSCRIPT_MAP))

	    while pos < end:
	        for marker, table in tables:
	            if text[pos] not in table:
	                continue
	            # Extend to the end of the run of glyphs from the same table.
	            stop = pos
	            while stop < end and text[stop] in table:
	                stop += 1
	            pieces.append(marker + "".join(table[c] for c in text[pos:stop]))
	            pos = stop
	            break
	        else:
	            # Neither table knows this character: keep it as is.
	            pieces.append(text[pos])
	            pos += 1

	    return "".join(pieces)


	def _replace_symbol_chars(text: str) -> str:
	    """Apply every SYMBOL_REPLACEMENTS entry to *text*, in table order.

	    Entries are applied one after another with ``str.replace``, so a later
	    entry also sees the text produced by earlier entries.

	    Args:
	        text: Input string.

	    Returns:
	        The string after all non-empty keys have been substituted.
	    """
	    # NOTE(review): because ">" and "<" are table keys, ASCII digraphs such as
	    # ">=" are split ("x>=5" -> "x > =5"); a single-pass longest-match
	    # substitution would be needed to keep them intact.
	    for symbol, replacement in SYMBOL_REPLACEMENTS.items():
	        # str.replace("", x) inserts x between every character of the string,
	        # so an empty key must never be applied.
	        if not symbol:
	            continue
	        text = text.replace(symbol, replacement)
	    return text


	def _replace_text_phrases(text: str) -> str:
	    """Substitute each TEXT_REPLACEMENTS phrase, ignoring case.

	    Phrases are matched between word boundaries and tried from longest to
	    shortest, so "greater than or equal to" is consumed before the shorter
	    "greater than" can match inside it.
	    """
	    by_length_desc = sorted(TEXT_REPLACEMENTS, key=len, reverse=True)
	    for phrase in by_length_desc:
	        matcher = re.compile(rf"\b{re.escape(phrase)}\b", re.IGNORECASE)
	        text = matcher.sub(TEXT_REPLACEMENTS[phrase], text)
	    return text


	def _normalize_roots(text: str) -> str:
	# "sqrt 9" -> "sqrt(9)"
	text = re.sub(r"\bsqrt\s+([a-z0-9\(\)\/\+\-\*\.]+)", r"sqrt(\1)", text, flags=re.I)
	text = re.sub(r"\bcbrt\s+([a-z0-9\(\)\/\+\-\*\.]+)", r"cbrt(\1)", text, flags=re.I)
	return text


	def _normalize_percent_expressions(text: str) -> str:
	# "25 percent of 80" -> "(25/100) * 80"
	text = re.sub(
	r"(\d+(?:\.\d+)?)\s*percent\s+of\s+(\d+(?:\.\d+)?)",
	r"(\1/100) * \2",
	text,
	flags=re.I,
	)

	# "x percent" -> "(x/100)"
	text = re.sub(
	r"(\d+(?:\.\d+)?)\s*percent\b",
	r"(\1/100)",
	text,
	flags=re.I,
	)

	# per-mille
	text = re.sub(
	r"(\d+(?:\.\d+)?)\s*permille\b",
	r"(\1/1000)",
	text,
	flags=re.I,
	)
	return text


	def _normalize_multiplication_spacing(text: str) -> str:
	# 5x -> 5*x
	text = re.sub(r"(\d)([a-zA-Z])", r"\1*\2", text)
	# )x -> )*x
	text = re.sub(r"(\))([a-zA-Z0-9])", r"\1*\2", text)
	# x( -> x*(
	text = re.sub(r"([a-zA-Z0-9])(\()", r"\1*\2", text)
	return text


	def normalize_math_text(text: str) -> str:
	    """Normalise free-form math text into a plainer, more parseable string.

	    The steps run in a fixed order: vulgar fractions, super/subscript glyphs,
	    Unicode NFKC normalisation, symbol replacement, verbal phrase replacement,
	    root wrapping, percent expansion, implicit-multiplication insertion, and
	    finally whitespace collapsing.

	    Args:
	        text: Raw input text. Any falsy value (empty string, None) is accepted.

	    Returns:
	        The normalised text with runs of whitespace collapsed to one space and
	        the ends stripped; "" for falsy input.
	    """
	    if not text:
	        return ""

	    # These two steps must come before NFKC: compatibility normalisation folds
	    # "²" to a plain "2", "₂" to "2" and "½" to "1⁄2", which would erase the
	    # exponent/subscript/fraction markup before the dedicated helpers see it.
	    text = _replace_unicode_fractions(text)
	    text = _replace_superscripts_and_subscripts(text)
	    text = unicodedata.normalize("NFKC", text)
	    text = _replace_symbol_chars(text)
	    text = _replace_text_phrases(text)
	    text = _normalize_roots(text)
	    text = _normalize_percent_expressions(text)
	    text = _normalize_multiplication_spacing(text)

	    # normalize repeated spaces
	    text = re.sub(r"\s+", " ", text).strip()

	    return text


	def normalize_for_solver(text: str) -> str:
	    """Normalise *text* and substitute solver-oriented aliases.

	    After normalize_math_text, the stand-alone token ``pi`` is replaced by its
	    decimal value and the stand-alone token ``approx`` by ``~``.

	    Args:
	        text: Raw input text.

	    Returns:
	        The normalised text with the aliases applied.
	    """
	    text = normalize_math_text(text)

	    # Match whole tokens only: a plain substring replace would also rewrite
	    # the "pi" inside words such as "spin" and the "approx" inside
	    # "approximately".
	    text = re.sub(r"\bpi\b", "3.141592653589793", text)
	    text = re.sub(r"\bapprox\b", "~", text)

	    return text


	def normalize_for_parser(text: str) -> str:
	    """Return *text* normalised by normalize_math_text, with nothing added.

	    Semantic tokens are left in place for the router/parser; no aliases are
	    substituted here.
	    """
	    return normalize_math_text(text)