# mathstutor/app/core/math_normalizer.py
# Author: ghadgemadhuri92
# Commit ac078d2: Added 3-Mode Tutor architecture, Persistent Session Memory, and YOLOv8 Vision pipeline
import re
from typing import Optional, Dict, Any
from dataclasses import dataclass
import logging
logger = logging.getLogger(__name__)
@dataclass
class MathIntent:
    """Structured description of a math request extracted from free text."""

    # Kind of operation requested (the normalizer in this file emits
    # "derivative", "integral", "equation" or "arithmetic").
    intent: str
    # Mathematical expression with the surrounding prose removed.
    expression: str
    # Symbol the operation is taken with respect to; "x" unless overridden.
    variable: Optional[str] = "x"
    # Query text as supplied by the caller, before any normalization.
    original_query: str = ""
class MathQueryNormalizer:
    """
    Normalizes natural language math queries into structured intents
    resolvable by symbolic solvers.

    ``normalize`` is the entry point. It lower-cases the query, tries the
    derivative and integral patterns, then falls back to equation (text
    contains ``=``) and arithmetic detection. The remaining methods are
    private helpers.
    """

    # English filler removed from an expression wherever it appears.
    # "find x" is not listed here: it needs a look-ahead so that it does not
    # swallow the variable of a real expression (see _FIND_X_PATTERN).
    # NOTE(review): the bare "i" entry also removes a standalone imaginary
    # unit such as the one in "3*i" - confirm whether complex input matters.
    _PROSE_WORDS = (
        "what is", "what are", "the value of", "the result of",
        "please", "calculate", "compute", "evaluate", "find",
        "solve", "simplify", "determine", "the", "of", "for",
        "result", "value", "answer", "how do i", "how to", "i", "can you",
        "how do we", "we", "do", "how",
    )

    # Compiled once, longest phrase first, so that "the value of" is removed
    # before its components "the", "value" and "of" are tried.
    _PROSE_PATTERNS = tuple(
        re.compile(rf"\b{re.escape(phrase)}\b", re.IGNORECASE)
        for phrase in sorted(_PROSE_WORDS, key=len, reverse=True)
    )

    # "find x" is filler in "find x for 2x+3=7", but in "find x^2 + 1" the x
    # belongs to the expression. Only strip it when no operator follows.
    _FIND_X_PATTERN = re.compile(r"\bfind x\b(?!\s*[-+*/^=(])", re.IGNORECASE)

    # Trailing "with respect to t" / "wrt t" clause naming the variable.
    _VARIABLE_SUFFIX = re.compile(
        r"\s*\b(?:with respect to|wrt)\s+([a-z])\s*$", re.IGNORECASE
    )

    # Characters accepted by _is_arithmetic: digits, operators, whitespace
    # and a handful of algebraic variable names.
    _ARITHMETIC_CHARS = frozenset("0123456789+-*/^().= \txyzabc")

    # Query openers that force the arithmetic intent.
    _ARITHMETIC_PREFIXES = ("calculate", "what is", "evaluate")

    def __init__(self):
        # Regexes per intent; group 1 of every pattern is the raw expression.
        # Only the "derivative" and "integral" lists are consulted by
        # normalize(); the others are kept for callers that may read them.
        self.intent_patterns = {
            "derivative": [
                r"derivative of\s+(.+)",
                r"derive\s+(.+)",
                r"differentiate\s+(.+)",
                # Bracketed form first so the closing "]" is not captured
                # as part of the expression ("d/dx [x^2]" -> "x^2").
                r"d/dx\s*\[(.+)\]\s*$",
                r"d/dx\s*(.+)",
            ],
            "integral": [
                r"integral of\s+(.+)",
                r"integrate\s+(.+)",
                r"antiderivative of\s+(.+)",
            ],
            "limit": [
                r"limit of\s+(.+)\s+as\s+(\w+)\s+approaches\s+(.+)",
                r"lim\s+(.+)",
            ],
            "equation": [
                r"solve\s+(.+)",
                r"find x for\s+(.+)",  # basic
                r"(.+)=(.+)",  # implicit equation if contains =
            ],
            "arithmetic": [
                r"calculate\s+(.+)",
                r"what is\s+(.+)",
                r"evaluate\s+(.+)",
            ],
        }
        # Stop words to clean from expression. Not read by any method of
        # this class; retained as a public attribute for compatibility.
        self.stop_words = ["what is", "calculate", "the", "please", "solve", "evaluate"]

    def normalize(self, text: str) -> "Optional[MathIntent]":
        """
        Parses text to identify math intent and extract the core expression.

        Args:
            text: The user's query in natural language.

        Returns:
            A ``MathIntent`` describing the request, or ``None`` when no
            clear math intent is found or when nothing mathematical is left
            after the prose has been stripped.
        """
        if not text:
            return None
        # The second strip() drops whitespace exposed by removing "?".
        clean_text = text.lower().strip().rstrip("?").strip()

        # 1. Calculus intents, derivative patterns before integral ones.
        for intent in ("derivative", "integral"):
            raw_expr = self._match_intent(intent, clean_text)
            if raw_expr is not None:
                raw_expr, variable = self._split_variable(raw_expr)
                return self._build_intent(intent, raw_expr, text, variable)

        # 2. Equation solving: any query containing "=".
        if "=" in clean_text:
            return self._build_intent("equation", clean_text, text)

        # 3. Arithmetic / simplification: bare expressions such as "2x + 5"
        #    or queries that open with "calculate", "what is", "evaluate".
        if self._is_arithmetic(clean_text) or clean_text.startswith(
            self._ARITHMETIC_PREFIXES
        ):
            return self._build_intent("arithmetic", clean_text, text)

        return None

    def _match_intent(self, intent: str, text: str) -> Optional[str]:
        """Return group 1 of the first ``intent`` pattern found in ``text``, else None."""
        for pattern in self.intent_patterns[intent]:
            match = re.search(pattern, text)
            if match:
                return match.group(1)
        return None

    def _split_variable(self, raw_expr: str):
        """
        Separate a trailing "with respect to <v>" / "wrt <v>" clause.

        Returns:
            ``(expression, variable)``; the variable is ``'x'`` when the
            expression names none.
        """
        match = self._VARIABLE_SUFFIX.search(raw_expr)
        if match is None:
            return raw_expr, "x"
        return raw_expr[:match.start()].strip(), match.group(1).lower()

    def _build_intent(self, intent: str, raw_expr: str, original_query: str,
                      variable: str = "x") -> "Optional[MathIntent]":
        """Clean ``raw_expr`` and wrap it in a MathIntent; None if nothing remains."""
        expr = self._clean_expression(raw_expr)
        if not expr:
            # e.g. the query was just "what is": there is nothing to solve.
            return None
        return MathIntent(
            intent=intent,
            expression=expr,
            variable=variable,
            original_query=original_query,
        )

    def _clean_expression(self, text: str) -> str:
        """
        Removes natural language words from an expression, leaving only
        the mathematical notation SymPy can safely parse.

        Prose is stripped wherever it appears, not only at the start:
        otherwise "what is the value of 5*9" would leave "the value of 5*9",
        whose letters would be read as symbols and multiplied together.

        Args:
            text: Raw expression or whole query.

        Returns:
            The text with known prose phrases removed and whitespace
            collapsed; may be empty.
        """
        text = self._FIND_X_PATTERN.sub(" ", text.strip())
        for pattern in self._PROSE_PATTERNS:
            text = pattern.sub(" ", text)
        # Collapse the gaps left behind by the removed phrases.
        return re.sub(r" +", " ", text).strip()

    def _is_arithmetic(self, text: str) -> bool:
        """
        Checks if text is primarily arithmetic (numbers, operators).

        True only when ``text`` contains at least one digit and every
        character is a digit, an operator, whitespace or one of the
        algebraic variables ``x y z a b c`` (so "2x + 5" qualifies).
        """
        if not any(char.isdigit() for char in text):
            return False
        return all(char in self._ARITHMETIC_CHARS for char in text)