File size: 5,901 Bytes
565a379
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ac078d2
565a379
 
 
 
 
 
 
 
 
3a4bdd3
 
 
565a379
 
3a4bdd3
565a379
 
 
 
 
 
3a4bdd3
 
 
 
 
 
 
 
 
 
 
 
 
565a379
3a4bdd3
 
 
 
 
 
ac078d2
 
3a4bdd3
 
 
 
 
 
 
565a379
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a4bdd3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import re
from typing import Optional, Dict, Any
from dataclasses import dataclass
import logging

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)

@dataclass
class MathIntent:
    """
    Structured description of a math request extracted from free text.

    Instances are plain value objects: two intents compare equal when all
    four fields are equal (standard dataclass behaviour).
    """

    # Kind of operation requested; MathQueryNormalizer.normalize emits
    # "derivative", "integral", "equation" or "arithmetic".
    intent: str
    # Expression text with the surrounding prose removed.
    expression: str
    # Variable the operation is taken with respect to; defaults to "x".
    variable: Optional[str] = "x"
    # The query exactly as the caller supplied it.
    original_query: str = ""

class MathQueryNormalizer:
    """
    Normalizes natural language math queries into structured intents
    resolvable by symbolic solvers.

    ``normalize`` is the entry point: it lower-cases the query, decides which
    intent it expresses ("derivative", "integral", "equation" or
    "arithmetic") and returns a ``MathIntent`` whose ``expression`` has had
    English prose removed. Queries that express no recognisable intent, or
    whose expression is empty once the prose is removed, yield ``None``.
    """

    # Prose words/phrases removed from an expression wherever they appear.
    # NOTE: the standalone word "i" is treated as prose, so an expression such
    # as "2*i" loses its "i".
    _PROSE_WORDS = (
        "what is", "what are", "the value of", "the result of",
        "please", "calculate", "compute", "evaluate", "find",
        "solve", "simplify", "determine", "the", "of", "for",
        "result", "value", "answer", "how do i", "how to", "i", "can you",
        "find x", "how do we", "we", "do", "how",
    )

    # Compiled once instead of on every call. Longest phrases come first so
    # that e.g. "the value of" is removed as a unit before "the" / "of".
    _PROSE_PATTERNS = tuple(
        re.compile(rf"\b{re.escape(phrase)}\b", re.IGNORECASE)
        for phrase in sorted(_PROSE_WORDS, key=len, reverse=True)
    )

    # Characters accepted by _is_arithmetic: digits, basic operators,
    # parentheses, '.', '=', space/tab and the variable letters x, y, z, a, b, c.
    _ARITHMETIC_CHARS = frozenset("0123456789+-*/^().= \t" "xyzabc")

    # Leading phrases that force the "arithmetic" intent.
    _ARITHMETIC_PREFIXES = ("calculate", "what is", "evaluate")

    # Intents detected through self.intent_patterns, in priority order.
    _PATTERN_INTENTS = ("derivative", "integral")

    def __init__(self):
        # Regexes per intent; group 1 captures the expression. Only the
        # "derivative" and "integral" lists are consulted by normalize();
        # the remaining lists are kept as-is but are not currently used there.
        self.intent_patterns = {
            "derivative": [
                r"derivative of\s+(.+)",
                r"derive\s+(.+)",
                r"differentiate\s+(.+)",
                # Bracketed form first so the closing ']' is not swallowed by
                # the greedy capture ("d/dx [x^2]" -> "x^2", not "x^2]").
                r"d/dx\s*\[(.+)\]",
                r"d/dx\s*\[?(.+)"
            ],
            "integral": [
                r"integral of\s+(.+)",
                r"integrate\s+(.+)",
                r"antiderivative of\s+(.+)"
            ],
            "limit": [
                r"limit of\s+(.+)\s+as\s+(\w+)\s+approaches\s+(.+)",
                r"lim\s+(.+)"
            ],
            "equation": [
                r"solve\s+(.+)",
                r"find x for\s+(.+)", # basic
                r"(.+)=(.+)" # implicit equation if contains =
            ],
            "arithmetic": [
                r"calculate\s+(.+)",
                r"what is\s+(.+)",
                r"evaluate\s+(.+)"
            ]
        }

        # Stop words to clean from expression
        self.stop_words = ["what is", "calculate", "the", "please", "solve", "evaluate"]

    def normalize(self, text: str) -> "Optional[MathIntent]":
        """
        Parses text to identify math intent and extract the core expression.

        Args:
            text: The raw user query.

        Returns:
            A MathIntent (variable is always 'x'), or None if the text is
            empty, no clear math intent is found, or nothing is left of the
            expression after prose removal.
        """
        if not text:
            return None

        clean_text = text.lower().strip().rstrip("?")

        # 1. Pattern-driven intents (derivative, then integral). The first
        #    pattern that matches decides the outcome.
        for intent in self._PATTERN_INTENTS:
            for pattern in self.intent_patterns[intent]:
                match = re.search(pattern, clean_text)
                if match:
                    return self._build_intent(intent, match.group(1), text)

        # 2. Equation solving: any remaining query containing '='.
        if "=" in clean_text:
            return self._build_intent("equation", clean_text, text)

        # 3. Arithmetic / simplification: purely numeric/algebraic text, or a
        #    query that starts with "calculate", "what is" or "evaluate".
        if self._is_arithmetic(clean_text) or clean_text.startswith(self._ARITHMETIC_PREFIXES):
            return self._build_intent("arithmetic", clean_text, text)

        return None

    def _build_intent(self, intent: str, raw_expression: str, original_query: str) -> "Optional[MathIntent]":
        """Cleans raw_expression and wraps it in a MathIntent; None if nothing remains."""
        expr = self._clean_expression(raw_expression)
        if not expr:
            # e.g. "calculate" or "derivative of the": only prose was supplied.
            return None
        return MathIntent(
            intent=intent,
            expression=expr,
            variable='x',
            original_query=original_query
        )

    def _clean_expression(self, text: str) -> str:
        """
        Removes natural language words from an expression, leaving only
        the mathematical notation.

        Prose is removed wherever it appears, not just at the start of the
        string. Per the original author's note, stripping only a leading
        stop word turned "what is the value of 5*9" into "the value of 5*9",
        which SymPy then parsed as a product of single-letter symbols.

        Args:
            text: Expression text, possibly mixed with English prose.

        Returns:
            The text with every known prose word/phrase removed (matched
            case-insensitively on word boundaries), runs of spaces collapsed
            and surrounding whitespace stripped. May be the empty string.
        """
        text = text.strip()

        for prose_pattern in self._PROSE_PATTERNS:
            text = prose_pattern.sub(' ', text)

        # Collapse multiple spaces
        return re.sub(r' +', ' ', text).strip()

    def _is_arithmetic(self, text: str) -> bool:
        """
        Checks if text is primarily arithmetic (numbers, operators).

        Returns True only when text contains at least one digit and every
        character is a digit, one of ``+-*/^().=``, a space or tab, or one of
        the variable letters x, y, z, a, b, c (allowed so that input such as
        "2x + 5" counts as simplifiable).
        """
        if not any(char.isdigit() for char in text):
            return False
        return all(char in self._ARITHMETIC_CHARS for char in text)