j-js committed on
Commit
0e143c5
·
verified ·
1 Parent(s): 6830b8b

Create question_parser.py

Browse files
Files changed (1) hide show
  1. question_parser.py +299 -0
question_parser.py ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from dataclasses import dataclass, field
5
+ from typing import List, Optional
6
+
7
+
8
@dataclass
class ParsedQuestion:
    """Structured summary of one question, as returned by ``parse_question``.

    Only the two text fields are required; every other field has a default,
    and each list field gets its own fresh empty list per instance.
    """

    # --- text -----------------------------------------------------------
    raw_text: str          # the input exactly as it was passed in
    normalized_text: str   # trimmed, quotes straightened, whitespace collapsed

    # --- classification -------------------------------------------------
    topic: Optional[str] = None      # topic label, or None when nothing matched
    asks_for: Optional[str] = None   # short description of the target quantity

    # --- descriptive notes derived from the topic and wording -----------
    givens: List[str] = field(default_factory=list)
    constraints: List[str] = field(default_factory=list)
    relationships: List[str] = field(default_factory=list)
    needed_concepts: List[str] = field(default_factory=list)
    trap_notes: List[str] = field(default_factory=list)

    # --- raw tokens pulled from the text --------------------------------
    numbers: List[str] = field(default_factory=list)     # numeric tokens, kept as strings
    variables: List[str] = field(default_factory=list)   # single-letter tokens

    # --- independent feature flags (several may be true at once) --------
    has_percent: bool = False
    has_ratio: bool = False
    has_equation: bool = False
    has_probability: bool = False
    has_geometry: bool = False
    has_statistics: bool = False
    has_number_properties: bool = False
32
+
33
+
34
+ def _normalize_text(text: str) -> str:
35
+ text = (text or "").strip()
36
+ text = text.replace("’", "'").replace("“", '"').replace("”", '"')
37
+ text = re.sub(r"\s+", " ", text)
38
+ return text
39
+
40
+
41
+ def _extract_numbers(text: str) -> List[str]:
42
+ return re.findall(r"\b\d+(?:\.\d+)?%?\b", text)
43
+
44
+
45
+ def _extract_variables(text: str) -> List[str]:
46
+ vars_found = re.findall(r"\b[a-z]\b", text.lower())
47
+ common_noise = {"a", "i"}
48
+ return [v for v in vars_found if v not in common_noise]
49
+
50
+
51
+ def _detect_topic(t: str) -> Optional[str]:
52
+ if "%" in t or "percent" in t or "percentage" in t:
53
+ return "percent"
54
+
55
+ if "ratio" in t or "proportion" in t or re.search(r"\b\d+\s*:\s*\d+\b", t):
56
+ return "ratio"
57
+
58
+ if any(word in t for word in ["probability", "chance", "odds", "randomly"]):
59
+ return "probability"
60
+
61
+ if any(word in t for word in ["mean", "median", "average", "mode", "standard deviation", "range"]):
62
+ return "statistics"
63
+
64
+ if any(word in t for word in ["triangle", "circle", "angle", "area", "perimeter", "radius", "diameter"]):
65
+ return "geometry"
66
+
67
+ if any(word in t for word in ["remainder", "divisible", "factor", "multiple", "prime", "integer", "even", "odd"]):
68
+ return "number_theory"
69
+
70
+ if "=" in t or re.search(r"\bsolve for\b", t) or re.search(r"\bwhat is [a-z]\b", t):
71
+ return "algebra"
72
+
73
+ return None
74
+
75
+
76
+ def _detect_asks_for(t: str, topic: Optional[str]) -> Optional[str]:
77
+ lower = t.lower()
78
+
79
+ m = re.search(r"\bwhat is the value of ([a-z])\b", lower)
80
+ if m:
81
+ return f"the value of {m.group(1)}"
82
+
83
+ m = re.search(r"\bsolve for ([a-z])\b", lower)
84
+ if m:
85
+ return f"the value of {m.group(1)}"
86
+
87
+ if topic == "percent":
88
+ if "original" in lower or "whole" in lower:
89
+ return "the original whole value"
90
+ if "percent" in lower and "of" in lower:
91
+ return "the missing value in a percent relationship"
92
+ return "the unknown quantity in the percent relationship"
93
+
94
+ if topic == "ratio":
95
+ return "the missing part or total in the ratio relationship"
96
+
97
+ if topic == "probability":
98
+ return "the probability of the event"
99
+
100
+ if topic == "statistics":
101
+ return "the requested statistic"
102
+
103
+ if topic == "geometry":
104
+ return "the missing geometric quantity"
105
+
106
+ if topic == "number_theory":
107
+ return "the required number property or unknown integer"
108
+
109
+ if topic == "algebra":
110
+ return "the value of the variable"
111
+
112
+ return "the target quantity asked for in the question"
113
+
114
+
115
def _extract_givens(text: str, topic: Optional[str]) -> List[str]:
    """List human-readable statements about what the question provides.

    Topic-specific statements come first, followed by a summary of up to
    five numbers found in *text* (when there are any).
    """
    lowered = text.lower()
    facts: List[str] = []

    if topic == "percent":
        percent_tokens = re.findall(r"\b\d+(?:\.\d+)?%", text)
        if percent_tokens:
            facts.append(f"A percentage is given: {', '.join(percent_tokens[:3])}")
        if "of" in lowered:
            facts.append("The wording uses a part-of-whole relationship")
    elif topic == "ratio":
        ratio_tokens = re.findall(r"\b\d+\s*:\s*\d+\b", text)
        if ratio_tokens:
            facts.append(f"A ratio is given: {', '.join(ratio_tokens[:3])}")
        if "ratio" in lowered or "proportion" in lowered:
            facts.append("The question involves a comparison between quantities")
    elif topic == "algebra":
        if "=" in text:
            facts.append("An equation is given")
        letters = _extract_variables(text)
        if letters:
            # Report at most three distinct letters, alphabetically.
            facts.append(f"A variable appears in the equation: {', '.join(sorted(set(letters))[:3])}")
    else:
        # Remaining topics contribute one fixed statement each.
        fixed_statement = {
            "probability": "The question describes an event and a total set of possible outcomes",
            "statistics": "The question provides values or a distribution to summarize",
            "geometry": "The question gives shape-based information",
            "number_theory": "The question gives number-property information",
        }
        if topic in fixed_statement:
            facts.append(fixed_statement[topic])

    mentioned = _extract_numbers(text)
    if mentioned:
        facts.append(f"Numbers mentioned: {', '.join(mentioned[:5])}")

    return facts
156
+
157
+
158
+ def _extract_constraints(text: str) -> List[str]:
159
+ constraints: List[str] = []
160
+ lower = text.lower()
161
+
162
+ phrases = [
163
+ "integer",
164
+ "positive",
165
+ "negative",
166
+ "nonnegative",
167
+ "distinct",
168
+ "consecutive",
169
+ "even",
170
+ "odd",
171
+ "at least",
172
+ "at most",
173
+ "greater than",
174
+ "less than",
175
+ "multiple choice",
176
+ ]
177
+
178
+ for p in phrases:
179
+ if p in lower:
180
+ constraints.append(p)
181
+
182
+ return constraints
183
+
184
+
185
+ def _extract_relationships(text: str, topic: Optional[str]) -> List[str]:
186
+ rel: List[str] = []
187
+ lower = text.lower()
188
+
189
+ if topic == "percent":
190
+ rel.append("This is a part-percent-whole relationship")
191
+ if "of" in lower:
192
+ rel.append("A quantity is being described as a percent of another quantity")
193
+
194
+ elif topic == "ratio":
195
+ rel.append("This compares quantities in a fixed proportion")
196
+ if "total" in lower:
197
+ rel.append("You may need to connect the ratio parts to a total")
198
+
199
+ elif topic == "algebra":
200
+ rel.append("The equal sign means both sides represent the same value")
201
+ rel.append("You need to isolate the variable while keeping the equation balanced")
202
+
203
+ elif topic == "probability":
204
+ rel.append("Probability compares favorable outcomes to total possible outcomes")
205
+
206
+ elif topic == "statistics":
207
+ rel.append("You need to match the question to the correct summary measure")
208
+
209
+ elif topic == "geometry":
210
+ rel.append("The quantities are linked by properties of the figure")
211
+
212
+ elif topic == "number_theory":
213
+ rel.append("The solution depends on a number rule such as divisibility or factors")
214
+
215
+ return rel
216
+
217
+
218
+ def _needed_concepts(topic: Optional[str]) -> List[str]:
219
+ if topic == "percent":
220
+ return ["percent equation", "part-whole thinking"]
221
+ if topic == "ratio":
222
+ return ["ratio structure", "part-to-part or part-to-whole setup"]
223
+ if topic == "algebra":
224
+ return ["equation balancing", "inverse operations"]
225
+ if topic == "probability":
226
+ return ["favorable outcomes", "total outcomes"]
227
+ if topic == "statistics":
228
+ return ["identify the correct statistic", "use the relevant values only"]
229
+ if topic == "geometry":
230
+ return ["figure properties", "spatial relationships"]
231
+ if topic == "number_theory":
232
+ return ["divisibility/factor rules", "integer properties"]
233
+ return []
234
+
235
+
236
+ def _trap_notes(topic: Optional[str], text: str) -> List[str]:
237
+ traps: List[str] = []
238
+ lower = text.lower()
239
+
240
+ if topic == "percent":
241
+ traps.append("Do not confuse the part with the whole")
242
+ traps.append("Check whether you are solving forward or backward")
243
+
244
+ elif topic == "ratio":
245
+ traps.append("Do not add or compare ratio parts inconsistently")
246
+ traps.append("Check whether the ratio is part-to-part or part-to-whole")
247
+
248
+ elif topic == "algebra":
249
+ traps.append("Do not perform an operation on only one side of the equation")
250
+ traps.append("Watch for distribution or sign mistakes")
251
+
252
+ elif topic == "probability":
253
+ traps.append("Do not forget the total number of possible outcomes")
254
+ traps.append("Check whether order matters")
255
+
256
+ elif topic == "statistics":
257
+ traps.append("Do not use the wrong measure")
258
+ traps.append("Check whether outliers matter")
259
+
260
+ elif topic == "geometry":
261
+ traps.append("Do not assume a figure is drawn to scale unless stated")
262
+ traps.append("Use only relationships actually given")
263
+
264
+ elif topic == "number_theory":
265
+ traps.append("Check the exact divisibility or remainder condition")
266
+ traps.append("Do not assume every integer behaves the same way")
267
+
268
+ if "except" in lower:
269
+ traps.append("Watch for exception wording")
270
+
271
+ return traps
272
+
273
+
274
def parse_question(text: str) -> ParsedQuestion:
    """Parse a raw question string into a ``ParsedQuestion``.

    The text is normalized once; the topic is detected from its lowercase
    form and then passed to the helper extractors. ``raw_text`` keeps the
    argument exactly as given. The ``has_*`` flags are computed
    independently of the chosen topic, so several can be true for one
    question.
    """
    normalized = _normalize_text(text)
    lower = normalized.lower()

    topic = _detect_topic(lower)

    def mentions(*terms: str) -> bool:
        # Whole-word match with an optional plural ending. A plain substring
        # test would set has_number_properties for "seven" (contains "even")
        # and has_ratio for "operation" (contains "ratio").
        alternatives = "|".join(re.escape(term) for term in terms)
        pattern = rf"(?<![a-z])(?:{alternatives})(?:s|es)?(?![a-z])"
        return re.search(pattern, lower) is not None

    return ParsedQuestion(
        raw_text=text,
        normalized_text=normalized,
        topic=topic,
        asks_for=_detect_asks_for(normalized, topic),
        givens=_extract_givens(normalized, topic),
        constraints=_extract_constraints(normalized),
        relationships=_extract_relationships(normalized, topic),
        needed_concepts=_needed_concepts(topic),
        trap_notes=_trap_notes(topic, normalized),
        numbers=_extract_numbers(normalized),
        variables=_extract_variables(normalized),
        has_percent="%" in lower or mentions("percent", "percentage"),
        # A bare "3:2" style token also counts as a ratio.
        has_ratio=(
            mentions("ratio", "proportion", "proportional")
            or bool(re.search(r"\b\d+\s*:\s*\d+\b", lower))
        ),
        has_equation="=" in lower,
        has_probability=mentions("probability", "probabilities", "chance", "odds", "randomly"),
        # "rectangle" / "semicircle" are explicit because whole-word matching
        # no longer reaches them through "angle" / "circle".
        has_geometry=mentions(
            "triangle", "rectangle", "circle", "semicircle", "angle", "area",
            "perimeter", "radius", "radii", "diameter",
        ),
        has_statistics=mentions("mean", "median", "average", "mode", "standard deviation", "range"),
        has_number_properties=mentions(
            "remainder", "divisible", "factor", "multiple", "prime", "integer", "even", "odd",
        ),
    )