Create question_parser.py
Browse files- question_parser.py +299 -0
question_parser.py
ADDED
|
@@ -0,0 +1,299 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
from dataclasses import dataclass, field
|
| 5 |
+
from typing import List, Optional
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
@dataclass
class ParsedQuestion:
    """Structured summary of a single question, as returned by the parser.

    Only ``raw_text`` and ``normalized_text`` are required. Every other field
    falls back to ``None``, a fresh empty list, or ``False``.
    """

    # The text as received, and its cleaned-up counterpart.
    raw_text: str
    normalized_text: str

    # Coarse classification of the question.
    topic: Optional[str] = None
    asks_for: Optional[str] = None

    # Human-readable notes; default_factory gives each instance its own list.
    givens: List[str] = field(default_factory=list)
    constraints: List[str] = field(default_factory=list)
    relationships: List[str] = field(default_factory=list)
    needed_concepts: List[str] = field(default_factory=list)
    trap_notes: List[str] = field(default_factory=list)

    # Raw tokens pulled out of the text.
    numbers: List[str] = field(default_factory=list)
    variables: List[str] = field(default_factory=list)

    # Independent feature flags.
    has_percent: bool = False
    has_ratio: bool = False
    has_equation: bool = False
    has_probability: bool = False
    has_geometry: bool = False
    has_statistics: bool = False
    has_number_properties: bool = False
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def _normalize_text(text: str) -> str:
|
| 35 |
+
text = (text or "").strip()
|
| 36 |
+
text = text.replace("’", "'").replace("“", '"').replace("”", '"')
|
| 37 |
+
text = re.sub(r"\s+", " ", text)
|
| 38 |
+
return text
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def _extract_numbers(text: str) -> List[str]:
|
| 42 |
+
return re.findall(r"\b\d+(?:\.\d+)?%?\b", text)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def _extract_variables(text: str) -> List[str]:
|
| 46 |
+
vars_found = re.findall(r"\b[a-z]\b", text.lower())
|
| 47 |
+
common_noise = {"a", "i"}
|
| 48 |
+
return [v for v in vars_found if v not in common_noise]
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def _detect_topic(t: str) -> Optional[str]:
|
| 52 |
+
if "%" in t or "percent" in t or "percentage" in t:
|
| 53 |
+
return "percent"
|
| 54 |
+
|
| 55 |
+
if "ratio" in t or "proportion" in t or re.search(r"\b\d+\s*:\s*\d+\b", t):
|
| 56 |
+
return "ratio"
|
| 57 |
+
|
| 58 |
+
if any(word in t for word in ["probability", "chance", "odds", "randomly"]):
|
| 59 |
+
return "probability"
|
| 60 |
+
|
| 61 |
+
if any(word in t for word in ["mean", "median", "average", "mode", "standard deviation", "range"]):
|
| 62 |
+
return "statistics"
|
| 63 |
+
|
| 64 |
+
if any(word in t for word in ["triangle", "circle", "angle", "area", "perimeter", "radius", "diameter"]):
|
| 65 |
+
return "geometry"
|
| 66 |
+
|
| 67 |
+
if any(word in t for word in ["remainder", "divisible", "factor", "multiple", "prime", "integer", "even", "odd"]):
|
| 68 |
+
return "number_theory"
|
| 69 |
+
|
| 70 |
+
if "=" in t or re.search(r"\bsolve for\b", t) or re.search(r"\bwhat is [a-z]\b", t):
|
| 71 |
+
return "algebra"
|
| 72 |
+
|
| 73 |
+
return None
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def _detect_asks_for(t: str, topic: Optional[str]) -> Optional[str]:
|
| 77 |
+
lower = t.lower()
|
| 78 |
+
|
| 79 |
+
m = re.search(r"\bwhat is the value of ([a-z])\b", lower)
|
| 80 |
+
if m:
|
| 81 |
+
return f"the value of {m.group(1)}"
|
| 82 |
+
|
| 83 |
+
m = re.search(r"\bsolve for ([a-z])\b", lower)
|
| 84 |
+
if m:
|
| 85 |
+
return f"the value of {m.group(1)}"
|
| 86 |
+
|
| 87 |
+
if topic == "percent":
|
| 88 |
+
if "original" in lower or "whole" in lower:
|
| 89 |
+
return "the original whole value"
|
| 90 |
+
if "percent" in lower and "of" in lower:
|
| 91 |
+
return "the missing value in a percent relationship"
|
| 92 |
+
return "the unknown quantity in the percent relationship"
|
| 93 |
+
|
| 94 |
+
if topic == "ratio":
|
| 95 |
+
return "the missing part or total in the ratio relationship"
|
| 96 |
+
|
| 97 |
+
if topic == "probability":
|
| 98 |
+
return "the probability of the event"
|
| 99 |
+
|
| 100 |
+
if topic == "statistics":
|
| 101 |
+
return "the requested statistic"
|
| 102 |
+
|
| 103 |
+
if topic == "geometry":
|
| 104 |
+
return "the missing geometric quantity"
|
| 105 |
+
|
| 106 |
+
if topic == "number_theory":
|
| 107 |
+
return "the required number property or unknown integer"
|
| 108 |
+
|
| 109 |
+
if topic == "algebra":
|
| 110 |
+
return "the value of the variable"
|
| 111 |
+
|
| 112 |
+
return "the target quantity asked for in the question"
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def _extract_givens(text: str, topic: Optional[str]) -> List[str]:
    """List human-readable notes about what the question *text* provides.

    Topic-specific notes come first. A final "Numbers mentioned" note, capped
    at five values, is added whenever the text contains numbers.
    """
    lower = text.lower()
    givens: List[str] = []

    if topic == "percent":
        percents = re.findall(r"\b\d+(?:\.\d+)?%", text)
        if percents:
            givens.append(f"A percentage is given: {', '.join(percents[:3])}")
        # Whole-word match: a bare substring test also fires on "profit".
        if re.search(r"\bof\b", lower):
            givens.append("The wording uses a part-of-whole relationship")

    elif topic == "ratio":
        ratio_match = re.findall(r"\b\d+\s*:\s*\d+\b", text)
        if ratio_match:
            givens.append(f"A ratio is given: {', '.join(ratio_match[:3])}")
        # Whole-word "ratio" so that "operation" does not count.
        if re.search(r"\bratios?\b", lower) or "proportion" in lower:
            givens.append("The question involves a comparison between quantities")

    elif topic == "algebra":
        if "=" in text:
            givens.append("An equation is given")
        vars_found = _extract_variables(text)
        if vars_found:
            # De-duplicated and sorted; at most three names are shown.
            givens.append(
                f"A variable appears in the equation: {', '.join(sorted(set(vars_found))[:3])}"
            )

    else:
        fixed_notes = {
            "probability": "The question describes an event and a total set of possible outcomes",
            "statistics": "The question provides values or a distribution to summarize",
            "geometry": "The question gives shape-based information",
            "number_theory": "The question gives number-property information",
        }
        note = fixed_notes.get(topic)
        if note is not None:
            givens.append(note)

    numbers = _extract_numbers(text)
    if numbers:
        givens.append(f"Numbers mentioned: {', '.join(numbers[:5])}")

    return givens
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def _extract_constraints(text: str) -> List[str]:
|
| 159 |
+
constraints: List[str] = []
|
| 160 |
+
lower = text.lower()
|
| 161 |
+
|
| 162 |
+
phrases = [
|
| 163 |
+
"integer",
|
| 164 |
+
"positive",
|
| 165 |
+
"negative",
|
| 166 |
+
"nonnegative",
|
| 167 |
+
"distinct",
|
| 168 |
+
"consecutive",
|
| 169 |
+
"even",
|
| 170 |
+
"odd",
|
| 171 |
+
"at least",
|
| 172 |
+
"at most",
|
| 173 |
+
"greater than",
|
| 174 |
+
"less than",
|
| 175 |
+
"multiple choice",
|
| 176 |
+
]
|
| 177 |
+
|
| 178 |
+
for p in phrases:
|
| 179 |
+
if p in lower:
|
| 180 |
+
constraints.append(p)
|
| 181 |
+
|
| 182 |
+
return constraints
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def _extract_relationships(text: str, topic: Optional[str]) -> List[str]:
|
| 186 |
+
rel: List[str] = []
|
| 187 |
+
lower = text.lower()
|
| 188 |
+
|
| 189 |
+
if topic == "percent":
|
| 190 |
+
rel.append("This is a part-percent-whole relationship")
|
| 191 |
+
if "of" in lower:
|
| 192 |
+
rel.append("A quantity is being described as a percent of another quantity")
|
| 193 |
+
|
| 194 |
+
elif topic == "ratio":
|
| 195 |
+
rel.append("This compares quantities in a fixed proportion")
|
| 196 |
+
if "total" in lower:
|
| 197 |
+
rel.append("You may need to connect the ratio parts to a total")
|
| 198 |
+
|
| 199 |
+
elif topic == "algebra":
|
| 200 |
+
rel.append("The equal sign means both sides represent the same value")
|
| 201 |
+
rel.append("You need to isolate the variable while keeping the equation balanced")
|
| 202 |
+
|
| 203 |
+
elif topic == "probability":
|
| 204 |
+
rel.append("Probability compares favorable outcomes to total possible outcomes")
|
| 205 |
+
|
| 206 |
+
elif topic == "statistics":
|
| 207 |
+
rel.append("You need to match the question to the correct summary measure")
|
| 208 |
+
|
| 209 |
+
elif topic == "geometry":
|
| 210 |
+
rel.append("The quantities are linked by properties of the figure")
|
| 211 |
+
|
| 212 |
+
elif topic == "number_theory":
|
| 213 |
+
rel.append("The solution depends on a number rule such as divisibility or factors")
|
| 214 |
+
|
| 215 |
+
return rel
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
def _needed_concepts(topic: Optional[str]) -> List[str]:
|
| 219 |
+
if topic == "percent":
|
| 220 |
+
return ["percent equation", "part-whole thinking"]
|
| 221 |
+
if topic == "ratio":
|
| 222 |
+
return ["ratio structure", "part-to-part or part-to-whole setup"]
|
| 223 |
+
if topic == "algebra":
|
| 224 |
+
return ["equation balancing", "inverse operations"]
|
| 225 |
+
if topic == "probability":
|
| 226 |
+
return ["favorable outcomes", "total outcomes"]
|
| 227 |
+
if topic == "statistics":
|
| 228 |
+
return ["identify the correct statistic", "use the relevant values only"]
|
| 229 |
+
if topic == "geometry":
|
| 230 |
+
return ["figure properties", "spatial relationships"]
|
| 231 |
+
if topic == "number_theory":
|
| 232 |
+
return ["divisibility/factor rules", "integer properties"]
|
| 233 |
+
return []
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
def _trap_notes(topic: Optional[str], text: str) -> List[str]:
|
| 237 |
+
traps: List[str] = []
|
| 238 |
+
lower = text.lower()
|
| 239 |
+
|
| 240 |
+
if topic == "percent":
|
| 241 |
+
traps.append("Do not confuse the part with the whole")
|
| 242 |
+
traps.append("Check whether you are solving forward or backward")
|
| 243 |
+
|
| 244 |
+
elif topic == "ratio":
|
| 245 |
+
traps.append("Do not add or compare ratio parts inconsistently")
|
| 246 |
+
traps.append("Check whether the ratio is part-to-part or part-to-whole")
|
| 247 |
+
|
| 248 |
+
elif topic == "algebra":
|
| 249 |
+
traps.append("Do not perform an operation on only one side of the equation")
|
| 250 |
+
traps.append("Watch for distribution or sign mistakes")
|
| 251 |
+
|
| 252 |
+
elif topic == "probability":
|
| 253 |
+
traps.append("Do not forget the total number of possible outcomes")
|
| 254 |
+
traps.append("Check whether order matters")
|
| 255 |
+
|
| 256 |
+
elif topic == "statistics":
|
| 257 |
+
traps.append("Do not use the wrong measure")
|
| 258 |
+
traps.append("Check whether outliers matter")
|
| 259 |
+
|
| 260 |
+
elif topic == "geometry":
|
| 261 |
+
traps.append("Do not assume a figure is drawn to scale unless stated")
|
| 262 |
+
traps.append("Use only relationships actually given")
|
| 263 |
+
|
| 264 |
+
elif topic == "number_theory":
|
| 265 |
+
traps.append("Check the exact divisibility or remainder condition")
|
| 266 |
+
traps.append("Do not assume every integer behaves the same way")
|
| 267 |
+
|
| 268 |
+
if "except" in lower:
|
| 269 |
+
traps.append("Watch for exception wording")
|
| 270 |
+
|
| 271 |
+
return traps
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
def parse_question(text: str) -> ParsedQuestion:
    """Parse a raw question string into a ``ParsedQuestion``.

    The text is normalised once. The topic is detected on the lower-cased
    form, and the remaining extractors receive the normalised text. The
    ``has_*`` flags are computed independently of the detected topic, so
    several may be true at once.

    Short keywords that also occur inside unrelated words ("ratio" in
    "operation", "even" in "revenue", "odd" in "odds") are matched as whole
    words only when computing the flags.
    """
    normalized = _normalize_text(text)
    lower = normalized.lower()
    topic = _detect_topic(lower)

    def has_stem(*stems: str) -> bool:
        # Plain substring test, for stems with no misleading superstrings.
        return any(stem in lower for stem in stems)

    def has_word(alternatives: str) -> bool:
        # Whole-word test; `alternatives` is a regex alternation.
        return re.search(rf"\b(?:{alternatives})\b", lower) is not None

    return ParsedQuestion(
        raw_text=text,
        normalized_text=normalized,
        topic=topic,
        asks_for=_detect_asks_for(normalized, topic),
        givens=_extract_givens(normalized, topic),
        constraints=_extract_constraints(normalized),
        relationships=_extract_relationships(normalized, topic),
        needed_concepts=_needed_concepts(topic),
        trap_notes=_trap_notes(topic, normalized),
        numbers=_extract_numbers(normalized),
        variables=_extract_variables(normalized),
        has_percent="%" in lower or "percent" in lower,
        has_ratio=(
            has_stem("proportion")
            or has_word(r"ratios?")
            or bool(re.search(r"\b\d+\s*:\s*\d+\b", lower))
        ),
        has_equation="=" in lower,
        has_probability=(
            has_stem("probability", "randomly") or has_word(r"chances?|odds")
        ),
        # "rectangle" is listed explicitly because "angle" no longer matches
        # inside longer words.
        has_geometry=(
            has_stem(
                "triangle", "rectangle", "circle", "perimeter", "radius", "diameter"
            )
            or has_word(r"angles?|areas?")
        ),
        has_statistics=(
            has_stem("median", "average", "standard deviation")
            or has_word(r"means?|modes?|ranges?")
        ),
        has_number_properties=(
            has_stem("remainder", "divisible", "integer")
            or has_word(
                r"factor(?:s|ed|ing|i[sz]ations?)?|multiples?|primes?|even|odd"
            )
        ),
    )
|