"""Heuristic parser that pulls lightweight structure out of a math question.

The public entry point is :func:`parse_question`, which returns a
:class:`ParsedQuestion`.  Everything here is keyword/regex based; no actual
math is performed.
"""

from __future__ import annotations

import re
from dataclasses import dataclass, field
from typing import List, Optional, Sequence, Tuple


@dataclass
class ParsedQuestion:
    """Structured summary of a single question produced by ``parse_question``."""

    raw_text: str  # text exactly as passed in by the caller
    normalized_text: str  # trimmed, straight quotes, single-spaced
    topic: Optional[str] = None  # one of the topic labels, or None
    asks_for: Optional[str] = None  # short description of the target
    givens: List[str] = field(default_factory=list)
    constraints: List[str] = field(default_factory=list)
    relationships: List[str] = field(default_factory=list)
    needed_concepts: List[str] = field(default_factory=list)
    trap_notes: List[str] = field(default_factory=list)
    numbers: List[str] = field(default_factory=list)
    variables: List[str] = field(default_factory=list)
    has_percent: bool = False
    has_ratio: bool = False
    has_equation: bool = False
    has_probability: bool = False
    has_geometry: bool = False
    has_statistics: bool = False
    has_number_properties: bool = False


def _keyword_pattern(
    words: Sequence[str], exact: Sequence[str] = ()
) -> "re.Pattern[str]":
    """Compile a case-insensitive whole-word matcher.

    ``words`` also match with a trailing ``s``/``es`` (simple plurals);
    ``exact`` entries match only as written.  Whole-word matching avoids the
    substring false positives of ``"even" in text`` (e.g. "seven").
    """
    alternatives = [re.escape(word) + r"(?:e?s)?" for word in words]
    alternatives.extend(re.escape(word) for word in exact)
    return re.compile(r"\b(?:" + "|".join(alternatives) + r")\b", re.IGNORECASE)


# Single source of truth for topic keywords; used both by topic detection and
# by the has_* flags so the two can never drift apart.
_PERCENT_WORDS = _keyword_pattern(("percent", "percentage"))
# "proportional" is listed explicitly because it is not a simple plural.
_RATIO_WORDS = _keyword_pattern(("ratio", "proportion", "proportional"))
_RATIO_NOTATION = re.compile(r"\b\d+\s*:\s*\d+\b")
_PROBABILITY_WORDS = _keyword_pattern(("probability", "chance", "odds", "randomly"))
_STATISTICS_WORDS = _keyword_pattern(
    ("mean", "median", "average", "mode", "standard deviation", "range")
)
# "rectangle" is listed explicitly: it was previously only caught because it
# happens to contain the substring "angle".
_GEOMETRY_WORDS = _keyword_pattern(
    (
        "triangle",
        "rectangle",
        "circle",
        "angle",
        "area",
        "perimeter",
        "radius",
        "diameter",
    )
)
# "even"/"odd" are exact so that "odds" is not read as a parity word.
_NUMBER_PROPERTY_WORDS = _keyword_pattern(
    ("remainder", "divisible", "factor", "multiple", "prime", "integer"),
    exact=("even", "odd"),
)

# Checked in this order, after percent and ratio and before algebra.
_KEYWORD_TOPICS: Tuple[Tuple[str, "re.Pattern[str]"], ...] = (
    ("probability", _PROBABILITY_WORDS),
    ("statistics", _STATISTICS_WORDS),
    ("geometry", _GEOMETRY_WORDS),
    ("number_theory", _NUMBER_PROPERTY_WORDS),
)

# "what is a ..." / "what is i ..." are ordinary English, not variable names.
_ALGEBRA_CUE = re.compile(
    r"\bsolve for\b|\bwhat is (?![ai]\b)[a-z]\b", re.IGNORECASE
)
_OF_WORD = re.compile(r"\bof\b", re.IGNORECASE)
_ORIGINAL_OR_WHOLE = re.compile(r"\boriginal(?:ly)?\b|\bwhole\b", re.IGNORECASE)

_PERCENT_LITERAL = re.compile(r"\b\d+(?:\.\d+)?%")
# A number ends either with an explicit "%" or at a word boundary.  The
# earlier form "%?\b" could never keep the "%" because there is no word
# boundary between "%" and a following space or punctuation mark.
_NUMBER = re.compile(r"\b\d+(?:\.\d+)?(?:%|\b)")
# A lone letter: not glued to another letter, and not the "s" of "what's".
# Digits are allowed as neighbours so the x in "2x" is found.
_LONE_LETTER = re.compile(r"(?<![a-z'])[a-z](?![a-z])")
_NOISE_LETTERS = frozenset({"a", "i"})

_QUOTE_TABLE = str.maketrans({"\u2019": "'", "\u201c": '"', "\u201d": '"'})

# (label, pattern) pairs; labels are emitted in this order.
_CONSTRAINT_PATTERNS: Tuple[Tuple[str, "re.Pattern[str]"], ...] = tuple(
    (label, re.compile(pattern, re.IGNORECASE))
    for label, pattern in (
        ("integer", r"\bintegers?\b"),
        ("positive", r"(?<!non-)\bpositive\b"),
        ("negative", r"(?<!non-)\bnegative\b"),
        ("nonnegative", r"\bnon-?negative\b"),
        ("distinct", r"\bdistinct\b"),
        ("consecutive", r"\bconsecutive\b"),
        ("even", r"\beven\b"),
        ("odd", r"\bodd\b"),
        ("at least", r"\bat least\b"),
        ("at most", r"\bat most\b"),
        ("greater than", r"\bgreater than\b"),
        ("less than", r"\bless than\b"),
        ("multiple choice", r"\bmultiple choice\b"),
    )
)

_ASKS_FOR_BY_TOPIC = {
    "ratio": "the missing part or total in the ratio relationship",
    "probability": "the probability of the event",
    "statistics": "the requested statistic",
    "geometry": "the missing geometric quantity",
    "number_theory": "the required number property or unknown integer",
    "algebra": "the value of the variable",
}

_GIVENS_BY_TOPIC = {
    "probability": "The question describes an event and a total set of possible outcomes",
    "statistics": "The question provides values or a distribution to summarize",
    "geometry": "The question gives shape-based information",
    "number_theory": "The question gives number-property information",
}

_CONCEPTS_BY_TOPIC = {
    "percent": ("percent equation", "part-whole thinking"),
    "ratio": ("ratio structure", "part-to-part or part-to-whole setup"),
    "algebra": ("equation balancing", "inverse operations"),
    "probability": ("favorable outcomes", "total outcomes"),
    "statistics": ("identify the correct statistic", "use the relevant values only"),
    "geometry": ("figure properties", "spatial relationships"),
    "number_theory": ("divisibility/factor rules", "integer properties"),
}

_TRAPS_BY_TOPIC = {
    "percent": (
        "Do not confuse the part with the whole",
        "Check whether you are solving forward or backward",
    ),
    "ratio": (
        "Do not add or compare ratio parts inconsistently",
        "Check whether the ratio is part-to-part or part-to-whole",
    ),
    "algebra": (
        "Do not perform an operation on only one side of the equation",
        "Watch for distribution or sign mistakes",
    ),
    "probability": (
        "Do not forget the total number of possible outcomes",
        "Check whether order matters",
    ),
    "statistics": (
        "Do not use the wrong measure",
        "Check whether outliers matter",
    ),
    "geometry": (
        "Do not assume a figure is drawn to scale unless stated",
        "Use only relationships actually given",
    ),
    "number_theory": (
        "Check the exact divisibility or remainder condition",
        "Do not assume every integer behaves the same way",
    ),
}


def _mentions_percent(text: str) -> bool:
    """Return True if ``text`` has a ``%`` sign or a percent word."""
    return "%" in text or bool(_PERCENT_WORDS.search(text))


def _mentions_ratio(text: str) -> bool:
    """Return True if ``text`` has a ratio word or ``a:b`` notation."""
    return bool(_RATIO_WORDS.search(text) or _RATIO_NOTATION.search(text))


def _normalize_text(text: str) -> str:
    """Trim, straighten curly quotes and collapse whitespace runs.

    ``None`` (or any falsy value) is treated as the empty string.
    """
    cleaned = (text or "").strip().translate(_QUOTE_TABLE)
    return re.sub(r"\s+", " ", cleaned)


def _extract_numbers(text: str) -> List[str]:
    """Return numeric literals in order of appearance, keeping a trailing ``%``."""
    return _NUMBER.findall(text)


def _extract_variables(text: str) -> List[str]:
    """Return lone lowercase letters in order of appearance (duplicates kept).

    The letters "a" and "i" are dropped because they are usually English
    words rather than variable names.
    """
    return [
        letter
        for letter in _LONE_LETTER.findall(text.lower())
        if letter not in _NOISE_LETTERS
    ]


def _detect_topic(t: str) -> Optional[str]:
    """Return the first matching topic label for ``t``, or ``None``.

    Priority: percent, ratio, probability, statistics, geometry,
    number_theory, algebra.
    """
    if _mentions_percent(t):
        return "percent"
    if _mentions_ratio(t):
        return "ratio"
    for label, pattern in _KEYWORD_TOPICS:
        if pattern.search(t):
            return label
    if "=" in t or _ALGEBRA_CUE.search(t):
        return "algebra"
    return None


def _detect_asks_for(t: str, topic: Optional[str]) -> Optional[str]:
    """Describe what the question wants, given its text and detected topic.

    An explicit "what is the value of <letter>" or "solve for <letter>" wins
    over any topic-based description.
    """
    lower = t.lower()
    for cue in (r"\bwhat is the value of ([a-z])\b", r"\bsolve for ([a-z])\b"):
        found = re.search(cue, lower)
        if found:
            return f"the value of {found.group(1)}"

    if topic == "percent":
        if _ORIGINAL_OR_WHOLE.search(lower):
            return "the original whole value"
        # Whole-word "of" so that e.g. "profit" does not count.
        if _PERCENT_WORDS.search(lower) and _OF_WORD.search(lower):
            return "the missing value in a percent relationship"
        return "the unknown quantity in the percent relationship"

    return _ASKS_FOR_BY_TOPIC.get(
        topic, "the target quantity asked for in the question"
    )


def _extract_givens(text: str, topic: Optional[str]) -> List[str]:
    """List human-readable statements about what the question provides."""
    givens: List[str] = []

    if topic == "percent":
        percents = _PERCENT_LITERAL.findall(text)
        if percents:
            givens.append(f"A percentage is given: {', '.join(percents[:3])}")
        if _OF_WORD.search(text):
            givens.append("The wording uses a part-of-whole relationship")
    elif topic == "ratio":
        ratios = _RATIO_NOTATION.findall(text)
        if ratios:
            givens.append(f"A ratio is given: {', '.join(ratios[:3])}")
        if _RATIO_WORDS.search(text):
            givens.append("The question involves a comparison between quantities")
    elif topic == "algebra":
        if "=" in text:
            givens.append("An equation is given")
        names = sorted(set(_extract_variables(text)))
        if names:
            givens.append(
                f"A variable appears in the equation: {', '.join(names[:3])}"
            )
    elif topic in _GIVENS_BY_TOPIC:
        givens.append(_GIVENS_BY_TOPIC[topic])

    numbers = _extract_numbers(text)
    if numbers:
        givens.append(f"Numbers mentioned: {', '.join(numbers[:5])}")
    return givens


def _extract_constraints(text: str) -> List[str]:
    """Return the constraint labels whose phrase occurs as whole words in ``text``."""
    return [label for label, pattern in _CONSTRAINT_PATTERNS if pattern.search(text)]


def _extract_relationships(text: str, topic: Optional[str]) -> List[str]:
    """Return topic-specific notes on how the quantities relate."""
    lower = text.lower()

    if topic == "percent":
        notes = ["This is a part-percent-whole relationship"]
        if _OF_WORD.search(lower):
            notes.append(
                "A quantity is being described as a percent of another quantity"
            )
        return notes
    if topic == "ratio":
        notes = ["This compares quantities in a fixed proportion"]
        if "total" in lower:
            notes.append("You may need to connect the ratio parts to a total")
        return notes
    if topic == "algebra":
        return [
            "The equal sign means both sides represent the same value",
            "You need to isolate the variable while keeping the equation balanced",
        ]
    if topic == "probability":
        return ["Probability compares favorable outcomes to total possible outcomes"]
    if topic == "statistics":
        return ["You need to match the question to the correct summary measure"]
    if topic == "geometry":
        return ["The quantities are linked by properties of the figure"]
    if topic == "number_theory":
        return ["The solution depends on a number rule such as divisibility or factors"]
    return []


def _needed_concepts(topic: Optional[str]) -> List[str]:
    """Return a fresh list of concept names for ``topic`` (empty if unknown)."""
    return list(_CONCEPTS_BY_TOPIC.get(topic, ()))


def _trap_notes(topic: Optional[str], text: str) -> List[str]:
    """Return common pitfalls for ``topic``, plus a note if "except" appears."""
    traps = list(_TRAPS_BY_TOPIC.get(topic, ()))
    if "except" in text.lower():
        traps.append("Watch for exception wording")
    return traps


def parse_question(text: str) -> ParsedQuestion:
    """Parse ``text`` into a :class:`ParsedQuestion`.

    The text is normalized first; topic detection, the derived descriptions
    and the ``has_*`` flags are all computed from the normalized form.
    ``raw_text`` keeps the argument exactly as given.
    """
    normalized = _normalize_text(text)
    lower = normalized.lower()
    topic = _detect_topic(lower)

    return ParsedQuestion(
        raw_text=text,
        normalized_text=normalized,
        topic=topic,
        asks_for=_detect_asks_for(normalized, topic),
        givens=_extract_givens(normalized, topic),
        constraints=_extract_constraints(normalized),
        relationships=_extract_relationships(normalized, topic),
        needed_concepts=_needed_concepts(topic),
        trap_notes=_trap_notes(topic, normalized),
        numbers=_extract_numbers(normalized),
        variables=_extract_variables(normalized),
        has_percent=_mentions_percent(lower),
        has_ratio=_mentions_ratio(lower),
        has_equation="=" in lower,
        has_probability=bool(_PROBABILITY_WORDS.search(lower)),
        has_geometry=bool(_GEOMETRY_WORDS.search(lower)),
        has_statistics=bool(_STATISTICS_WORDS.search(lower)),
        has_number_properties=bool(_NUMBER_PROPERTY_WORDS.search(lower)),
    )