GameAI / question_parser.py
j-js's picture
Create question_parser.py
0e143c5 verified
from __future__ import annotations
import re
from dataclasses import dataclass, field
from typing import List, Optional
@dataclass
class ParsedQuestion:
    """Structured summary of one question.

    Only ``raw_text`` and ``normalized_text`` are required. Every other
    field has a default: ``None`` for the optional strings, a fresh empty
    list for each list field, and ``False`` for each flag.
    """

    # The question text as supplied and after normalization.
    raw_text: str
    normalized_text: str

    # Detected topic label and a description of what is being asked for.
    topic: Optional[str] = None
    asks_for: Optional[str] = None

    # Descriptive notes about the question. ``default_factory`` gives every
    # instance its own list, so appending to one never affects another.
    givens: List[str] = field(default_factory=list)
    constraints: List[str] = field(default_factory=list)
    relationships: List[str] = field(default_factory=list)
    needed_concepts: List[str] = field(default_factory=list)
    trap_notes: List[str] = field(default_factory=list)

    # Tokens pulled out of the text.
    numbers: List[str] = field(default_factory=list)
    variables: List[str] = field(default_factory=list)

    # Per-topic keyword flags.
    has_percent: bool = False
    has_ratio: bool = False
    has_equation: bool = False
    has_probability: bool = False
    has_geometry: bool = False
    has_statistics: bool = False
    has_number_properties: bool = False
def _normalize_text(text: str) -> str:
text = (text or "").strip()
text = text.replace("’", "'").replace("“", '"').replace("”", '"')
text = re.sub(r"\s+", " ", text)
return text
def _extract_numbers(text: str) -> List[str]:
return re.findall(r"\b\d+(?:\.\d+)?%?\b", text)
def _extract_variables(text: str) -> List[str]:
vars_found = re.findall(r"\b[a-z]\b", text.lower())
common_noise = {"a", "i"}
return [v for v in vars_found if v not in common_noise]
def _detect_topic(t: str) -> Optional[str]:
if "%" in t or "percent" in t or "percentage" in t:
return "percent"
if "ratio" in t or "proportion" in t or re.search(r"\b\d+\s*:\s*\d+\b", t):
return "ratio"
if any(word in t for word in ["probability", "chance", "odds", "randomly"]):
return "probability"
if any(word in t for word in ["mean", "median", "average", "mode", "standard deviation", "range"]):
return "statistics"
if any(word in t for word in ["triangle", "circle", "angle", "area", "perimeter", "radius", "diameter"]):
return "geometry"
if any(word in t for word in ["remainder", "divisible", "factor", "multiple", "prime", "integer", "even", "odd"]):
return "number_theory"
if "=" in t or re.search(r"\bsolve for\b", t) or re.search(r"\bwhat is [a-z]\b", t):
return "algebra"
return None
def _detect_asks_for(t: str, topic: Optional[str]) -> Optional[str]:
lower = t.lower()
m = re.search(r"\bwhat is the value of ([a-z])\b", lower)
if m:
return f"the value of {m.group(1)}"
m = re.search(r"\bsolve for ([a-z])\b", lower)
if m:
return f"the value of {m.group(1)}"
if topic == "percent":
if "original" in lower or "whole" in lower:
return "the original whole value"
if "percent" in lower and "of" in lower:
return "the missing value in a percent relationship"
return "the unknown quantity in the percent relationship"
if topic == "ratio":
return "the missing part or total in the ratio relationship"
if topic == "probability":
return "the probability of the event"
if topic == "statistics":
return "the requested statistic"
if topic == "geometry":
return "the missing geometric quantity"
if topic == "number_theory":
return "the required number property or unknown integer"
if topic == "algebra":
return "the value of the variable"
return "the target quantity asked for in the question"
def _extract_givens(text: str, topic: Optional[str]) -> List[str]:
    """List human-readable notes about what the question provides.

    Topic-specific notes come first; a final note lists up to five numbers
    found in the text, whatever the topic.

    Args:
        text: The question text.
        topic: The topic label for the question, or ``None``.

    Returns:
        The notes in the order they were produced; possibly empty.
    """
    lower = text.lower()
    givens: List[str] = []

    if topic == "percent":
        percents = re.findall(r"\b\d+(?:\.\d+)?%", text)
        if percents:
            givens.append(f"A percentage is given: {', '.join(percents[:3])}")
        # "of" has to be a whole word; a substring test would also fire on
        # words such as "profit", "off" or "coffee".
        if re.search(r"\bof\b", lower):
            givens.append("The wording uses a part-of-whole relationship")
    elif topic == "ratio":
        ratios = re.findall(r"\b\d+\s*:\s*\d+\b", text)
        if ratios:
            givens.append(f"A ratio is given: {', '.join(ratios[:3])}")
        if "ratio" in lower or "proportion" in lower:
            givens.append("The question involves a comparison between quantities")
    elif topic == "algebra":
        if "=" in text:
            givens.append("An equation is given")
        variables = _extract_variables(text)
        if variables:
            # De-duplicate and sort so the note is stable; show at most three.
            shown = sorted(set(variables))[:3]
            givens.append(f"A variable appears in the equation: {', '.join(shown)}")
    else:
        # The remaining topics each contribute one fixed note.
        fixed_notes = {
            "probability": "The question describes an event and a total set of possible outcomes",
            "statistics": "The question provides values or a distribution to summarize",
            "geometry": "The question gives shape-based information",
            "number_theory": "The question gives number-property information",
        }
        note = fixed_notes.get(topic)
        if note is not None:
            givens.append(note)

    numbers = _extract_numbers(text)
    if numbers:
        givens.append(f"Numbers mentioned: {', '.join(numbers[:5])}")
    return givens
def _extract_constraints(text: str) -> List[str]:
constraints: List[str] = []
lower = text.lower()
phrases = [
"integer",
"positive",
"negative",
"nonnegative",
"distinct",
"consecutive",
"even",
"odd",
"at least",
"at most",
"greater than",
"less than",
"multiple choice",
]
for p in phrases:
if p in lower:
constraints.append(p)
return constraints
def _extract_relationships(text: str, topic: Optional[str]) -> List[str]:
rel: List[str] = []
lower = text.lower()
if topic == "percent":
rel.append("This is a part-percent-whole relationship")
if "of" in lower:
rel.append("A quantity is being described as a percent of another quantity")
elif topic == "ratio":
rel.append("This compares quantities in a fixed proportion")
if "total" in lower:
rel.append("You may need to connect the ratio parts to a total")
elif topic == "algebra":
rel.append("The equal sign means both sides represent the same value")
rel.append("You need to isolate the variable while keeping the equation balanced")
elif topic == "probability":
rel.append("Probability compares favorable outcomes to total possible outcomes")
elif topic == "statistics":
rel.append("You need to match the question to the correct summary measure")
elif topic == "geometry":
rel.append("The quantities are linked by properties of the figure")
elif topic == "number_theory":
rel.append("The solution depends on a number rule such as divisibility or factors")
return rel
def _needed_concepts(topic: Optional[str]) -> List[str]:
if topic == "percent":
return ["percent equation", "part-whole thinking"]
if topic == "ratio":
return ["ratio structure", "part-to-part or part-to-whole setup"]
if topic == "algebra":
return ["equation balancing", "inverse operations"]
if topic == "probability":
return ["favorable outcomes", "total outcomes"]
if topic == "statistics":
return ["identify the correct statistic", "use the relevant values only"]
if topic == "geometry":
return ["figure properties", "spatial relationships"]
if topic == "number_theory":
return ["divisibility/factor rules", "integer properties"]
return []
def _trap_notes(topic: Optional[str], text: str) -> List[str]:
traps: List[str] = []
lower = text.lower()
if topic == "percent":
traps.append("Do not confuse the part with the whole")
traps.append("Check whether you are solving forward or backward")
elif topic == "ratio":
traps.append("Do not add or compare ratio parts inconsistently")
traps.append("Check whether the ratio is part-to-part or part-to-whole")
elif topic == "algebra":
traps.append("Do not perform an operation on only one side of the equation")
traps.append("Watch for distribution or sign mistakes")
elif topic == "probability":
traps.append("Do not forget the total number of possible outcomes")
traps.append("Check whether order matters")
elif topic == "statistics":
traps.append("Do not use the wrong measure")
traps.append("Check whether outliers matter")
elif topic == "geometry":
traps.append("Do not assume a figure is drawn to scale unless stated")
traps.append("Use only relationships actually given")
elif topic == "number_theory":
traps.append("Check the exact divisibility or remainder condition")
traps.append("Do not assume every integer behaves the same way")
if "except" in lower:
traps.append("Watch for exception wording")
return traps
def parse_question(text: str) -> ParsedQuestion:
    """Parse a question string into a ``ParsedQuestion``.

    The text is normalized, a topic is detected from the lower-cased form,
    and the remaining fields are filled in by the helper functions of this
    module.

    Args:
        text: The raw question text; stored unchanged as ``raw_text``.

    Returns:
        A populated ``ParsedQuestion``.
    """
    normalized = _normalize_text(text)
    lowered = normalized.lower()
    topic = _detect_topic(lowered)

    def mentions(*keywords: str) -> bool:
        """Return True when any keyword occurs in the lower-cased text."""
        return any(keyword in lowered for keyword in keywords)

    # NOTE(review): these flags are plain substring tests, so a word such as
    # "seven" also sets has_number_properties (it contains "even").
    keyword_flags = {
        "has_percent": mentions("%", "percent", "percentage"),
        "has_ratio": (
            mentions("ratio", "proportion")
            or bool(re.search(r"\b\d+\s*:\s*\d+\b", lowered))
        ),
        "has_equation": "=" in lowered,
        "has_probability": mentions("probability", "chance", "odds", "randomly"),
        "has_geometry": mentions(
            "triangle", "circle", "angle", "area", "perimeter", "radius", "diameter"
        ),
        "has_statistics": mentions(
            "mean", "median", "average", "mode", "standard deviation", "range"
        ),
        "has_number_properties": mentions(
            "remainder", "divisible", "factor", "multiple", "prime", "integer", "even", "odd"
        ),
    }

    return ParsedQuestion(
        raw_text=text,
        normalized_text=normalized,
        topic=topic,
        asks_for=_detect_asks_for(normalized, topic),
        givens=_extract_givens(normalized, topic),
        constraints=_extract_constraints(normalized),
        relationships=_extract_relationships(normalized, topic),
        needed_concepts=_needed_concepts(topic),
        trap_notes=_trap_notes(topic, normalized),
        numbers=_extract_numbers(normalized),
        variables=_extract_variables(normalized),
        **keyword_flags,
    )