Spaces:

j-js
/

GameAI

Sleeping

App Files Files Community

GameAI / question_parser.py

j-js

Create question_parser.py

0e143c5 verified 21 days ago

raw

history blame contribute delete

10.3 kB

	from __future__ import annotations

	import re
	from dataclasses import dataclass, field
	from typing import List, Optional


	@dataclass
	class ParsedQuestion:
	    """Container for everything the parser records about one question.

	    ``raw_text`` and ``normalized_text`` are required. Every other field has
	    a default: ``None`` for the two labels, a fresh empty list for each list
	    field, and ``False`` for each ``has_*`` flag.
	    """

	    # Required text fields.
	    raw_text: str
	    normalized_text: str

	    # Optional labels.
	    topic: Optional[str] = None
	    asks_for: Optional[str] = None

	    # Human-readable notes. default_factory gives every instance its own list.
	    givens: List[str] = field(default_factory=list)
	    constraints: List[str] = field(default_factory=list)
	    relationships: List[str] = field(default_factory=list)
	    needed_concepts: List[str] = field(default_factory=list)
	    trap_notes: List[str] = field(default_factory=list)

	    # Tokens pulled out of the text.
	    numbers: List[str] = field(default_factory=list)
	    variables: List[str] = field(default_factory=list)

	    # Boolean feature flags, all off by default.
	    has_percent: bool = False
	    has_ratio: bool = False
	    has_equation: bool = False
	    has_probability: bool = False
	    has_geometry: bool = False
	    has_statistics: bool = False
	    has_number_properties: bool = False


	def _normalize_text(text: str) -> str:
	text = (text or "").strip()
	text = text.replace("’", "'").replace("“", '"').replace("”", '"')
	text = re.sub(r"\s+", " ", text)
	return text


	def _extract_numbers(text: str) -> List[str]:
	return re.findall(r"\b\d+(?:\.\d+)?%?\b", text)


	def _extract_variables(text: str) -> List[str]:
	vars_found = re.findall(r"\b[a-z]\b", text.lower())
	common_noise = {"a", "i"}
	return [v for v in vars_found if v not in common_noise]


	def _detect_topic(t: str) -> Optional[str]:
	if "%" in t or "percent" in t or "percentage" in t:
	return "percent"

	if "ratio" in t or "proportion" in t or re.search(r"\b\d+\s:\s\d+\b", t):
	return "ratio"

	if any(word in t for word in ["probability", "chance", "odds", "randomly"]):
	return "probability"

	if any(word in t for word in ["mean", "median", "average", "mode", "standard deviation", "range"]):
	return "statistics"

	if any(word in t for word in ["triangle", "circle", "angle", "area", "perimeter", "radius", "diameter"]):
	return "geometry"

	if any(word in t for word in ["remainder", "divisible", "factor", "multiple", "prime", "integer", "even", "odd"]):
	return "number_theory"

	if "=" in t or re.search(r"\bsolve for\b", t) or re.search(r"\bwhat is [a-z]\b", t):
	return "algebra"

	return None


	def _detect_asks_for(t: str, topic: Optional[str]) -> Optional[str]:
	lower = t.lower()

	m = re.search(r"\bwhat is the value of ([a-z])\b", lower)
	if m:
	return f"the value of {m.group(1)}"

	m = re.search(r"\bsolve for ([a-z])\b", lower)
	if m:
	return f"the value of {m.group(1)}"

	if topic == "percent":
	if "original" in lower or "whole" in lower:
	return "the original whole value"
	if "percent" in lower and "of" in lower:
	return "the missing value in a percent relationship"
	return "the unknown quantity in the percent relationship"

	if topic == "ratio":
	return "the missing part or total in the ratio relationship"

	if topic == "probability":
	return "the probability of the event"

	if topic == "statistics":
	return "the requested statistic"

	if topic == "geometry":
	return "the missing geometric quantity"

	if topic == "number_theory":
	return "the required number property or unknown integer"

	if topic == "algebra":
	return "the value of the variable"

	return "the target quantity asked for in the question"


	def _extract_givens(text: str, topic: Optional[str]) -> List[str]:
	    """Build human-readable notes about what the question states up front.

	    Topic-specific notes come first. A final "Numbers mentioned" note is
	    appended whenever ``_extract_numbers`` finds anything, whatever the topic.
	    Lists inside a note are truncated: at most three percentages, ratios or
	    variables, and at most five numbers.
	    """
	    givens: List[str] = []

	    if topic == "percent":
	        percents = re.findall(r"\b\d+(?:\.\d+)?%", text)
	        if percents:
	            givens.append(f"A percentage is given: {', '.join(percents[:3])}")
	        if "of" in text.lower():
	            givens.append("The wording uses a part-of-whole relationship")

	    if topic == "ratio":
	        # Whitespace around the colon is optional so that "3:4" is picked up
	        # as well as "3 : 4".
	        ratio_match = re.findall(r"\b\d+\s*:\s*\d+\b", text)
	        if ratio_match:
	            givens.append(f"A ratio is given: {', '.join(ratio_match[:3])}")
	        if "ratio" in text.lower() or "proportion" in text.lower():
	            givens.append("The question involves a comparison between quantities")

	    if topic == "algebra":
	        if "=" in text:
	            givens.append("An equation is given")
	        vars_found = _extract_variables(text)
	        if vars_found:
	            # De-duplicate and sort so the note is stable across runs.
	            givens.append(f"A variable appears in the equation: {', '.join(sorted(set(vars_found))[:3])}")

	    if topic == "probability":
	        givens.append("The question describes an event and a total set of possible outcomes")

	    if topic == "statistics":
	        givens.append("The question provides values or a distribution to summarize")

	    if topic == "geometry":
	        givens.append("The question gives shape-based information")

	    if topic == "number_theory":
	        givens.append("The question gives number-property information")

	    numbers = _extract_numbers(text)
	    if numbers:
	        givens.append(f"Numbers mentioned: {', '.join(numbers[:5])}")

	    return givens


	def _extract_constraints(text: str) -> List[str]:
	constraints: List[str] = []
	lower = text.lower()

	phrases = [
	"integer",
	"positive",
	"negative",
	"nonnegative",
	"distinct",
	"consecutive",
	"even",
	"odd",
	"at least",
	"at most",
	"greater than",
	"less than",
	"multiple choice",
	]

	for p in phrases:
	if p in lower:
	constraints.append(p)

	return constraints


	def _extract_relationships(text: str, topic: Optional[str]) -> List[str]:
	rel: List[str] = []
	lower = text.lower()

	if topic == "percent":
	rel.append("This is a part-percent-whole relationship")
	if "of" in lower:
	rel.append("A quantity is being described as a percent of another quantity")

	elif topic == "ratio":
	rel.append("This compares quantities in a fixed proportion")
	if "total" in lower:
	rel.append("You may need to connect the ratio parts to a total")

	elif topic == "algebra":
	rel.append("The equal sign means both sides represent the same value")
	rel.append("You need to isolate the variable while keeping the equation balanced")

	elif topic == "probability":
	rel.append("Probability compares favorable outcomes to total possible outcomes")

	elif topic == "statistics":
	rel.append("You need to match the question to the correct summary measure")

	elif topic == "geometry":
	rel.append("The quantities are linked by properties of the figure")

	elif topic == "number_theory":
	rel.append("The solution depends on a number rule such as divisibility or factors")

	return rel


	def _needed_concepts(topic: Optional[str]) -> List[str]:
	if topic == "percent":
	return ["percent equation", "part-whole thinking"]
	if topic == "ratio":
	return ["ratio structure", "part-to-part or part-to-whole setup"]
	if topic == "algebra":
	return ["equation balancing", "inverse operations"]
	if topic == "probability":
	return ["favorable outcomes", "total outcomes"]
	if topic == "statistics":
	return ["identify the correct statistic", "use the relevant values only"]
	if topic == "geometry":
	return ["figure properties", "spatial relationships"]
	if topic == "number_theory":
	return ["divisibility/factor rules", "integer properties"]
	return []


	def _trap_notes(topic: Optional[str], text: str) -> List[str]:
	traps: List[str] = []
	lower = text.lower()

	if topic == "percent":
	traps.append("Do not confuse the part with the whole")
	traps.append("Check whether you are solving forward or backward")

	elif topic == "ratio":
	traps.append("Do not add or compare ratio parts inconsistently")
	traps.append("Check whether the ratio is part-to-part or part-to-whole")

	elif topic == "algebra":
	traps.append("Do not perform an operation on only one side of the equation")
	traps.append("Watch for distribution or sign mistakes")

	elif topic == "probability":
	traps.append("Do not forget the total number of possible outcomes")
	traps.append("Check whether order matters")

	elif topic == "statistics":
	traps.append("Do not use the wrong measure")
	traps.append("Check whether outliers matter")

	elif topic == "geometry":
	traps.append("Do not assume a figure is drawn to scale unless stated")
	traps.append("Use only relationships actually given")

	elif topic == "number_theory":
	traps.append("Check the exact divisibility or remainder condition")
	traps.append("Do not assume every integer behaves the same way")

	if "except" in lower:
	traps.append("Watch for exception wording")

	return traps


	def parse_question(text: str) -> ParsedQuestion:
	    """Parse a question into a ``ParsedQuestion``.

	    The text is normalized once. The topic and the ``has_*`` flags are
	    computed from the lower-cased normalized text; the other helpers receive
	    the normalized text with its original casing. ``raw_text`` stores
	    ``text`` exactly as passed in.

	    NOTE: the ``has_*`` flags use plain substring checks, so a word that
	    merely contains a keyword (for example "seven" contains "even") sets
	    the corresponding flag.
	    """
	    normalized = _normalize_text(text)
	    lower = normalized.lower()

	    topic = _detect_topic(lower)

	    return ParsedQuestion(
	        raw_text=text,
	        normalized_text=normalized,
	        topic=topic,
	        asks_for=_detect_asks_for(normalized, topic),
	        givens=_extract_givens(normalized, topic),
	        constraints=_extract_constraints(normalized),
	        relationships=_extract_relationships(normalized, topic),
	        needed_concepts=_needed_concepts(topic),
	        trap_notes=_trap_notes(topic, normalized),
	        numbers=_extract_numbers(normalized),
	        variables=_extract_variables(normalized),
	        has_percent=("%" in lower or "percent" in lower or "percentage" in lower),
	        # Whitespace around the colon is optional so that "3:4" sets the flag
	        # as well as "3 : 4".
	        has_ratio=("ratio" in lower or "proportion" in lower or bool(re.search(r"\b\d+\s*:\s*\d+\b", lower))),
	        has_equation=("=" in lower),
	        has_probability=any(w in lower for w in ["probability", "chance", "odds", "randomly"]),
	        has_geometry=any(w in lower for w in ["triangle", "circle", "angle", "area", "perimeter", "radius", "diameter"]),
	        has_statistics=any(w in lower for w in ["mean", "median", "average", "mode", "standard deviation", "range"]),
	        has_number_properties=any(w in lower for w in ["remainder", "divisible", "factor", "multiple", "prime", "integer", "even", "odd"]),
	    )