Spaces:

j-js
/

GameAI

Running

App Files Files Community

j-js committed 30 days ago

Commit

0e143c5

verified ·

1 Parent(s): 6830b8b

Create question_parser.py

Browse files

Files changed (1) hide show

question_parser.py +299 -0

question_parser.py ADDED Viewed

	@@ -0,0 +1,299 @@

+from __future__ import annotations
+import re
+from dataclasses import dataclass, field
+from typing import List, Optional
@dataclass
class ParsedQuestion:
    """Structured result of analysing one question.

    Only ``raw_text`` and ``normalized_text`` are required. Every other field
    has a default, and each list field gets its own fresh list per instance.
    """

    # Question text as supplied by the caller, and after normalization.
    raw_text: str
    normalized_text: str

    # Detected topic label and a short description of what is being asked.
    topic: Optional[str] = None
    asks_for: Optional[str] = None

    # Human-readable notes gathered about the question.
    givens: List[str] = field(default_factory=list)
    constraints: List[str] = field(default_factory=list)
    relationships: List[str] = field(default_factory=list)
    needed_concepts: List[str] = field(default_factory=list)
    trap_notes: List[str] = field(default_factory=list)

    # Raw tokens pulled from the text.
    numbers: List[str] = field(default_factory=list)
    variables: List[str] = field(default_factory=list)

    # Independent keyword flags; several may be true for the same question.
    has_percent: bool = False
    has_ratio: bool = False
    has_equation: bool = False
    has_probability: bool = False
    has_geometry: bool = False
    has_statistics: bool = False
    has_number_properties: bool = False
+def _normalize_text(text: str) -> str:
+    text = (text or "").strip()
+    text = text.replace("’", "'").replace("“", '"').replace("”", '"')
+    text = re.sub(r"\s+", " ", text)
+    return text
+def _extract_numbers(text: str) -> List[str]:
+    return re.findall(r"\b\d+(?:\.\d+)?%?\b", text)
+def _extract_variables(text: str) -> List[str]:
+    vars_found = re.findall(r"\b[a-z]\b", text.lower())
+    common_noise = {"a", "i"}
+    return [v for v in vars_found if v not in common_noise]
+def _detect_topic(t: str) -> Optional[str]:
+    if "%" in t or "percent" in t or "percentage" in t:
+        return "percent"
+    if "ratio" in t or "proportion" in t or re.search(r"\b\d+\s*:\s*\d+\b", t):
+        return "ratio"
+    if any(word in t for word in ["probability", "chance", "odds", "randomly"]):
+        return "probability"
+    if any(word in t for word in ["mean", "median", "average", "mode", "standard deviation", "range"]):
+        return "statistics"
+    if any(word in t for word in ["triangle", "circle", "angle", "area", "perimeter", "radius", "diameter"]):
+        return "geometry"
+    if any(word in t for word in ["remainder", "divisible", "factor", "multiple", "prime", "integer", "even", "odd"]):
+        return "number_theory"
+    if "=" in t or re.search(r"\bsolve for\b", t) or re.search(r"\bwhat is [a-z]\b", t):
+        return "algebra"
+    return None
+def _detect_asks_for(t: str, topic: Optional[str]) -> Optional[str]:
+    lower = t.lower()
+    m = re.search(r"\bwhat is the value of ([a-z])\b", lower)
+    if m:
+        return f"the value of {m.group(1)}"
+    m = re.search(r"\bsolve for ([a-z])\b", lower)
+    if m:
+        return f"the value of {m.group(1)}"
+    if topic == "percent":
+        if "original" in lower or "whole" in lower:
+            return "the original whole value"
+        if "percent" in lower and "of" in lower:
+            return "the missing value in a percent relationship"
+        return "the unknown quantity in the percent relationship"
+    if topic == "ratio":
+        return "the missing part or total in the ratio relationship"
+    if topic == "probability":
+        return "the probability of the event"
+    if topic == "statistics":
+        return "the requested statistic"
+    if topic == "geometry":
+        return "the missing geometric quantity"
+    if topic == "number_theory":
+        return "the required number property or unknown integer"
+    if topic == "algebra":
+        return "the value of the variable"
+    return "the target quantity asked for in the question"
def _extract_givens(text: str, topic: Optional[str]) -> List[str]:
    """List plain-language statements about what the question provides.

    Topic-specific statements come first. A final "Numbers mentioned" entry
    (at most five numbers) is appended whenever the text contains any number.

    Args:
        text: Question text.
        topic: Topic label, or ``None`` when no topic was detected.

    Returns:
        The statements in the order they were generated; possibly empty.
    """
    givens: List[str] = []
    lower = text.lower()

    if topic == "percent":
        percents = re.findall(r"\b\d+(?:\.\d+)?%", text)
        if percents:
            givens.append(f"A percentage is given: {', '.join(percents[:3])}")
        # Whole-word test so that "profit" or "20% off" does not count as "of".
        if re.search(r"\bof\b", lower):
            givens.append("The wording uses a part-of-whole relationship")
    elif topic == "ratio":
        ratio_match = re.findall(r"\b\d+\s*:\s*\d+\b", text)
        if ratio_match:
            givens.append(f"A ratio is given: {', '.join(ratio_match[:3])}")
        if "ratio" in lower or "proportion" in lower:
            givens.append("The question involves a comparison between quantities")
    elif topic == "algebra":
        if "=" in text:
            givens.append("An equation is given")
        vars_found = _extract_variables(text)
        if vars_found:
            givens.append(f"A variable appears in the equation: {', '.join(sorted(set(vars_found))[:3])}")
    else:
        # Remaining topics contribute one fixed statement each.
        fixed_statement = {
            "probability": "The question describes an event and a total set of possible outcomes",
            "statistics": "The question provides values or a distribution to summarize",
            "geometry": "The question gives shape-based information",
            "number_theory": "The question gives number-property information",
        }.get(topic)
        if fixed_statement is not None:
            givens.append(fixed_statement)

    numbers = _extract_numbers(text)
    if numbers:
        givens.append(f"Numbers mentioned: {', '.join(numbers[:5])}")
    return givens
+def _extract_constraints(text: str) -> List[str]:
+    constraints: List[str] = []
+    lower = text.lower()
+    phrases = [
+        "integer",
+        "positive",
+        "negative",
+        "nonnegative",
+        "distinct",
+        "consecutive",
+        "even",
+        "odd",
+        "at least",
+        "at most",
+        "greater than",
+        "less than",
+        "multiple choice",
+    ]
+    for p in phrases:
+        if p in lower:
+            constraints.append(p)
+    return constraints
+def _extract_relationships(text: str, topic: Optional[str]) -> List[str]:
+    rel: List[str] = []
+    lower = text.lower()
+    if topic == "percent":
+        rel.append("This is a part-percent-whole relationship")
+        if "of" in lower:
+            rel.append("A quantity is being described as a percent of another quantity")
+    elif topic == "ratio":
+        rel.append("This compares quantities in a fixed proportion")
+        if "total" in lower:
+            rel.append("You may need to connect the ratio parts to a total")
+    elif topic == "algebra":
+        rel.append("The equal sign means both sides represent the same value")
+        rel.append("You need to isolate the variable while keeping the equation balanced")
+    elif topic == "probability":
+        rel.append("Probability compares favorable outcomes to total possible outcomes")
+    elif topic == "statistics":
+        rel.append("You need to match the question to the correct summary measure")
+    elif topic == "geometry":
+        rel.append("The quantities are linked by properties of the figure")
+    elif topic == "number_theory":
+        rel.append("The solution depends on a number rule such as divisibility or factors")
+    return rel
+def _needed_concepts(topic: Optional[str]) -> List[str]:
+    if topic == "percent":
+        return ["percent equation", "part-whole thinking"]
+    if topic == "ratio":
+        return ["ratio structure", "part-to-part or part-to-whole setup"]
+    if topic == "algebra":
+        return ["equation balancing", "inverse operations"]
+    if topic == "probability":
+        return ["favorable outcomes", "total outcomes"]
+    if topic == "statistics":
+        return ["identify the correct statistic", "use the relevant values only"]
+    if topic == "geometry":
+        return ["figure properties", "spatial relationships"]
+    if topic == "number_theory":
+        return ["divisibility/factor rules", "integer properties"]
+    return []
+def _trap_notes(topic: Optional[str], text: str) -> List[str]:
+    traps: List[str] = []
+    lower = text.lower()
+    if topic == "percent":
+        traps.append("Do not confuse the part with the whole")
+        traps.append("Check whether you are solving forward or backward")
+    elif topic == "ratio":
+        traps.append("Do not add or compare ratio parts inconsistently")
+        traps.append("Check whether the ratio is part-to-part or part-to-whole")
+    elif topic == "algebra":
+        traps.append("Do not perform an operation on only one side of the equation")
+        traps.append("Watch for distribution or sign mistakes")
+    elif topic == "probability":
+        traps.append("Do not forget the total number of possible outcomes")
+        traps.append("Check whether order matters")
+    elif topic == "statistics":
+        traps.append("Do not use the wrong measure")
+        traps.append("Check whether outliers matter")
+    elif topic == "geometry":
+        traps.append("Do not assume a figure is drawn to scale unless stated")
+        traps.append("Use only relationships actually given")
+    elif topic == "number_theory":
+        traps.append("Check the exact divisibility or remainder condition")
+        traps.append("Do not assume every integer behaves the same way")
+    if "except" in lower:
+        traps.append("Watch for exception wording")
+    return traps
def parse_question(text: str) -> ParsedQuestion:
    """Analyse *text* and return a populated :class:`ParsedQuestion`.

    The text is normalized once. The topic is detected from the lowercased
    normalized text, and the remaining fields are derived from the
    normalized text and that topic.

    The ``has_*`` flags are independent of the single ``topic`` label, so
    several may be true at once. Short keywords that often occur inside
    unrelated words ("ratio", "mean", "mode", "range", "angle", "area",
    "even", "odd") must appear as whole words, so "seven" does not set
    ``has_number_properties`` and "operation" does not set ``has_ratio``.
    Longer keywords still match as substrings.
    """
    normalized = _normalize_text(text)
    lower = normalized.lower()
    topic = _detect_topic(lower)

    def mentions(pattern: str) -> bool:
        """Report whether *pattern* occurs anywhere in the lowercased text."""
        return re.search(pattern, lower) is not None

    return ParsedQuestion(
        raw_text=text,
        normalized_text=normalized,
        topic=topic,
        asks_for=_detect_asks_for(normalized, topic),
        givens=_extract_givens(normalized, topic),
        constraints=_extract_constraints(normalized),
        relationships=_extract_relationships(normalized, topic),
        needed_concepts=_needed_concepts(topic),
        trap_notes=_trap_notes(topic, normalized),
        numbers=_extract_numbers(normalized),
        variables=_extract_variables(normalized),
        has_percent="%" in lower or "percent" in lower,
        has_ratio=mentions(r"\bratios?\b|proportion|\b\d+\s*:\s*\d+\b"),
        has_equation="=" in lower,
        has_probability=mentions(r"probability|chance|odds|randomly"),
        # "rectangle" is listed explicitly: it used to match only through the
        # substring "angle".
        has_geometry=mentions(
            r"\b(?:angles?|areas?)\b"
            r"|triangle|rectangle|circle|perimeter|radius|diameter"
        ),
        has_statistics=mentions(
            r"\b(?:means?|modes?|ranges?)\b|median|average|standard deviation"
        ),
        has_number_properties=mentions(
            r"\b(?:even|odd)\b|remainder|divisible|factor|multiple|prime|integer"
        ),
    )