| from __future__ import annotations |
|
|
| import math |
| import re |
| from statistics import pstdev |
| from typing import Optional, List, Tuple |
|
|
| from models import SolverResult |
|
|
|
|
| |
| |
| |
|
|
| _NUMBER_RE = r"-?\d+(?:\.\d+)?" |
|
|
| _STD_PHRASES = [ |
| "standard deviation", |
| "std dev", |
| "std. dev", |
| "stdev", |
| "sd ", |
| " s.d.", |
| ] |
|
|
| _COMPARE_WORDS = [ |
| "greater", |
| "larger", |
| "higher", |
| "smaller", |
| "lower", |
| "less", |
| "same", |
| "equal", |
| "compare", |
| "comparison", |
| ] |
|
|
| _SET_LABEL_RE = re.compile( |
| rf""" |
| (?: |
| \b([A-Z])\b\s*[:=]\s* # A: 1,2,3 |
| | |
| \bset\s+([A-Z])\b\s*[:=]?\s* # Set A: 1,2,3 |
| | |
| \bgroup\s+([A-Z])\b\s*[:=]?\s* # Group A: 1,2,3 |
| ) |
| ([^\n;|]+) |
| """, |
| re.IGNORECASE | re.VERBOSE, |
| ) |
|
|
|
|
| def _clean(text: str) -> str: |
| return re.sub(r"\s+", " ", (text or "").strip().lower()) |
|
|
|
|
def _nums(text: str) -> List[float]:
    """Return every number found in *text*, in order of appearance, as floats.

    Numbers are recognised with the module-level ``_NUMBER_RE`` pattern.
    """
    tokens = re.findall(_NUMBER_RE, text)
    return list(map(float, tokens))
|
|
|
|
| def _is_close(a: float, b: float, tol: float = 1e-9) -> bool: |
| return abs(a - b) <= tol |
|
|
|
|
def _all_equal(vals: List[float]) -> bool:
    """Return True when *vals* is non-empty and every value is close to the first.

    Closeness is decided by ``_is_close`` with its default tolerance.
    An empty list yields False.
    """
    if not vals:
        return False
    anchor = vals[0]
    for candidate in vals:
        if not _is_close(candidate, anchor):
            return False
    return True
|
|
|
|
| def _mean(vals: List[float]) -> float: |
| return sum(vals) / len(vals) |
|
|
|
|
| def _spread_score(vals: List[float]) -> float: |
| """ |
| Cheap comparison proxy for spread. For same-length sets, |
| pstdev is best, but this helper can still support quick comparisons. |
| """ |
| if not vals: |
| return 0.0 |
| return pstdev(vals) |
|
|
|
|
def _safe_number_text(x: float) -> str:
    """Format *x* for display.

    Values within ``_is_close`` tolerance of a whole number are rendered
    as that integer (no decimal point); anything else is rendered with up
    to six significant digits.
    """
    nearest = round(x)
    if _is_close(x, nearest):
        return str(int(nearest))
    return f"{x:.6g}"
|
|
|
|
def _mentions_standard_deviation(lower: str) -> bool:
    """Return True when *lower* contains any phrase listed in ``_STD_PHRASES``.

    *lower* is expected to be already lower-cased; matching is by plain
    substring.
    """
    for phrase in _STD_PHRASES:
        if phrase in lower:
            return True
    return False
|
|
|
|
| def _mentions_variability(lower: str) -> bool: |
| return any( |
| p in lower |
| for p in [ |
| "spread", |
| "more spread out", |
| "less spread out", |
| "dispersion", |
| "variability", |
| "variation", |
| ] |
| ) |
|
|
|
|
def _extract_labeled_sets(text: str) -> List[Tuple[str, List[float]]]:
    """Return ``(label, numbers)`` for every labelled data set found in *text*.

    A labelled set is anything ``_SET_LABEL_RE`` matches: "A: 1,2,3",
    "Set A: 1,2,3" or "Group A: 1,2,3". Labels are upper-cased. Matches
    whose body contains fewer than two numbers are skipped.
    """
    sets: List[Tuple[str, List[float]]] = []
    for m in _SET_LABEL_RE.finditer(text):
        # Groups 1-3 are the three alternative label captures (bare letter,
        # "set X", "group X"); only one is populated per match. Group 4 is
        # the body text that carries the numbers.
        label = (m.group(1) or m.group(2) or m.group(3) or "").upper()
        body = m.group(4) or ""
        nums = _nums(body)
        if len(nums) >= 2:
            sets.append((label, nums))
    return sets
|
|
|
|
def _extract_braced_sets(text: str) -> List[List[float]]:
    """Return the number lists found inside ``{...}``, ``(...)`` or ``[...]``.

    Only non-nested bracket pairs are considered, and a bracketed span is
    kept only when it contains at least two numbers. Results are in order
    of appearance.
    """
    bracket_re = r"\{([^{}]+)\}|\(([^()]+)\)|\[([^\[\]]+)\]"
    found: List[List[float]] = []
    for match in re.finditer(bracket_re, text):
        # Exactly one of the three alternatives captured something.
        inner = next((piece for piece in match.groups() if piece), "")
        values = _nums(inner)
        if len(values) >= 2:
            found.append(values)
    return found
|
|
|
|
| def _describe_shift_rule() -> List[str]: |
| return [ |
| "Adding or subtracting the same constant shifts every value equally.", |
| "That changes the center, but not the spread.", |
| "So the standard deviation stays unchanged.", |
| ] |
|
|
|
|
def _describe_scale_rule(factor: float) -> List[str]:
    """Return the explanation steps for scaling every value by *factor*.

    The factor is rendered with ``_safe_number_text`` and shown between
    vertical bars to indicate its absolute value.
    """
    shown = _safe_number_text(factor)
    opening = "Multiplying or dividing every value rescales every distance from the mean by the same factor."
    consequence = f"So the standard deviation is multiplied by |{shown}|."
    takeaway = "The key idea is that spread scales with the absolute value of the multiplier."
    return [opening, consequence, takeaway]
|
|
|
|
def _build_result(
    *,
    solved: bool,
    internal_answer: Optional[str],
    steps: List[str],
    answer_value: Optional[str] = None,
) -> SolverResult:
    """Build the ``SolverResult`` returned by every solver in this module.

    All arguments are keyword-only. ``domain`` is always "quant" and
    ``topic`` is always "standard_deviation". When *answer_value* is
    None the placeholder "computed internally" is used instead.
    """
    shown_answer = "computed internally" if answer_value is None else answer_value
    return SolverResult(
        domain="quant",
        solved=solved,
        topic="standard_deviation",
        answer_value=shown_answer,
        internal_answer=internal_answer,
        steps=steps,
    )
|
|
|
|
| |
| |
| |
|
|
| def _detect_add_sub_constant(lower: str) -> bool: |
| return any( |
| p in lower |
| for p in [ |
| "add the same", |
| "added the same", |
| "increased by the same", |
| "decreased by the same", |
| "plus a constant", |
| "minus a constant", |
| "subtract the same", |
| "subtracted the same", |
| "add 5 to every", |
| "subtract 5 from every", |
| "each value is increased by", |
| "each value is decreased by", |
| "every value is increased by", |
| "every value is decreased by", |
| ] |
| ) |
|
|
|
|
def _detect_scaling(lower: str) -> Optional[float]:
    """Return the factor by which *lower* says every value is rescaled.

    *lower* is expected to be lower-cased text. Checked in this order:

    1. "multiplied by N", "scaled by N" or "times N" -> ``N``.
    2. "each/every value (is) divided by N" -> ``1 / N``. A zero divisor
       defines no factor, so it is ignored rather than reported as 0.
    3. "increase(s/d) by P percent" / "decrease(s/d) by P%" ->
       ``1 + P/100`` or ``1 - P/100``.

    Returns None when no scaling wording is found.
    """
    number = "(" + _NUMBER_RE + ")"

    # Multiplicative wording: the captured number is the factor itself.
    m = re.search(r"(?:multiplied\s+by|scaled\s+by|times)\s*" + number, lower)
    if m:
        return float(m.group(1))

    # Division wording: the factor is the reciprocal of the divisor.
    m = re.search(
        r"(?:each|every)\s+value\s+(?:is\s+)?divided\s+by\s*" + number, lower
    )
    if m:
        divisor = float(m.group(1))
        if not _is_close(divisor, 0.0):
            return 1.0 / divisor
        # Dividing by zero is undefined; fall through instead of
        # returning the divisor as if it were a scaling factor.

    # Percent change: accept the bare verb and its -s / -d forms, and
    # either the word "percent" or the "%" sign.
    m = re.search(
        r"(increase|decrease)[sd]?\s+by\s+(\d+(?:\.\d+)?)\s*(?:percent|%)", lower
    )
    if m:
        pct = float(m.group(2)) / 100.0
        return 1.0 + pct if m.group(1) == "increase" else 1.0 - pct

    return None
|
|
|
|
| def _detect_zero_sd_prompt(lower: str) -> bool: |
| return any( |
| p in lower |
| for p in [ |
| "standard deviation is 0", |
| "std dev is 0", |
| "zero standard deviation", |
| "when is the standard deviation zero", |
| ] |
| ) |
|
|
|
|
| def _detect_outlier_prompt(lower: str) -> bool: |
| return "outlier" in lower or "extreme value" in lower |
|
|
|
|
| def _detect_same_mean_diff_spread(lower: str) -> bool: |
| return ( |
| ("same mean" in lower or "equal mean" in lower) |
| and any(p in lower for p in ["more spread", "less spread", "farther from the mean", "closer to the mean"]) |
| ) |
|
|
|
|
def _detect_compare_sets(lower: str) -> bool:
    """Return True when *lower* reads like a comparison between data sets.

    Requires at least one word from ``_COMPARE_WORDS`` and at least one
    collection noun ("set", "group", "list", "data set"), both matched as
    plain substrings.
    """
    has_compare_word = any(word in lower for word in _COMPARE_WORDS)
    if not has_compare_word:
        return False
    collection_nouns = ("set", "group", "list", "data set")
    return any(noun in lower for noun in collection_nouns)
|
|
|
|
| |
| |
| |
|
|
def _solve_conceptual_constant_shift(lower: str) -> Optional[SolverResult]:
    """Answer "add/subtract a constant" prompts with "unchanged".

    Returns None when ``_detect_add_sub_constant`` does not recognise the
    wording.
    """
    if _detect_add_sub_constant(lower):
        verdict = "unchanged"
        return _build_result(
            solved=True,
            answer_value=verdict,
            internal_answer=verdict,
            steps=_describe_shift_rule(),
        )
    return None
|
|
|
|
def _solve_conceptual_scaling(lower: str) -> Optional[SolverResult]:
    """Answer "multiply/divide every value" prompts with the scaling factor.

    Returns None when ``_detect_scaling`` finds no factor. Otherwise both
    the displayed and the internal answer read "scaled by |<factor>|".
    """
    factor = _detect_scaling(lower)
    if factor is not None:
        verdict = f"scaled by |{_safe_number_text(factor)}|"
        return _build_result(
            solved=True,
            answer_value=verdict,
            internal_answer=verdict,
            steps=_describe_scale_rule(factor),
        )
    return None
|
|
|
|
def _solve_zero_standard_deviation(lower: str, nums: List[float]) -> Optional[SolverResult]:
    """Handle the two "standard deviation is zero" situations.

    1. *nums* holds at least two values and they are all equal: the
       answer is zero.
    2. The prompt itself asks about a zero standard deviation
       (``_detect_zero_sd_prompt``): the answer is "all values equal".

    Returns None when neither applies.
    """
    # Require at least two numbers, as _solve_direct_numeric does: a lone
    # number in the prompt (e.g. "the variance is 16") is not a data set
    # of identical values.
    if len(nums) >= 2 and _all_equal(nums):
        return _build_result(
            solved=True,
            answer_value="zero",
            internal_answer="0",
            steps=[
                "All values are identical, so every value is exactly at the mean.",
                "That means every deviation from the mean is 0.",
                "So the standard deviation is 0.",
            ],
        )

    if _detect_zero_sd_prompt(lower):
        return _build_result(
            solved=True,
            answer_value="all values equal",
            internal_answer="standard deviation is zero exactly when all values are equal",
            steps=[
                "Standard deviation measures how far values are from the mean.",
                "It is zero only when every value has zero distance from the mean.",
                "That happens exactly when all values are the same.",
            ],
        )

    return None
|
|
|
|
def _solve_outlier_concept(lower: str) -> Optional[SolverResult]:
    """Answer outlier prompts with the rule that outliers typically raise SD.

    Returns None when ``_detect_outlier_prompt`` does not fire.
    """
    if _detect_outlier_prompt(lower):
        explanation = [
            "Standard deviation increases when values lie farther from the mean.",
            "An outlier is an unusually distant value, so it usually increases spread.",
            "So introducing a more extreme outlier typically increases the standard deviation.",
        ]
        return _build_result(
            solved=True,
            answer_value="typically increases",
            internal_answer="adding or making an outlier more extreme typically increases standard deviation",
            steps=explanation,
        )
    return None
|
|
|
|
def _solve_labeled_set_comparison(text: str, lower: str) -> Optional[SolverResult]:
    """Compare the spreads of two or more labelled sets found in *text*.

    *text* is the raw prompt (used for extraction); *lower* is its
    normalised form (used for keyword checks). Returns None unless at
    least two labelled sets are found and the prompt mentions a
    comparison, standard deviation, or variability.

    The answer is "equal" when the smallest and largest spreads match.
    Otherwise it is the label of the set with the smallest spread when
    the prompt asks for the smaller/smallest, lower/lowest or less/least
    one, and the label of the set with the largest spread in every other
    case.
    """
    sets = _extract_labeled_sets(text)

    if len(sets) < 2:
        return None
    if not (
        _detect_compare_sets(lower)
        or _mentions_standard_deviation(lower)
        or _mentions_variability(lower)
    ):
        return None

    scored = [(label, vals, _spread_score(vals)) for label, vals in sets]
    # Stable sort by spread: first entry is the tightest set, last the widest.
    scored_sorted = sorted(scored, key=lambda t: t[2])

    smallest = scored_sorted[0]
    largest = scored_sorted[-1]

    if _is_close(smallest[2], largest[2]):
        answer = "equal"
        internal = "equal standard deviation"
        steps = [
            "Compare how far each set’s values lie from its own mean.",
            "After measuring the spreads, the sets have equal spread.",
            "So their standard deviations are equal.",
        ]
    else:
        # Comparative and superlative forms both ask for the tightest
        # set; the superlatives are not substrings of the comparatives,
        # so each has to be listed.
        small_words = ("smaller", "smallest", "lower", "lowest", "less", "least")
        wants_small = any(w in lower for w in small_words)
        chosen = smallest if wants_small else largest
        answer = chosen[0]
        internal = chosen[0]
        steps = [
            "For comparison questions, focus on spread rather than just the mean.",
            "The set whose values sit farther from its mean has the larger standard deviation.",
            f"Internal comparison identifies set {chosen[0]} as the correct choice.",
        ]

    return _build_result(
        solved=True,
        answer_value=answer,
        internal_answer=internal,
        steps=steps,
    )
|
|
|
|
def _solve_braced_set_comparison(text: str, lower: str) -> Optional[SolverResult]:
    """Compare the spreads of exactly two bracketed sets found in *text*.

    *text* is the raw prompt (used for extraction); *lower* is its
    normalised form (used for keyword checks). Returns None unless
    exactly two bracketed number lists are found and the prompt either
    reads like a comparison or contains "which".

    The answer is "equal", "first set" or "second set". The tighter set
    is chosen when the prompt asks for the smaller/smallest, lower/lowest
    or less/least one; otherwise the wider set is chosen.
    """
    sets = _extract_braced_sets(text)
    if len(sets) != 2:
        return None
    if not (_detect_compare_sets(lower) or "which" in lower):
        return None

    s1 = _spread_score(sets[0])
    s2 = _spread_score(sets[1])

    if _is_close(s1, s2):
        answer = "equal"
        internal = "equal standard deviation"
    else:
        # Comparative and superlative forms both ask for the tighter set;
        # the superlatives are not substrings of the comparatives, so
        # each has to be listed.
        small_words = ("smaller", "smallest", "lower", "lowest", "less", "least")
        wants_small = any(w in lower for w in small_words)
        first_is_tighter = s1 < s2
        # Pick the first set when its spread ordering matches the request.
        answer = "first set" if first_is_tighter == wants_small else "second set"
        internal = answer

    return _build_result(
        solved=True,
        answer_value=answer,
        internal_answer=internal,
        steps=[
            "Compare distance from each set’s mean, not just the raw values.",
            "The more spread-out set has the larger standard deviation.",
            "The choice above is determined internally from that spread comparison.",
        ],
    )
|
|
|
|
def _solve_same_mean_spread_concept(lower: str) -> Optional[SolverResult]:
    """Answer "same mean, different spread" prompts conceptually.

    Returns None when ``_detect_same_mean_diff_spread`` does not fire.
    """
    if _detect_same_mean_diff_spread(lower):
        reasoning = [
            "If two sets have the same mean, standard deviation depends on how far values sit from that mean.",
            "Values farther from the mean create larger deviations.",
            "So the more spread-out set has the larger standard deviation.",
        ]
        return _build_result(
            solved=True,
            answer_value="the more spread-out set",
            internal_answer="with same mean, the more spread-out set has larger standard deviation",
            steps=reasoning,
        )
    return None
|
|
|
|
def _solve_symmetric_spacing_concept(text: str, lower: str) -> Optional[SolverResult]:
    """Answer prompts about symmetric or equally spaced sets conceptually.

    Fires only when *lower* contains "equally spaced", "symmetric" or
    "centered at" and *text* contains at least three numbers; otherwise
    returns None.
    """
    triggers = ("equally spaced", "symmetric", "centered at")
    if not any(trigger in lower for trigger in triggers):
        return None

    values = _nums(text)
    if len(values) < 3:
        return None

    reasoning = [
        "For symmetric sets, the mean is the center point.",
        "Standard deviation is driven by how far the outer values are from that center.",
        "So if one set has larger equal spacing from the center, it has the larger standard deviation.",
    ]
    return _build_result(
        solved=True,
        answer_value="greater spacing means greater SD",
        internal_answer="for symmetric equally spaced sets, larger common distance from center means larger SD",
        steps=reasoning,
    )
|
|
|
|
def _solve_direct_numeric(nums: List[float], lower: str) -> Optional[SolverResult]:
    """Compute the population standard deviation of *nums* directly.

    Returns None when fewer than two numbers are available, or when the
    prompt describes a shift or a scaling (in which case the numbers are
    not treated as a data set). The displayed answer is the placeholder
    "computed internally"; the numeric value goes in the internal answer.
    """
    if len(nums) < 2:
        return None

    # Transformation prompts are not plain "compute the SD" questions.
    is_shift = _detect_add_sub_constant(lower)
    if is_shift or _detect_scaling(lower) is not None:
        return None

    deviation_text = _safe_number_text(pstdev(nums))
    walkthrough = [
        "Find the mean of the data set.",
        "Measure each value’s distance from the mean and square those distances.",
        "Average those squared deviations, then take the square root.",
        "The exact numeric standard deviation has been computed internally.",
    ]
    return _build_result(
        solved=True,
        answer_value="computed internally",
        internal_answer=deviation_text,
        steps=walkthrough,
    )
|
|
|
|
| |
| |
| |
|
|
def solve_standard_deviation(text: str) -> Optional[SolverResult]:
    """Try to answer a standard-deviation question given as free text.

    Returns None when the normalised text mentions none of: a standard
    deviation phrase, variability vocabulary, "variance", or "outlier".
    Otherwise the specialised solvers are tried in a fixed order and the
    first non-None result is returned. When none of them applies, an
    unsolved result with generic guidance steps is returned.
    """
    lower = _clean(text)

    on_topic = (
        _mentions_standard_deviation(lower)
        or _mentions_variability(lower)
        or "variance" in lower
        or "outlier" in lower
    )
    if not on_topic:
        return None

    nums = _nums(text)

    # Order matters: conceptual transformation rules first, then zero-SD
    # and outlier concepts, then set comparisons, and direct numeric
    # computation last. Each entry is evaluated lazily so that later
    # solvers only run when every earlier one declined.
    attempts = (
        lambda: _solve_conceptual_constant_shift(lower),
        lambda: _solve_conceptual_scaling(lower),
        lambda: _solve_zero_standard_deviation(lower, nums),
        lambda: _solve_outlier_concept(lower),
        lambda: _solve_labeled_set_comparison(text, lower),
        lambda: _solve_braced_set_comparison(text, lower),
        lambda: _solve_same_mean_spread_concept(lower),
        lambda: _solve_symmetric_spacing_concept(text, lower),
        lambda: _solve_direct_numeric(nums, lower),
    )
    for attempt in attempts:
        outcome = attempt()
        if outcome is not None:
            return outcome

    # Nothing matched: report the question as recognised but unsolved.
    return _build_result(
        solved=False,
        answer_value="not fully resolved",
        internal_answer=None,
        steps=[
            "This looks like a standard deviation question, so focus on spread around the mean.",
            "Check whether the task is about a transformation, a comparison of spreads, or an exact computation.",
            "If you want exact solving coverage for a missed pattern, add a dedicated parsing block for that wording.",
        ],
    )