File size: 15,657 Bytes
4801f8d
 
b45f0ac
4801f8d
 
b45f0ac
4801f8d
 
 
 
b45f0ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4801f8d
b45f0ac
4801f8d
 
b45f0ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4801f8d
b45f0ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4801f8d
 
b45f0ac
 
 
 
 
 
 
4801f8d
b45f0ac
 
 
4801f8d
b45f0ac
 
4801f8d
b45f0ac
 
 
4801f8d
 
 
b45f0ac
 
4801f8d
b45f0ac
 
4801f8d
b45f0ac
 
 
4801f8d
 
 
b45f0ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
from __future__ import annotations

import math
import re
from statistics import pstdev
from typing import Optional, List, Tuple

from models import SolverResult


# -----------------------------
# Basic parsing helpers
# -----------------------------

# Regex fragment for one numeric literal: optional leading minus, one or more
# digits, optional fractional part. A digit is required before the decimal
# point, so a leading-dot form such as ".5" is read as "5".
_NUMBER_RE = r"-?\d+(?:\.\d+)?"

# Substrings that mark a prompt as mentioning standard deviation. They are
# tested with plain `in` (see _mentions_standard_deviation), so the spaces in
# "sd " and " s.d." are part of what must match.
_STD_PHRASES = [
    "standard deviation",
    "std dev",
    "std. dev",
    "stdev",
    "sd ",
    " s.d.",
]

# Substrings that suggest a comparison question. Also tested with plain `in`
# (see _detect_compare_sets), so they match inside longer words as well.
_COMPARE_WORDS = [
    "greater",
    "larger",
    "higher",
    "smaller",
    "lower",
    "less",
    "same",
    "equal",
    "compare",
    "comparison",
]

# Matches a labelled data set such as "A: 1,2,3", "Set A: 1,2,3" or
# "Group A: 1,2,3" (case-insensitive, verbose syntax).
# Capture groups:
#   1, 2, 3 -> the single-letter label, one group per alternative; only the
#              group belonging to the alternative that matched is non-None.
#   4       -> the body text after the label, running up to the next newline,
#              ';' or '|'.
_SET_LABEL_RE = re.compile(
    rf"""
    (?:
        \b([A-Z])\b\s*[:=]\s*                    # A: 1,2,3
        |
        \bset\s+([A-Z])\b\s*[:=]?\s*             # Set A: 1,2,3
        |
        \bgroup\s+([A-Z])\b\s*[:=]?\s*           # Group A: 1,2,3
    )
    ([^\n;|]+)
    """,
    re.IGNORECASE | re.VERBOSE,
)


def _clean(text: str) -> str:
    return re.sub(r"\s+", " ", (text or "").strip().lower())


def _nums(text: str) -> List[float]:
    """Return every numeric literal found in *text*, in order, as floats.

    Literals are recognised with ``_NUMBER_RE``.
    """
    return list(map(float, re.findall(_NUMBER_RE, text)))


def _is_close(a: float, b: float, tol: float = 1e-9) -> bool:
    return abs(a - b) <= tol


def _all_equal(vals: List[float]) -> bool:
    """Return True when *vals* is non-empty and every value is close to the first.

    Closeness is decided by ``_is_close`` with its default tolerance.
    An empty list returns False.
    """
    if not vals:
        return False
    reference = vals[0]
    for value in vals:
        if not _is_close(value, reference):
            return False
    return True


def _mean(vals: List[float]) -> float:
    return sum(vals) / len(vals)


def _spread_score(vals: List[float]) -> float:
    """
    Cheap comparison proxy for spread. For same-length sets,
    pstdev is best, but this helper can still support quick comparisons.
    """
    if not vals:
        return 0.0
    return pstdev(vals)


def _safe_number_text(x: float) -> str:
    """Format *x* compactly for display.

    Values within ``_is_close`` tolerance of an integer are printed as that
    integer; everything else is printed with up to six significant digits.
    """
    nearest = round(x)
    if _is_close(x, nearest):
        return str(int(nearest))
    return f"{x:.6g}"


def _mentions_standard_deviation(lower: str) -> bool:
    """Return True when *lower* contains any phrase listed in ``_STD_PHRASES``."""
    for phrase in _STD_PHRASES:
        if phrase in lower:
            return True
    return False


def _mentions_variability(lower: str) -> bool:
    return any(
        p in lower
        for p in [
            "spread",
            "more spread out",
            "less spread out",
            "dispersion",
            "variability",
            "variation",
        ]
    )


def _extract_labeled_sets(text: str) -> List[Tuple[str, List[float]]]:
    """Extract labelled data sets such as ``A: 1, 2, 3`` or ``Set B: 4, 5``.

    Args:
        text: Raw question text to scan with ``_SET_LABEL_RE``.

    Returns:
        ``(label, values)`` pairs in order of appearance, where ``label`` is
        the upper-cased set letter. Matches whose body contains fewer than
        two numbers are skipped.
    """
    sets: List[Tuple[str, List[float]]] = []
    for m in _SET_LABEL_RE.finditer(text):
        # _SET_LABEL_RE has one label group per alternative (groups 1-3) and
        # keeps the body in group 4; only the alternative that matched has a
        # non-None label group.
        label = next((g for g in m.group(1, 2, 3) if g), "").upper()
        body = m.group(4)
        nums = _nums(body)
        if len(nums) >= 2:
            sets.append((label, nums))
    return sets


def _extract_braced_sets(text: str) -> List[List[float]]:
    """Extract number lists enclosed in ``{...}``, ``(...)`` or ``[...]``.

    Only non-nested bracket groups are recognised. Groups containing fewer
    than two numbers are ignored. Results keep their order of appearance.
    """
    bracket_pattern = r"\{([^{}]+)\}|\(([^()]+)\)|\[([^\[\]]+)\]"
    found: List[List[float]] = []
    for match in re.finditer(bracket_pattern, text):
        # Exactly one of the three groups is populated per match.
        inner = next((part for part in match.groups() if part), "")
        values = _nums(inner)
        if len(values) >= 2:
            found.append(values)
    return found


def _describe_shift_rule() -> List[str]:
    return [
        "Adding or subtracting the same constant shifts every value equally.",
        "That changes the center, but not the spread.",
        "So the standard deviation stays unchanged.",
    ]


def _describe_scale_rule(factor: float) -> List[str]:
    """Return the explanation steps for the multiply/divide-by-a-constant rule.

    *factor* is rendered with ``_safe_number_text`` inside absolute-value bars.
    """
    shown = _safe_number_text(factor)
    steps = [
        "Multiplying or dividing every value rescales every distance from the mean by the same factor.",
    ]
    steps.append(f"So the standard deviation is multiplied by |{shown}|.")
    steps.append("The key idea is that spread scales with the absolute value of the multiplier.")
    return steps


def _build_result(
    *,
    solved: bool,
    internal_answer: Optional[str],
    steps: List[str],
    answer_value: Optional[str] = None,
) -> SolverResult:
    """Build a ``SolverResult`` for the standard-deviation topic.

    All arguments are keyword-only. ``domain`` is fixed to ``"quant"`` and
    ``topic`` to ``"standard_deviation"``. When *answer_value* is ``None``
    the placeholder ``"computed internally"`` is used.
    """
    # Keep answer_value intentionally non-revealing for direct numeric solves.
    shown_answer = "computed internally" if answer_value is None else answer_value
    return SolverResult(
        domain="quant",
        solved=solved,
        topic="standard_deviation",
        answer_value=shown_answer,
        internal_answer=internal_answer,
        steps=steps,
    )


# -----------------------------
# Pattern detectors
# -----------------------------

def _detect_add_sub_constant(lower: str) -> bool:
    return any(
        p in lower
        for p in [
            "add the same",
            "added the same",
            "increased by the same",
            "decreased by the same",
            "plus a constant",
            "minus a constant",
            "subtract the same",
            "subtracted the same",
            "add 5 to every",
            "subtract 5 from every",
            "each value is increased by",
            "each value is decreased by",
            "every value is increased by",
            "every value is decreased by",
        ]
    )


def _detect_scaling(lower: str) -> Optional[float]:
    """Detect wording that multiplies or divides every value by a constant.

    Args:
        lower: Question text, already lower-cased and whitespace-normalized.

    Returns:
        The multiplicative factor applied to the data, or ``None`` when no
        scaling wording is found:

        * "multiplied by N", "scaled by N", "times N" -> ``N``
        * "each/every value (is) divided by N" -> ``1 / N``
          (``None`` when N is zero, since that is not a valid rescaling)
        * "increase(d) by P percent" / "P%" -> ``1 + P/100``
        * "decrease(d) by P percent" / "P%" -> ``1 - P/100``
    """
    # This single pattern also covers "each/every value is multiplied by N".
    multiplied = re.search(
        r"(?:multiplied by|scaled by|times)\s*(" + _NUMBER_RE + r")", lower
    )
    if multiplied:
        return float(multiplied.group(1))

    divided = re.search(
        r"(?:each|every)\s+value\s+(?:is\s+)?divided\s+by\s*(" + _NUMBER_RE + r")",
        lower,
    )
    if divided:
        divisor = float(divided.group(1))
        if _is_close(divisor, 0.0):
            # Division by zero has no meaningful scale factor.
            return None
        return 1.0 / divisor

    # Percent scaling language; accepts verb inflections and the "%" sign.
    percent = re.search(
        r"(increas|decreas)\w*\s+by\s+(\d+(?:\.\d+)?)\s*(?:percent|%)", lower
    )
    if percent:
        fraction = float(percent.group(2)) / 100.0
        if percent.group(1) == "increas":
            return 1.0 + fraction
        return 1.0 - fraction

    return None


def _detect_zero_sd_prompt(lower: str) -> bool:
    return any(
        p in lower
        for p in [
            "standard deviation is 0",
            "std dev is 0",
            "zero standard deviation",
            "when is the standard deviation zero",
        ]
    )


def _detect_outlier_prompt(lower: str) -> bool:
    return "outlier" in lower or "extreme value" in lower


def _detect_same_mean_diff_spread(lower: str) -> bool:
    return (
        ("same mean" in lower or "equal mean" in lower)
        and any(p in lower for p in ["more spread", "less spread", "farther from the mean", "closer to the mean"])
    )


def _detect_compare_sets(lower: str) -> bool:
    """Detect comparison wording that also refers to a set, group or list.

    Requires at least one entry of ``_COMPARE_WORDS`` and at least one of the
    collection nouns to appear in *lower* (substring matching).
    """
    if not any(word in lower for word in _COMPARE_WORDS):
        return False
    return any(noun in lower for noun in ("set", "group", "list", "data set"))


# -----------------------------
# Solver blocks
# -----------------------------

def _solve_conceptual_constant_shift(lower: str) -> Optional[SolverResult]:
    """Answer "add/subtract a constant" questions.

    Returns a solved result stating the standard deviation is unchanged when
    ``_detect_add_sub_constant`` recognises the wording, otherwise ``None``.
    """
    if _detect_add_sub_constant(lower):
        return _build_result(
            solved=True,
            answer_value="unchanged",
            internal_answer="unchanged",
            steps=_describe_shift_rule(),
        )
    return None


def _solve_conceptual_scaling(lower: str) -> Optional[SolverResult]:
    """Answer "multiply/divide every value" questions.

    Returns a solved result describing the factor reported by
    ``_detect_scaling``, or ``None`` when no scaling wording is detected.
    """
    factor = _detect_scaling(lower)
    if factor is None:
        return None

    verdict = f"scaled by |{_safe_number_text(factor)}|"
    return _build_result(
        solved=True,
        answer_value=verdict,
        internal_answer=verdict,
        steps=_describe_scale_rule(factor),
    )


def _solve_zero_standard_deviation(lower: str, nums: List[float]) -> Optional[SolverResult]:
    """Handle questions whose answer hinges on a zero standard deviation.

    Args:
        lower: Question text, already lower-cased and whitespace-normalized.
        nums: Every number extracted from the question text.

    Returns:
        * A "zero" result when the text lists at least two numbers and they
          are all equal.
        * An "all values equal" result when the text asks when the standard
          deviation is zero (see ``_detect_zero_sd_prompt``).
        * ``None`` otherwise.
    """
    # Require at least two values: a single stray number in the prompt is not
    # a data set, and treating it as "all values identical" would shadow the
    # solver blocks that run after this one.
    if len(nums) >= 2 and _all_equal(nums):
        return _build_result(
            solved=True,
            answer_value="zero",
            internal_answer="0",
            steps=[
                "All values are identical, so every value is exactly at the mean.",
                "That means every deviation from the mean is 0.",
                "So the standard deviation is 0.",
            ],
        )

    if _detect_zero_sd_prompt(lower):
        return _build_result(
            solved=True,
            answer_value="all values equal",
            internal_answer="standard deviation is zero exactly when all values are equal",
            steps=[
                "Standard deviation measures how far values are from the mean.",
                "It is zero only when every value has zero distance from the mean.",
                "That happens exactly when all values are the same.",
            ],
        )

    return None


def _solve_outlier_concept(lower: str) -> Optional[SolverResult]:
    """Answer conceptual questions about the effect of an outlier.

    Returns a solved "typically increases" result when
    ``_detect_outlier_prompt`` fires, otherwise ``None``.
    """
    if _detect_outlier_prompt(lower):
        explanation = [
            "Standard deviation increases when values lie farther from the mean.",
            "An outlier is an unusually distant value, so it usually increases spread.",
            "So introducing a more extreme outlier typically increases the standard deviation.",
        ]
        return _build_result(
            solved=True,
            answer_value="typically increases",
            internal_answer="adding or making an outlier more extreme typically increases standard deviation",
            steps=explanation,
        )
    return None


def _solve_labeled_set_comparison(text: str, lower: str) -> Optional[SolverResult]:
    """Compare the spread of two or more labelled sets found in *text*.

    Args:
        text: Raw question text (used for set extraction).
        lower: Normalized question text (used for keyword checks).

    Returns:
        ``None`` when fewer than two labelled sets are found or the prompt
        has no comparison / standard-deviation / variability wording.
        Otherwise a solved result: ``"equal"`` when the smallest and largest
        spread scores are close, else the label of the set with the smallest
        spread (if the prompt contains "smaller", "lower" or "less") or the
        largest spread.
    """
    labeled = _extract_labeled_sets(text)
    if len(labeled) < 2:
        return None

    relevant = (
        _detect_compare_sets(lower)
        or _mentions_standard_deviation(lower)
        or _mentions_variability(lower)
    )
    if not relevant:
        return None

    # Stable sort by spread score: ties keep their order of appearance.
    ranked = sorted(
        ((name, values, _spread_score(values)) for name, values in labeled),
        key=lambda entry: entry[2],
    )
    tightest, widest = ranked[0], ranked[-1]

    if _is_close(tightest[2], widest[2]):
        return _build_result(
            solved=True,
            answer_value="equal",
            internal_answer="equal standard deviation",
            steps=[
                "Compare how far each set’s values lie from its own mean.",
                "After measuring the spreads, the sets have equal spread.",
                "So their standard deviations are equal.",
            ],
        )

    asks_for_smaller = any(w in lower for w in ["smaller", "lower", "less"])
    winner = (tightest if asks_for_smaller else widest)[0]
    return _build_result(
        solved=True,
        answer_value=winner,
        internal_answer=winner,
        steps=[
            "For comparison questions, focus on spread rather than just the mean.",
            "The set whose values sit farther from its mean has the larger standard deviation.",
            f"Internal comparison identifies set {winner} as the correct choice.",
        ],
    )


def _solve_braced_set_comparison(text: str, lower: str) -> Optional[SolverResult]:
    """Compare the spread of exactly two bracketed sets found in *text*.

    Args:
        text: Raw question text (used for set extraction).
        lower: Normalized question text (used for keyword checks).

    Returns:
        ``None`` unless exactly two bracketed sets are found and the prompt
        contains comparison wording or the word "which". Otherwise a solved
        result whose answer is ``"equal"``, ``"first set"`` or
        ``"second set"``; the smaller spread is chosen when the prompt
        contains "smaller", "lower" or "less", else the larger spread.
    """
    braced = _extract_braced_sets(text)
    if len(braced) != 2:
        return None
    if not _detect_compare_sets(lower) and "which" not in lower:
        return None

    first_spread, second_spread = (_spread_score(values) for values in braced)

    if _is_close(first_spread, second_spread):
        answer = "equal"
        internal = "equal standard deviation"
    else:
        asks_for_smaller = any(w in lower for w in ["smaller", "lower", "less"])
        if asks_for_smaller:
            first_wins = first_spread < second_spread
        else:
            first_wins = first_spread > second_spread
        answer = "first set" if first_wins else "second set"
        internal = answer

    explanation = [
        "Compare distance from each set’s mean, not just the raw values.",
        "The more spread-out set has the larger standard deviation.",
        "The choice above is determined internally from that spread comparison.",
    ]
    return _build_result(
        solved=True,
        answer_value=answer,
        internal_answer=internal,
        steps=explanation,
    )


def _solve_same_mean_spread_concept(lower: str) -> Optional[SolverResult]:
    """Answer conceptual "same mean, different spread" questions.

    Returns a solved result naming the more spread-out set when
    ``_detect_same_mean_diff_spread`` fires, otherwise ``None``.
    """
    if _detect_same_mean_diff_spread(lower):
        explanation = [
            "If two sets have the same mean, standard deviation depends on how far values sit from that mean.",
            "Values farther from the mean create larger deviations.",
            "So the more spread-out set has the larger standard deviation.",
        ]
        return _build_result(
            solved=True,
            answer_value="the more spread-out set",
            internal_answer="with same mean, the more spread-out set has larger standard deviation",
            steps=explanation,
        )
    return None


def _solve_symmetric_spacing_concept(text: str, lower: str) -> Optional[SolverResult]:
    """Give a conceptual answer for symmetric / equally spaced sets.

    Lightweight handling for classic GMAT patterns such as
    {m-d, m, m+d} vs {m-2d, m, m+2d}. Fires only when *lower* contains
    "equally spaced", "symmetric" or "centered at" and *text* contains at
    least three numbers; otherwise returns ``None``.
    """
    trigger_phrases = ("equally spaced", "symmetric", "centered at")
    if not any(phrase in lower for phrase in trigger_phrases):
        return None

    if len(_nums(text)) < 3:
        return None

    explanation = [
        "For symmetric sets, the mean is the center point.",
        "Standard deviation is driven by how far the outer values are from that center.",
        "So if one set has larger equal spacing from the center, it has the larger standard deviation.",
    ]
    return _build_result(
        solved=True,
        answer_value="greater spacing means greater SD",
        internal_answer="for symmetric equally spaced sets, larger common distance from center means larger SD",
        steps=explanation,
    )


def _solve_direct_numeric(nums: List[float], lower: str) -> Optional[SolverResult]:
    """Compute the population standard deviation of the numbers in the prompt.

    Args:
        nums: Every number extracted from the question text.
        lower: Normalized question text.

    Returns:
        ``None`` when fewer than two numbers are available or the prompt is
        a shift/scaling transformation question. Otherwise a solved result
        whose ``internal_answer`` holds the formatted ``pstdev`` value while
        ``answer_value`` stays the non-revealing "computed internally".
    """
    if len(nums) < 2:
        return None

    # Avoid hijacking transformation questions that happen to include numbers.
    is_transformation = _detect_add_sub_constant(lower) or _detect_scaling(lower) is not None
    if is_transformation:
        return None

    deviation_text = _safe_number_text(pstdev(nums))
    explanation = [
        "Find the mean of the data set.",
        "Measure each value’s distance from the mean and square those distances.",
        "Average those squared deviations, then take the square root.",
        "The exact numeric standard deviation has been computed internally.",
    ]
    return _build_result(
        solved=True,
        answer_value="computed internally",
        internal_answer=deviation_text,
        steps=explanation,
    )


# -----------------------------
# Public solver
# -----------------------------

def solve_standard_deviation(text: str) -> Optional[SolverResult]:
    """Try to solve a standard-deviation question given as free text.

    Args:
        text: The raw question text.

    Returns:
        ``None`` when the text does not mention standard deviation,
        variability, variance or outliers. Otherwise the first non-``None``
        result from the solver blocks, tried in this order: constant shift,
        scaling, zero / all-equal, outlier, labelled-set comparison,
        bracketed-set comparison, same-mean spread, symmetric spacing,
        direct numeric computation. If none applies, an unsolved fallback
        result with general guidance is returned.
    """
    lower = _clean(text)

    on_topic = (
        _mentions_standard_deviation(lower)
        or _mentions_variability(lower)
        or "variance" in lower
        or "outlier" in lower
    )
    if not on_topic:
        return None

    nums = _nums(text)

    # Each entry is evaluated lazily, in priority order; the first hit wins.
    attempts = (
        # 1. Core conceptual transformations
        lambda: _solve_conceptual_constant_shift(lower),
        lambda: _solve_conceptual_scaling(lower),
        # 2. Zero / all-equal concept
        lambda: _solve_zero_standard_deviation(lower, nums),
        # 3. Outlier concept
        lambda: _solve_outlier_concept(lower),
        # 4. Comparison-style questions
        lambda: _solve_labeled_set_comparison(text, lower),
        lambda: _solve_braced_set_comparison(text, lower),
        lambda: _solve_same_mean_spread_concept(lower),
        lambda: _solve_symmetric_spacing_concept(text, lower),
        # 5. Exact numeric computation from a visible list
        lambda: _solve_direct_numeric(nums, lower),
    )
    for attempt in attempts:
        outcome = attempt()
        if outcome is not None:
            return outcome

    # 6. Fallback conceptual explanation
    fallback_steps = [
        "This looks like a standard deviation question, so focus on spread around the mean.",
        "Check whether the task is about a transformation, a comparison of spreads, or an exact computation.",
        "If you want exact solving coverage for a missed pattern, add a dedicated parsing block for that wording.",
    ]
    return _build_result(
        solved=False,
        answer_value="not fully resolved",
        internal_answer=None,
        steps=fallback_steps,
    )