Spaces:

j-js
/

GameAI

Sleeping

File size: 8,336 Bytes

import json
import re
from pathlib import Path

# Question bank read by main(); loaded as JSON and indexed by its "items" key.
INPUT_PATH = Path("data/gmat_questions.json")
# Output file written by main(): one JSON-encoded support record per line.
OUTPUT_PATH = Path("data/question_support_bank.jsonl")


# ----------------------------
# Utilities
# ----------------------------
def extract_numbers(text):
    """Return every unsigned numeric literal found in *text*, in order.

    Integers ("20") and decimals ("3.5") are returned as strings exactly as
    they appear.  A period is treated as a decimal point only when at least
    one digit follows it, so a sentence-ending period ("... is 5.") is not
    glued onto the number.

    Args:
        text: The string to scan.

    Returns:
        A list of matched number strings; empty when *text* has no digits.
    """
    return re.findall(r"\d+(?:\.\d+)?", text)


def detect_topic(q):
    """Classify a question into a coarse topic label using keyword heuristics.

    The checks run in a fixed order and the first one that matches wins:
    percent, ratio, probability, statistics, geometry, algebra, and finally
    "general" when nothing matches.

    Args:
        q: Mapping with a "questionText" string.

    Returns:
        One of "percent", "ratio", "probability", "statistics", "geometry",
        "algebra" or "general".

    Raises:
        KeyError: If *q* has no "questionText" entry.
    """
    text = q["questionText"].lower()

    if "%" in text or "percent" in text:
        return "percent"

    # Match "ratio"/"ratios" as a whole word.  A plain substring test also
    # fires inside unrelated words such as "operation", "corporation",
    # "duration" or "rational", which would misroute those questions.
    if re.search(r"\bratios?\b", text) or ":" in text:
        return "ratio"

    if "probability" in text or "chance" in text:
        return "probability"

    if any(x in text for x in ["mean", "average", "median", "data", "variance"]):
        return "statistics"

    if any(x in text for x in ["area", "circle", "triangle", "perimeter"]):
        return "geometry"

    # A letter followed (optionally after whitespace) by an operator or "=".
    # NOTE(review): this also matches hyphenated words ("two-digit") and
    # slashes ("km/h"); left unchanged here.
    if re.search(r"[a-z]\s*[\+\-\*/=]", text):
        return "algebra"

    return "general"


# ----------------------------
# Smart Templates
# ----------------------------
def percent_template(q, nums):
    """Build the fixed support payload for percent questions.

    Neither *q* nor *nums* is read; the returned text is the same for every
    call.

    Returns:
        A dict with the keys first_step, hint_1..hint_3, walkthrough_steps,
        method_explanation and common_trap, in that order.
    """
    walkthrough = [
        "Start with an easy base value (like 100).",
        "Apply the first percentage change.",
        "Apply the second change to the new value.",
        "Compare the result with the original.",
    ]
    rationale = [
        "Percent changes are multiplicative, not additive.",
        "Each change affects the updated value.",
        "Using 100 simplifies calculations.",
    ]

    # Keys are inserted in the order they should appear in the output record.
    payload = {}
    payload["first_step"] = "Treat the original value as 100 unless a specific number is easier."
    payload["hint_1"] = "Focus on how the percentage is applied — is it increase, decrease, or part of a whole?"
    payload["hint_2"] = "Convert the percentage into a multiplier (e.g. +20% → ×1.2, -20% → ×0.8)."
    payload["hint_3"] = "Apply each percentage step in order — don’t combine them directly."
    payload["walkthrough_steps"] = walkthrough
    payload["method_explanation"] = rationale
    payload["common_trap"] = "Adding/subtracting percentages directly instead of applying sequential changes."
    return payload


# One character that may belong to an equation side:
#   * any non-letter except sentence punctuation and comparison signs,
#   * a period only when a digit follows it (decimal point, not a full stop),
#   * a single isolated ASCII letter (a variable, e.g. the x in "2x").
# Letters that are part of a longer word are rejected so surrounding prose
# ("If", "what is") is never captured as part of the equation.
_EQUATION_CHAR = (
    r"(?:[^A-Za-z.,;:?=<>!]"
    r"|\.(?=\d)"
    r"|(?<![A-Za-z])[A-Za-z](?![A-Za-z]))"
)
_EQUATION_RE = re.compile(rf"({_EQUATION_CHAR}+)=({_EQUATION_CHAR}+)")


def _extract_equation(text):
    """Return ``(lhs, rhs)`` for the first equation-like span in *text*, or None."""
    for match in _EQUATION_RE.finditer(text):
        lhs = match.group(1).strip()
        rhs = match.group(2).strip()
        # A side made only of whitespace means the "=" sat between words.
        if lhs and rhs:
            return lhs, rhs
    return None


def algebra_template(q, nums):
    """Build the support payload for algebra questions.

    When an equation such as "2x + 3 = 7" can be isolated from the question
    text, it is quoted in the first step and the first walkthrough step.
    Only the equation itself is quoted: surrounding words ("If ...",
    "..., what is x") are excluded, and when the text holds several
    equations the first one is used.  If no equation can be isolated, a
    generic payload is returned instead.

    Args:
        q: Mapping with a "questionText" string.
        nums: Unused; accepted for signature parity with the other templates.

    Returns:
        A dict with the keys first_step, hint_1..hint_3, walkthrough_steps,
        method_explanation and common_trap.

    Raises:
        KeyError: If *q* has no "questionText" entry.
    """
    equation = _extract_equation(q["questionText"])

    if equation is not None:
        lhs, rhs = equation

        return {
            "first_step": f"Start with the equation: {lhs} = {rhs}",
            "hint_1": "Focus on isolating the variable.",
            "hint_2": "Undo addition/subtraction first.",
            "hint_3": "Then undo multiplication/division.",
            "walkthrough_steps": [
                f"Start with: {lhs} = {rhs}",
                "Move constants to one side.",
                "Undo multiplication/division.",
                "Solve for the variable."
            ],
            "method_explanation": [
                "Solve by isolating the variable.",
                "Reverse operations step by step.",
                "Keep both sides balanced."
            ],
            "common_trap": "Forgetting to reverse operations in the correct order."
        }

    # Fallback when no equation could be isolated from the text.
    return {
        "first_step": "Identify the variable and isolate it step by step.",
        "hint_1": "Look at what operations are applied to the variable.",
        "hint_2": "Undo operations in reverse order.",
        "hint_3": "Keep both sides balanced while simplifying.",
        "walkthrough_steps": [
            "Identify the variable.",
            "Move constants to one side.",
            "Undo multiplication/division.",
            "Simplify to isolate the variable."
        ],
        "method_explanation": [
            "Algebra problems require isolating the variable.",
            "Reverse operations systematically."
        ],
        "common_trap": "Forgetting to apply operations to both sides."
    }

def ratio_template(q, nums):
    """Build the fixed support payload for ratio questions.

    Neither *q* nor *nums* is read; the returned text is the same for every
    call.

    Returns:
        A dict with the keys first_step, hint_1..hint_3, walkthrough_steps,
        method_explanation and common_trap, in that order.
    """
    return dict(
        first_step="Break the ratio into total parts.",
        hint_1="Add the ratio parts together.",
        hint_2="Find the value of one part.",
        hint_3="Scale up to get the required quantity.",
        walkthrough_steps=[
            "Write ratio as parts.",
            "Sum the parts.",
            "Divide total by parts.",
            "Multiply by needed portion.",
        ],
        method_explanation=[
            "Ratios represent proportional relationships.",
            "Breaking into equal units simplifies reasoning.",
        ],
        common_trap="Using ratio numbers directly instead of total parts.",
    )


def probability_template(q, nums):
    """Build the fixed support payload for probability questions.

    Neither *q* nor *nums* is read; the returned text is the same for every
    call.

    Returns:
        A dict with the keys first_step, hint_1..hint_3, walkthrough_steps,
        method_explanation and common_trap, in that order.
    """
    opening = "Count total outcomes and favorable outcomes."
    nudges = (
        "How many total possibilities are there?",
        "How many meet the condition?",
        "Probability = favorable / total.",
    )
    procedure = [
        "Count total outcomes.",
        "Count favorable outcomes.",
        "Divide favorable by total.",
    ]
    reasoning = [
        "Probability is a ratio.",
        "Clear counting is essential.",
    ]
    pitfall = "Incorrect counting of outcomes."

    return {
        "first_step": opening,
        "hint_1": nudges[0],
        "hint_2": nudges[1],
        "hint_3": nudges[2],
        "walkthrough_steps": procedure,
        "method_explanation": reasoning,
        "common_trap": pitfall,
    }


def statistics_template(q, nums):
    """Build the fixed support payload for statistics questions.

    Neither *q* nor *nums* is read; the returned text is the same for every
    call.

    Returns:
        A dict with the keys first_step, hint_1..hint_3, walkthrough_steps,
        method_explanation and common_trap, in that order.
    """
    fields = (
        "first_step",
        "hint_1",
        "hint_2",
        "hint_3",
        "walkthrough_steps",
        "method_explanation",
        "common_trap",
    )
    contents = (
        "Identify what measure is being asked (mean, median, etc.).",
        "Write out the numbers clearly.",
        "Apply the correct formula.",
        "Check your calculation.",
        [
            "List values.",
            "Apply formula (mean, median, etc.).",
            "Compute carefully.",
        ],
        [
            "Different measures describe data differently.",
            "Mean = sum / count.",
        ],
        "Using the wrong measure.",
    )
    # Pair each field name with its content, preserving the field order.
    return dict(zip(fields, contents))


def geometry_template(q, nums):
    """Build the fixed support payload for geometry questions.

    Neither *q* nor *nums* is read; the returned text is the same for every
    call.

    Returns:
        A dict with the keys first_step, hint_1..hint_3, walkthrough_steps,
        method_explanation and common_trap, in that order.
    """
    hint_texts = [
        "Recall the relevant formula.",
        "Substitute values carefully.",
        "Solve step by step.",
    ]

    payload = {"first_step": "Identify the shape and formula needed."}
    # Number the hints from 1 so the keys come out as hint_1, hint_2, hint_3.
    for position, hint in enumerate(hint_texts, start=1):
        payload[f"hint_{position}"] = hint
    payload["walkthrough_steps"] = [
        "Identify formula.",
        "Substitute values.",
        "Compute result.",
    ]
    payload["method_explanation"] = [
        "Geometry relies on standard formulas.",
        "Careful substitution avoids mistakes.",
    ]
    payload["common_trap"] = "Using the wrong formula."
    return payload


def general_template(q, nums):
    """Build the fixed catch-all support payload.

    Neither *q* nor *nums* is read; the returned text is the same for every
    call.

    Returns:
        A dict with the keys first_step, hint_1..hint_3, walkthrough_steps,
        method_explanation and common_trap, in that order.
    """
    entries = [
        ("first_step", "Break the question into known and unknown parts."),
        ("hint_1", "What is being asked?"),
        ("hint_2", "What information is given?"),
        ("hint_3", "How can you link them mathematically?"),
        (
            "walkthrough_steps",
            [
                "Understand the problem.",
                "Identify variables.",
                "Set up relationships.",
                "Solve step by step.",
            ],
        ),
        (
            "method_explanation",
            [
                "Translate words into math.",
                "Solve systematically.",
            ],
        ),
        ("common_trap", "Misinterpreting the question."),
    ]
    return dict(entries)


# ----------------------------
# Router
# ----------------------------
def generate_support(q):
    """Assemble the full support record for one question.

    The question is classified with detect_topic(), the matching template
    function is called, and its fields are merged after the question's own
    metadata.  Unrecognised topics use general_template().

    Args:
        q: Mapping with "questionText", "id", "answers" and "correctIndex".

    Returns:
        A dict holding question_id, topic, stem, choices and correct_option,
        followed by the template's fields.

    Raises:
        KeyError: If *q* lacks any of the keys listed above.
    """
    nums = extract_numbers(q["questionText"])
    topic = detect_topic(q)

    # Topic label -> template builder; anything else falls through to general.
    builders = {
        "percent": percent_template,
        "algebra": algebra_template,
        "ratio": ratio_template,
        "probability": probability_template,
        "statistics": statistics_template,
        "geometry": geometry_template,
    }
    build = builders.get(topic, general_template)
    guidance = build(q, nums)

    record = {
        "question_id": q["id"],
        "topic": topic,
        "stem": q["questionText"],
        "choices": q["answers"],
        "correct_option": q["correctIndex"],  # internal use
    }
    record.update(guidance)
    return record


# ----------------------------
# Main
# ----------------------------
def main():
    """Generate the support bank file from the question bank file.

    Loads INPUT_PATH as JSON, builds one support record for each entry under
    its "items" key, and writes the records to OUTPUT_PATH as one JSON
    object per line.  A confirmation line is printed when done.
    """
    with INPUT_PATH.open("r", encoding="utf-8") as source:
        bank = json.load(source)

    # Look up the items before opening the output so a missing key cannot
    # leave behind a truncated output file.
    items = bank["items"]

    with OUTPUT_PATH.open("w", encoding="utf-8") as sink:
        for item in items:
            record = generate_support(item)
            sink.write(json.dumps(record) + "\n")

    print(f"Generated support bank → {OUTPUT_PATH}")


# Run the generator only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()