"""Build a per-question "support bank" of hints and walkthroughs.

The script loads a JSON document from ``INPUT_PATH`` (its ``items`` entry is
the list of questions), assigns every question a coarse topic using keyword
heuristics, attaches the canned guidance for that topic, and writes one JSON
object per line to ``OUTPUT_PATH``.
"""

import json
import re
from pathlib import Path
from typing import Callable, Dict, List, Optional, Tuple

INPUT_PATH = Path("data/gmat_questions.json")
OUTPUT_PATH = Path("data/question_support_bank.jsonl")

# ----------------------------
# Utilities
# ----------------------------

# A decimal point only belongs to a number when digits follow it; otherwise it
# is sentence punctuation ("costs 5. Then ..." must yield "5", not "5.").
_NUMBER_RE = re.compile(r"\d+(?:\.\d+)?")

# "ratio"/"ratios" as a whole word (so "operation" or "duration" do not count),
# or an explicit "a:b" form between numbers / single-letter quantities.
# NOTE: clock times such as "3:30" still match the second alternative.
_RATIO_RE = re.compile(r"\bratios?\b|\b(?:\d+|[a-z])\s*:\s*(?:\d+|[a-z])\b")

# Whole word only, so the verb "means" and "meaning" are not read as statistics.
_MEAN_RE = re.compile(r"\bmean\b")

# An arithmetic operator counts only after a stand-alone variable ("x +",
# "2x -"), which keeps hyphenated or slashed words ("well-known", "and/or")
# from being read as algebra. Any letter directly before "=" still counts.
_ALGEBRA_RE = re.compile(r"(?<![a-z])[a-z]\s*[+\-*/]|[a-z]\s*=")

_STATISTICS_KEYWORDS = ("average", "median", "data", "variance")
_GEOMETRY_KEYWORDS = ("area", "circle", "triangle", "perimeter")

# Helpers for pulling a clean "lhs = rhs" pair out of a question stem.
_CLAUSE_PUNCTUATION = ",.;:?!"
_MATH_TOKEN_RE = re.compile(r"[0-9A-Za-z.+\-*/^()]+")
_ADJACENT_LETTERS_RE = re.compile(r"[A-Za-z]{2}")


def extract_numbers(text: str) -> List[str]:
    """Return every unsigned integer or decimal literal in *text*, as strings.

    Matches are returned in order of appearance. A trailing period is not
    treated as part of the number.
    """
    return _NUMBER_RE.findall(text)


def detect_topic(q: dict) -> str:
    """Classify a question into a coarse topic from its ``questionText``.

    The checks run in a fixed priority order and the first hit wins:
    percent, ratio, probability, statistics, geometry, algebra; anything
    else is ``"general"``. Matching is case-insensitive.
    """
    text = q["questionText"].lower()

    if "%" in text or "percent" in text:
        return "percent"

    if _RATIO_RE.search(text):
        return "ratio"

    if "probability" in text or "chance" in text:
        return "probability"

    if _MEAN_RE.search(text) or any(kw in text for kw in _STATISTICS_KEYWORDS):
        return "statistics"

    if any(kw in text for kw in _GEOMETRY_KEYWORDS):
        return "geometry"

    if _ALGEBRA_RE.search(text):
        return "algebra"

    return "general"


def _is_math_token(token: str) -> bool:
    """Return True if *token* looks like part of an equation, not a word.

    A token qualifies when it is built only from digits, letters and
    arithmetic symbols and never has two letters side by side (so "2x" and
    "x+1" pass while "If" and "what" do not).
    """
    return (
        _MATH_TOKEN_RE.fullmatch(token) is not None
        and _ADJACENT_LETTERS_RE.search(token) is None
    )


def _math_side(tokens: List[str], from_end: bool) -> str:
    """Collect the run of math-like tokens that touches the ``=`` sign.

    *tokens* is the whitespace-split text on one side of the sign. With
    ``from_end=True`` the run is read backwards from the end (left-hand
    side); otherwise forwards from the start (right-hand side). Returns an
    empty string when no usable expression is found.
    """
    picked = []
    for raw in (reversed(tokens) if from_end else tokens):
        core = raw.rstrip(_CLAUSE_PUNCTUATION)
        closes_clause = core != raw
        if from_end and closes_clause:
            # Punctuation here ends an earlier clause; the equation starts
            # after it, so this token is not part of the left-hand side.
            break
        if not core or not _is_math_token(core):
            break
        picked.append(core)
        if closes_clause:
            # "7," or "11?" -- the expression ends with this token.
            break
    if from_end:
        picked.reverse()
    side = " ".join(picked)
    # A side made only of operators/brackets is not a real expression.
    return side if any(ch.isalnum() for ch in side) else ""


def _extract_equation(text: str) -> Optional[Tuple[str, str]]:
    """Return ``(lhs, rhs)`` for the first usable equation in *text*.

    Each ``=`` is tried in turn; the first one with a non-empty math
    expression on both sides wins. Returns ``None`` if there is none.
    """
    for sign in re.finditer("=", text):
        lhs = _math_side(text[:sign.start()].split(), from_end=True)
        rhs = _math_side(text[sign.end():].split(), from_end=False)
        if lhs and rhs:
            return lhs, rhs
    return None


# ----------------------------
# Smart Templates
# ----------------------------
# Every template takes the question dict and the extracted numbers and
# returns a dict with the same seven keys: first_step, hint_1..hint_3,
# walkthrough_steps, method_explanation, common_trap.

def percent_template(q: dict, nums: list) -> dict:
    """Return the fixed guidance for percent questions (arguments unused)."""
    return {
        "first_step": "Treat the original value as 100 unless a specific number is easier.",
        "hint_1": "Focus on how the percentage is applied — is it increase, decrease, or part of a whole?",
        "hint_2": "Convert the percentage into a multiplier (e.g. +20% → ×1.2, -20% → ×0.8).",
        "hint_3": "Apply each percentage step in order — don’t combine them directly.",
        "walkthrough_steps": [
            "Start with an easy base value (like 100).",
            "Apply the first percentage change.",
            "Apply the second change to the new value.",
            "Compare the result with the original.",
        ],
        "method_explanation": [
            "Percent changes are multiplicative, not additive.",
            "Each change affects the updated value.",
            "Using 100 simplifies calculations.",
        ],
        "common_trap": "Adding/subtracting percentages directly instead of applying sequential changes.",
    }


def algebra_template(q: dict, nums: list) -> dict:
    """Return guidance for algebra questions.

    When an equation can be isolated from ``q["questionText"]`` it is quoted
    in the first step and first walkthrough step, without the surrounding
    prose. Otherwise a generic fallback is returned. *nums* is unused.
    """
    equation = _extract_equation(q["questionText"])

    if equation is not None:
        lhs, rhs = equation
        return {
            "first_step": f"Start with the equation: {lhs} = {rhs}",
            "hint_1": "Focus on isolating the variable.",
            "hint_2": "Undo addition/subtraction first.",
            "hint_3": "Then undo multiplication/division.",
            "walkthrough_steps": [
                f"Start with: {lhs} = {rhs}",
                "Move constants to one side.",
                "Undo multiplication/division.",
                "Solve for the variable.",
            ],
            "method_explanation": [
                "Solve by isolating the variable.",
                "Reverse operations step by step.",
                "Keep both sides balanced.",
            ],
            "common_trap": "Forgetting to reverse operations in the correct order.",
        }

    # Fallback when no clean equation could be parsed out of the stem.
    return {
        "first_step": "Identify the variable and isolate it step by step.",
        "hint_1": "Look at what operations are applied to the variable.",
        "hint_2": "Undo operations in reverse order.",
        "hint_3": "Keep both sides balanced while simplifying.",
        "walkthrough_steps": [
            "Identify the variable.",
            "Move constants to one side.",
            "Undo multiplication/division.",
            "Simplify to isolate the variable.",
        ],
        "method_explanation": [
            "Algebra problems require isolating the variable.",
            "Reverse operations systematically.",
        ],
        "common_trap": "Forgetting to apply operations to both sides.",
    }


def ratio_template(q: dict, nums: list) -> dict:
    """Return the fixed guidance for ratio questions (arguments unused)."""
    return {
        "first_step": "Break the ratio into total parts.",
        "hint_1": "Add the ratio parts together.",
        "hint_2": "Find the value of one part.",
        "hint_3": "Scale up to get the required quantity.",
        "walkthrough_steps": [
            "Write ratio as parts.",
            "Sum the parts.",
            "Divide total by parts.",
            "Multiply by needed portion.",
        ],
        "method_explanation": [
            "Ratios represent proportional relationships.",
            "Breaking into equal units simplifies reasoning.",
        ],
        "common_trap": "Using ratio numbers directly instead of total parts.",
    }


def probability_template(q: dict, nums: list) -> dict:
    """Return the fixed guidance for probability questions (arguments unused)."""
    return {
        "first_step": "Count total outcomes and favorable outcomes.",
        "hint_1": "How many total possibilities are there?",
        "hint_2": "How many meet the condition?",
        "hint_3": "Probability = favorable / total.",
        "walkthrough_steps": [
            "Count total outcomes.",
            "Count favorable outcomes.",
            "Divide favorable by total.",
        ],
        "method_explanation": [
            "Probability is a ratio.",
            "Clear counting is essential.",
        ],
        "common_trap": "Incorrect counting of outcomes.",
    }


def statistics_template(q: dict, nums: list) -> dict:
    """Return the fixed guidance for statistics questions (arguments unused)."""
    return {
        "first_step": "Identify what measure is being asked (mean, median, etc.).",
        "hint_1": "Write out the numbers clearly.",
        "hint_2": "Apply the correct formula.",
        "hint_3": "Check your calculation.",
        "walkthrough_steps": [
            "List values.",
            "Apply formula (mean, median, etc.).",
            "Compute carefully.",
        ],
        "method_explanation": [
            "Different measures describe data differently.",
            "Mean = sum / count.",
        ],
        "common_trap": "Using the wrong measure.",
    }


def geometry_template(q: dict, nums: list) -> dict:
    """Return the fixed guidance for geometry questions (arguments unused)."""
    return {
        "first_step": "Identify the shape and formula needed.",
        "hint_1": "Recall the relevant formula.",
        "hint_2": "Substitute values carefully.",
        "hint_3": "Solve step by step.",
        "walkthrough_steps": [
            "Identify formula.",
            "Substitute values.",
            "Compute result.",
        ],
        "method_explanation": [
            "Geometry relies on standard formulas.",
            "Careful substitution avoids mistakes.",
        ],
        "common_trap": "Using the wrong formula.",
    }


def general_template(q: dict, nums: list) -> dict:
    """Return the catch-all guidance used when no topic matched (arguments unused)."""
    return {
        "first_step": "Break the question into known and unknown parts.",
        "hint_1": "What is being asked?",
        "hint_2": "What information is given?",
        "hint_3": "How can you link them mathematically?",
        "walkthrough_steps": [
            "Understand the problem.",
            "Identify variables.",
            "Set up relationships.",
            "Solve step by step.",
        ],
        "method_explanation": [
            "Translate words into math.",
            "Solve systematically.",
        ],
        "common_trap": "Misinterpreting the question.",
    }


# ----------------------------
# Router
# ----------------------------

# Topic name -> template builder; topics not listed use general_template.
_TEMPLATE_BUILDERS: Dict[str, Callable[[dict, list], dict]] = {
    "percent": percent_template,
    "algebra": algebra_template,
    "ratio": ratio_template,
    "probability": probability_template,
    "statistics": statistics_template,
    "geometry": geometry_template,
}


def generate_support(q: dict) -> dict:
    """Build the support record for one question.

    Reads ``id``, ``questionText``, ``answers`` and ``correctIndex`` from
    *q* (a ``KeyError`` propagates if one is missing) and returns a dict
    holding those values under ``question_id``, ``stem``, ``choices`` and
    ``correct_option``, the detected ``topic``, and the template fields.
    """
    nums = extract_numbers(q["questionText"])
    topic = detect_topic(q)
    build = _TEMPLATE_BUILDERS.get(topic, general_template)

    return {
        "question_id": q["id"],
        "topic": topic,
        "stem": q["questionText"],
        "choices": q["answers"],
        "correct_option": q["correctIndex"],  # internal use
        **build(q, nums),
    }


# ----------------------------
# Main
# ----------------------------

def main() -> None:
    """Read the questions from INPUT_PATH and write the support bank as JSONL."""
    with INPUT_PATH.open("r", encoding="utf-8") as f:
        data = json.load(f)

    questions = data["items"]

    # Make sure the destination directory exists before opening the file.
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

    with OUTPUT_PATH.open("w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(generate_support(q)) + "\n" for q in questions
        )

    print(f"Generated support bank → {OUTPUT_PATH}")


if __name__ == "__main__":
    main()