| import json |
| import re |
| from pathlib import Path |
|
|
# Both files live in the same relative "data" directory.
_DATA_DIR = Path("data")

# JSON file read by main(); OUTPUT_PATH is the JSON Lines file it writes.
INPUT_PATH = _DATA_DIR / "gmat_questions.json"
OUTPUT_PATH = _DATA_DIR / "question_support_bank.jsonl"
|
|
|
|
| |
| |
| |
def extract_numbers(text):
    """Return every unsigned integer or decimal literal found in *text*.

    Args:
        text: The string to scan.

    Returns:
        A list of the matched substrings (as strings, in order of
        appearance). A decimal point is only included when digits follow it,
        so a number that ends a sentence ("... is 5.") yields "5", not "5.".
    """
    # The fractional part is optional but, when present, must contain digits;
    # this keeps sentence-ending periods out of the match.
    return re.findall(r"\d+(?:\.\d+)?", text)
|
|
|
|
# Ordered (topic, pattern) rules applied to the lower-cased question text.
# The first pattern that matches decides the topic, so the order of this
# tuple is the classification priority.
_TOPIC_RULES = (
    ("percent", re.compile(r"%|percent")),
    # A colon only signals a ratio when it sits between digits ("3:2"); a bare
    # ":" also appears in ordinary prose such as "Note: ...". The keyword is
    # matched as a whole word so "duration" or "operation" do not qualify.
    ("ratio", re.compile(r"\bratios?\b|\d\s*:\s*\d")),
    ("probability", re.compile(r"\bprobabilit(?:y|ies)\b|\bchances?\b")),
    # Whole-word matching keeps "meaning" or "comedian" out of statistics.
    (
        "statistics",
        re.compile(r"\b(?:means?|average[sd]?|medians?|data|variances?)\b"),
    ),
    (
        "geometry",
        re.compile(r"\b(?:areas?|(?:semi)?circles?|triangles?|perimeters?)\b"),
    ),
    # A letter followed by an operator. A "-" or "/" that is directly followed
    # by a word of two or more letters is treated as punctuation
    # ("two-digit", "and/or", "x-axis"), not as an operator.
    ("algebra", re.compile(r"[a-z]\s*(?:[+*=]|[-/](?![a-z]{2}))")),
)


def detect_topic(q):
    """Classify a question into a coarse topic from its text.

    Args:
        q: Mapping describing the question; only the "questionText" entry is
            read. A missing or ``None`` text is treated as an empty string.

    Returns:
        One of "percent", "ratio", "probability", "statistics", "geometry",
        "algebra", or "general" when no rule matches. Rules are tried in that
        order and the first match wins.
    """
    text = (q.get("questionText") or "").lower()

    for topic, pattern in _TOPIC_RULES:
        if pattern.search(text):
            return topic

    return "general"
|
|
|
|
| |
| |
| |
def percent_template(q, nums):
    """Build the fixed support content used for percent questions.

    Args:
        q: The question record (accepted for a uniform template signature;
            not read here).
        nums: Numbers extracted from the question (not read here).

    Returns:
        A new dict with the keys first_step, hint_1..hint_3,
        walkthrough_steps, method_explanation and common_trap, in that order.
    """
    walkthrough = [
        "Start with an easy base value (like 100).",
        "Apply the first percentage change.",
        "Apply the second change to the new value.",
        "Compare the result with the original.",
    ]
    rationale = [
        "Percent changes are multiplicative, not additive.",
        "Each change affects the updated value.",
        "Using 100 simplifies calculations.",
    ]
    return dict(
        first_step="Treat the original value as 100 unless a specific number is easier.",
        hint_1="Focus on how the percentage is applied — is it increase, decrease, or part of a whole?",
        hint_2="Convert the percentage into a multiplier (e.g. +20% → ×1.2, -20% → ×0.8).",
        hint_3="Apply each percentage step in order — don’t combine them directly.",
        walkthrough_steps=walkthrough,
        method_explanation=rationale,
        common_trap="Adding/subtracting percentages directly instead of applying sequential changes.",
    )
|
|
|
|
# One operand of a simple equation: a number with an optional single-letter
# variable ("7", "2.5", "2x") or a lone variable ("x").
_ALGEBRA_OPERAND = r"(?:\d+(?:\.\d+)?[A-Za-z]?|[A-Za-z])"
# Operands joined by binary operators, with an optional leading sign.
_ALGEBRA_EXPR = rf"[-+]?{_ALGEBRA_OPERAND}(?:\s*[-+*/^]\s*{_ALGEBRA_OPERAND})*"
# "<expr> = <expr>" that is not a fragment of something larger:
#   * the left side must start at a whitespace boundary and must not directly
#     follow an operator or "(" (otherwise we would grab the tail of a longer
#     expression);
#   * the right side must not be cut off in front of more letters, digits,
#     a decimal part, an operator or "(".
# Anything that does not fit (parentheses, multi-letter names, ...) is left
# unmatched on purpose so the caller falls back to the generic advice rather
# than quoting half an equation.
_ALGEBRA_EQUATION_RE = re.compile(
    rf"(?<!\S)(?<![-+*/^=(]\s)({_ALGEBRA_EXPR})\s*=\s*({_ALGEBRA_EXPR})"
    r"(?![A-Za-z0-9(]|\.\d)(?!\s*[-+*/^(])"
)


def algebra_template(q, nums):
    """Build support content for algebra questions.

    When the question text contains a simple equation (for example
    "2x + 3 = 7"), that equation alone -- without the surrounding prose -- is
    quoted in the first step and the walkthrough. Otherwise a generic set of
    algebra hints is returned.

    Args:
        q: Mapping describing the question; only "questionText" is read. A
            missing or ``None`` text is treated as an empty string.
        nums: Numbers extracted from the question (not read here).

    Returns:
        A new dict with the keys first_step, hint_1..hint_3,
        walkthrough_steps, method_explanation and common_trap, in that order.
    """
    text = q.get("questionText") or ""

    match = _ALGEBRA_EQUATION_RE.search(text)

    if match:
        lhs = match.group(1).strip()
        rhs = match.group(2).strip()

        return {
            "first_step": f"Start with the equation: {lhs} = {rhs}",
            "hint_1": "Focus on isolating the variable.",
            "hint_2": "Undo addition/subtraction first.",
            "hint_3": "Then undo multiplication/division.",
            "walkthrough_steps": [
                f"Start with: {lhs} = {rhs}",
                "Move constants to one side.",
                "Undo multiplication/division.",
                "Solve for the variable.",
            ],
            "method_explanation": [
                "Solve by isolating the variable.",
                "Reverse operations step by step.",
                "Keep both sides balanced.",
            ],
            "common_trap": "Forgetting to reverse operations in the correct order.",
        }

    # No clean equation could be extracted: fall back to generic guidance.
    return {
        "first_step": "Identify the variable and isolate it step by step.",
        "hint_1": "Look at what operations are applied to the variable.",
        "hint_2": "Undo operations in reverse order.",
        "hint_3": "Keep both sides balanced while simplifying.",
        "walkthrough_steps": [
            "Identify the variable.",
            "Move constants to one side.",
            "Undo multiplication/division.",
            "Simplify to isolate the variable.",
        ],
        "method_explanation": [
            "Algebra problems require isolating the variable.",
            "Reverse operations systematically.",
        ],
        "common_trap": "Forgetting to apply operations to both sides.",
    }
|
|
def ratio_template(q, nums):
    """Build the fixed support content used for ratio questions.

    Args:
        q: The question record (accepted for a uniform template signature;
            not read here).
        nums: Numbers extracted from the question (not read here).

    Returns:
        A new dict with the keys first_step, hint_1..hint_3,
        walkthrough_steps, method_explanation and common_trap, in that order.
    """
    walkthrough = [
        "Write ratio as parts.",
        "Sum the parts.",
        "Divide total by parts.",
        "Multiply by needed portion.",
    ]
    rationale = [
        "Ratios represent proportional relationships.",
        "Breaking into equal units simplifies reasoning.",
    ]
    return dict(
        first_step="Break the ratio into total parts.",
        hint_1="Add the ratio parts together.",
        hint_2="Find the value of one part.",
        hint_3="Scale up to get the required quantity.",
        walkthrough_steps=walkthrough,
        method_explanation=rationale,
        common_trap="Using ratio numbers directly instead of total parts.",
    )
|
|
|
|
def probability_template(q, nums):
    """Build the fixed support content used for probability questions.

    Args:
        q: The question record (accepted for a uniform template signature;
            not read here).
        nums: Numbers extracted from the question (not read here).

    Returns:
        A new dict with the keys first_step, hint_1..hint_3,
        walkthrough_steps, method_explanation and common_trap, in that order.
    """
    walkthrough = [
        "Count total outcomes.",
        "Count favorable outcomes.",
        "Divide favorable by total.",
    ]
    rationale = [
        "Probability is a ratio.",
        "Clear counting is essential.",
    ]
    return dict(
        first_step="Count total outcomes and favorable outcomes.",
        hint_1="How many total possibilities are there?",
        hint_2="How many meet the condition?",
        hint_3="Probability = favorable / total.",
        walkthrough_steps=walkthrough,
        method_explanation=rationale,
        common_trap="Incorrect counting of outcomes.",
    )
|
|
|
|
def statistics_template(q, nums):
    """Build the fixed support content used for statistics questions.

    Args:
        q: The question record (accepted for a uniform template signature;
            not read here).
        nums: Numbers extracted from the question (not read here).

    Returns:
        A new dict with the keys first_step, hint_1..hint_3,
        walkthrough_steps, method_explanation and common_trap, in that order.
    """
    walkthrough = [
        "List values.",
        "Apply formula (mean, median, etc.).",
        "Compute carefully.",
    ]
    rationale = [
        "Different measures describe data differently.",
        "Mean = sum / count.",
    ]
    return dict(
        first_step="Identify what measure is being asked (mean, median, etc.).",
        hint_1="Write out the numbers clearly.",
        hint_2="Apply the correct formula.",
        hint_3="Check your calculation.",
        walkthrough_steps=walkthrough,
        method_explanation=rationale,
        common_trap="Using the wrong measure.",
    )
|
|
|
|
def geometry_template(q, nums):
    """Build the fixed support content used for geometry questions.

    Args:
        q: The question record (accepted for a uniform template signature;
            not read here).
        nums: Numbers extracted from the question (not read here).

    Returns:
        A new dict with the keys first_step, hint_1..hint_3,
        walkthrough_steps, method_explanation and common_trap, in that order.
    """
    walkthrough = [
        "Identify formula.",
        "Substitute values.",
        "Compute result.",
    ]
    rationale = [
        "Geometry relies on standard formulas.",
        "Careful substitution avoids mistakes.",
    ]
    return dict(
        first_step="Identify the shape and formula needed.",
        hint_1="Recall the relevant formula.",
        hint_2="Substitute values carefully.",
        hint_3="Solve step by step.",
        walkthrough_steps=walkthrough,
        method_explanation=rationale,
        common_trap="Using the wrong formula.",
    )
|
|
|
|
def general_template(q, nums):
    """Build the fixed fallback support content for unclassified questions.

    Args:
        q: The question record (accepted for a uniform template signature;
            not read here).
        nums: Numbers extracted from the question (not read here).

    Returns:
        A new dict with the keys first_step, hint_1..hint_3,
        walkthrough_steps, method_explanation and common_trap, in that order.
    """
    walkthrough = [
        "Understand the problem.",
        "Identify variables.",
        "Set up relationships.",
        "Solve step by step.",
    ]
    rationale = [
        "Translate words into math.",
        "Solve systematically.",
    ]
    return dict(
        first_step="Break the question into known and unknown parts.",
        hint_1="What is being asked?",
        hint_2="What information is given?",
        hint_3="How can you link them mathematically?",
        walkthrough_steps=walkthrough,
        method_explanation=rationale,
        common_trap="Misinterpreting the question.",
    )
|
|
|
|
| |
| |
| |
def generate_support(q):
    """Assemble the support record for a single question.

    The question's topic is detected, the matching template is built, and the
    template's entries are merged after the question's identifying fields.

    Args:
        q: Mapping with the entries "id", "questionText", "answers" and
            "correctIndex".

    Returns:
        A dict holding question_id, topic, stem, choices and correct_option,
        followed by every key of the selected template.

    Raises:
        KeyError: If one of the entries read from *q* is missing.
    """
    nums = extract_numbers(q["questionText"])
    topic = detect_topic(q)

    # Topic name -> template builder; unknown topics use the general one.
    builders = {
        "percent": percent_template,
        "algebra": algebra_template,
        "ratio": ratio_template,
        "probability": probability_template,
        "statistics": statistics_template,
        "geometry": geometry_template,
    }
    build = builders.get(topic, general_template)
    template = build(q, nums)

    record = {
        "question_id": q["id"],
        "topic": topic,
        "stem": q["questionText"],
        "choices": q["answers"],
        "correct_option": q["correctIndex"],
    }
    # Template entries come last, exactly as a trailing ** unpacking would.
    record.update(template)
    return record
|
|
|
|
| |
| |
| |
def main():
    """Generate the support bank file from the question file.

    Reads the JSON document at INPUT_PATH, takes its "items" list, builds one
    support record per question, and writes each record as a single JSON line
    to OUTPUT_PATH (UTF-8). A confirmation line is printed when done.
    """
    with INPUT_PATH.open("r", encoding="utf-8") as src:
        questions = json.load(src)["items"]

    with OUTPUT_PATH.open("w", encoding="utf-8") as sink:
        # One JSON object per line (JSON Lines).
        sink.writelines(
            json.dumps(generate_support(q)) + "\n" for q in questions
        )

    print(f"Generated support bank → {OUTPUT_PATH}")


# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()