"""Extract generated question objects from model outputs.

Reads ``all_results.json`` under ``DATA_ROOT``, looks in each entry's
``model_output`` text for an embedded JSON object of the form
``{"question": "...", "options": {...}}`` and writes every object that
parses to its own ``question_NNNN.json`` file under ``OUTPUT_ROOT``.
"""
import os
import json
import re
from glob import glob


# Directory that holds the results file to scan.
DATA_ROOT = 'data/concept_difficulty_augment/Qwen__Qwen2.5-7B-Instruct/abstract_algebra/harder'
# Directory that receives one JSON file per extracted question.
OUTPUT_ROOT = os.path.join(DATA_ROOT, 'extracted_entries')

# Matches the first {"question": "...", "options": {...}} object in free text.
# DOTALL lets the question text and the options span several lines.
QUESTION_JSON_RE = re.compile(r'\{\s*"question"\s*:\s*".*?",\s*"options"\s*:\s*\{.*?\}\s*\}', re.DOTALL)


def _parse_question(model_output):
    """Return the question object embedded in *model_output*, or ``None``.

    ``None`` is returned when *model_output* is not a string, when the
    pattern does not match, or when the matched text is not valid JSON.
    """
    if not isinstance(model_output, str):
        # Missing or null outputs would make the regex search raise.
        return None
    match = QUESTION_JSON_RE.search(model_output)
    if match is None:
        return None
    try:
        return json.loads(match.group())
    except ValueError:
        # json.JSONDecodeError subclasses ValueError; the regex is looser
        # than real JSON, so a match is not guaranteed to parse.
        return None


def _extract_file(results_path, output_root):
    """Write every parseable question found in *results_path*.

    Files are named after the entry's position in the results list, so
    skipped entries leave gaps in the numbering.

    Returns:
        A ``(written, total)`` tuple: the number of files written and the
        number of entries examined.
    """
    with open(results_path, 'r', encoding='utf-8') as f:
        results = json.load(f)

    written = 0
    for idx, entry in enumerate(results):
        model_output = entry.get('model_output', '') if isinstance(entry, dict) else None
        parsed = _parse_question(model_output)
        if parsed is None:
            continue
        fname = f'question_{idx:04d}.json'
        with open(os.path.join(output_root, fname), 'w', encoding='utf-8') as out:
            json.dump(parsed, out, ensure_ascii=False, indent=2)
        written += 1
    return written, len(results)


def main(data_root=DATA_ROOT, output_root=None):
    """Extract questions from ``all_results.json`` under *data_root*.

    Args:
        data_root: Directory containing ``all_results.json``.
        output_root: Destination directory; defaults to the
            ``extracted_entries`` sub-directory of *data_root*.

    Returns:
        The total number of question files written.
    """
    if output_root is None:
        output_root = os.path.join(data_root, 'extracted_entries')

    all_results_files = glob(os.path.join(data_root, 'all_results.json'))
    os.makedirs(output_root, exist_ok=True)

    total_written = 0
    for results_path in all_results_files:
        written, total = _extract_file(results_path, output_root)
        total_written += written
        # Report what was actually written, not how many entries were read.
        print(f'Extracted {written} of {total} entries to {output_root}/')
    return total_written


if __name__ == '__main__':
    main()