"""Extract generated question/options JSON objects from model outputs.

Reads ``all_results.json`` under ``DATA_ROOT``, pulls the first
``{"question": ..., "options": {...}}`` JSON fragment out of each entry's
``model_output`` text, and writes every successfully parsed fragment to
``OUTPUT_ROOT`` as ``question_XXXX.json`` (``XXXX`` is the entry's index in
the results list).
"""
import os
import json
import re
from glob import glob

# Target root directory (edit as needed).
DATA_ROOT = 'data/concept_difficulty_augment/Qwen__Qwen2.5-7B-Instruct/abstract_algebra/harder'
OUTPUT_ROOT = os.path.join(DATA_ROOT, 'extracted_entries')

# Matches a JSON-formatted question/options object. The quantifiers are
# non-greedy and DOTALL lets the fragment span multiple lines.
QUESTION_JSON_RE = re.compile(r'\{\s*"question"\s*:\s*".*?",\s*"options"\s*:\s*\{.*?\}\s*\}', re.DOTALL)


def extract_question(model_output):
    """Return the first valid question/options object found in *model_output*.

    Args:
        model_output: Raw text produced by the model. Non-string values
            (e.g. ``None`` from a JSON ``null``) are treated as "no match".

    Returns:
        The parsed JSON object, or ``None`` when no fragment matches the
        pattern or the matched fragment is not valid JSON.
    """
    if not isinstance(model_output, str):
        return None
    # Only the first candidate fragment is considered.
    match = QUESTION_JSON_RE.search(model_output)
    if match is None:
        return None
    try:
        return json.loads(match.group())
    except ValueError:
        # json.JSONDecodeError is a ValueError: the fragment looked right
        # to the regex but is not well-formed JSON, so skip it.
        return None


def extract_entries(results_path, output_root=OUTPUT_ROOT):
    """Extract every parsable question from one results file.

    Args:
        results_path: Path to a JSON file holding a list of entries; each
            entry is expected to be an object with a ``model_output`` field.
        output_root: Directory that receives the ``question_XXXX.json``
            files. It is created if missing.

    Returns:
        The number of question files actually written.

    Raises:
        OSError: If the results file cannot be read or an output file
            cannot be written.
        json.JSONDecodeError: If the results file itself is not valid JSON.
    """
    with open(results_path, 'r', encoding='utf-8') as src:
        results = json.load(src)

    os.makedirs(output_root, exist_ok=True)

    written = 0
    for idx, entry in enumerate(results):
        if not isinstance(entry, dict):
            continue  # Malformed entry; skip it rather than abort the run.
        parsed = extract_question(entry.get('model_output', ''))
        if parsed is None:
            continue  # Nothing extractable in this entry.
        # The file name keeps the entry's original index, so gaps in the
        # numbering correspond to skipped entries.
        fname = f'question_{idx:04d}.json'
        with open(os.path.join(output_root, fname), 'w', encoding='utf-8') as dst:
            json.dump(parsed, dst, ensure_ascii=False, indent=2)
        written += 1
    return written


def main():
    """Process all_results.json under DATA_ROOT and report the total written."""
    # NOTE: the pattern contains no ``**``, so only DATA_ROOT itself is
    # searched (not its subdirectories).
    all_results_files = glob(os.path.join(DATA_ROOT, 'all_results.json'))
    os.makedirs(OUTPUT_ROOT, exist_ok=True)

    # Count what was really extracted (not the number of input entries);
    # this is also well-defined when no results file is found.
    extracted = sum(extract_entries(path, OUTPUT_ROOT) for path in all_results_files)
    print(f'Extracted {extracted} entries to {OUTPUT_ROOT}/')


if __name__ == '__main__':
    main()