#!/usr/bin/env python3
"""Convert the v1.8.1 question JSON files into the QuestionDatabase CSV.

Usage:
    python scripts/convert_to_csv.py

Output:
    knowledge/v1.8.1/QuestionDatabase.csv

answer_id conversion:
    JP01_001, JP02_001, ... -> jp_001, jp_002, ...
    (one running number per subject, continuing across that subject's files)
"""

import csv
import json
from pathlib import Path

# Settings. Both paths are relative, so they resolve against the current
# working directory at run time.
INPUT_DIR = Path("knowledge/v1.8.1")
OUTPUT_FILE = INPUT_DIR / "QuestionDatabase.csv"

# Subjects to convert; each name is also its sub-directory under INPUT_DIR.
SUBJECT_DIRS = ["jp", "math", "sci", "soc"]

# CSV header (matches the QuestionDatabase sheet).
CSV_HEADER = [
    "answer_id",
    "subject",
    "category",
    "answer",
    "question_hint",
    "difficulty",
    "source_context",
    "usage_count",
]


def process_subject(subject: str) -> list:
    """Read every JSON file of one subject and build its CSV rows.

    Files are processed in sorted path order. Each question receives an
    ``answer_id`` of the form ``<subject>_NNN`` whose number keeps counting
    across the subject's files (zero-padded to at least three digits).

    Args:
        subject: Sub-directory name under ``INPUT_DIR``. It is also used as
            the ``answer_id`` prefix and as the ``subject`` column value.

    Returns:
        A list of row dicts keyed by the ``CSV_HEADER`` column names. The
        list is empty, after a warning is printed, when the subject
        directory does not exist.
    """
    subject_dir = INPUT_DIR / subject
    if not subject_dir.exists():
        print(f"警告: {subject_dir} が見つかりません")
        return []

    rows = []
    for json_file in sorted(subject_dir.glob("*.json")):
        category = json_file.stem  # file name without extension, e.g. JP01, MA03
        with open(json_file, "r", encoding="utf-8") as f:
            questions = json.load(f)

        for q in questions:
            # Every question appends exactly one row, so the next running
            # number for this subject is always len(rows) + 1.
            rows.append(
                {
                    "answer_id": f"{subject}_{len(rows) + 1:03d}",
                    "subject": subject,
                    "category": category,
                    "answer": q.get("answer", ""),
                    "question_hint": q.get("question_hint", ""),
                    "difficulty": q.get("difficulty", ""),
                    "source_context": q.get("source_context", ""),
                    "usage_count": 0,
                }
            )
        print(f" {json_file.name}: {len(questions)}問")

    return rows


def _id_range_label(subject: str, count: int) -> str:
    """Return the first-to-last answer_id range text for a subject."""
    if count <= 0:
        # No IDs were issued; a "001 - 000" range would be misleading.
        return "(該当なし)"
    return f"{subject}_{1:03d} - {subject}_{count:03d}"


def main() -> None:
    """Convert all subjects, write the CSV file, and print a summary.

    The summary lists the output path, the total number of questions, the
    per-subject counts, and the answer_id range issued for each subject.
    """
    all_rows = []
    stats = {}

    for subject in SUBJECT_DIRS:
        print(f"\n【{subject}】")
        rows = process_subject(subject)
        all_rows.extend(rows)
        stats[subject] = len(rows)
        print(f" 小計: {len(rows)}問")

    # Write the CSV. newline="" lets the csv module control line endings.
    with open(OUTPUT_FILE, "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=CSV_HEADER)
        writer.writeheader()
        writer.writerows(all_rows)

    # Result summary.
    print("\n" + "=" * 50)
    print(f"出力: {OUTPUT_FILE}")
    print(f"総問題数: {len(all_rows)}問")
    print("\n教科別内訳:")
    for subject, count in stats.items():
        print(f" {subject}: {count}問")

    # answer_id ranges; numbering restarts at 1 for every subject.
    print("\nanswer_id範囲:")
    for subject in SUBJECT_DIRS:
        print(f" {subject}: {_id_range_label(subject, stats[subject])}")


if __name__ == "__main__":
    main()