Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| v1.8.1 JSON → QuestionDatabase CSV 変換スクリプト | |
| 使用方法: | |
| python scripts/convert_to_csv.py | |
| 出力: | |
| knowledge/v1.8.1/QuestionDatabase.csv | |
| answer_id変換: | |
| JP01_001, JP02_001, ... → jp_001, jp_002, ...(教科ごと通し番号) | |
| """ | |
| import json | |
| import csv | |
| from pathlib import Path | |
| # Settings: input directory of the v1.8.1 knowledge files; the CSV is written inside it | |
| INPUT_DIR = Path("knowledge/v1.8.1") | |
| OUTPUT_FILE = INPUT_DIR / "QuestionDatabase.csv" | |
| # Subjects to convert; each name is both a sub-directory of INPUT_DIR and the answer_id prefix | |
| SUBJECT_DIRS = ["jp", "math", "sci", "soc"] | |
| # CSV header (matches the QuestionDatabase sheet) | |
| CSV_HEADER = [ | |
| "answer_id", | |
| "subject", | |
| "category", | |
| "answer", | |
| "question_hint", | |
| "difficulty", | |
| "source_context", | |
| "usage_count" | |
| ] | |
def process_subject(subject: str) -> list:
    """Build the CSV rows for one subject from its JSON question files.

    Every ``*.json`` file directly under ``INPUT_DIR / subject`` is read in
    sorted filename order. Each question found becomes one row dict whose
    ``answer_id`` is ``<subject>_<NNN>``, a zero-padded serial number that
    keeps counting across all files of the subject. The file stem is used
    as the row's category and ``usage_count`` always starts at 0.

    Args:
        subject: Name of the subject; also the sub-directory name and the
            answer_id prefix.

    Returns:
        The list of row dicts, or an empty list (after printing a warning)
        when the subject directory does not exist.
    """
    subject_dir = INPUT_DIR / subject
    if not subject_dir.exists():
        print(f"警告: {subject_dir} が見つかりません")
        return []

    # Fields copied verbatim from each question, "" when absent.
    copied_fields = ("answer", "question_hint", "difficulty", "source_context")

    records = []
    for json_file in sorted(subject_dir.glob("*.json")):
        with open(json_file, "r", encoding="utf-8") as handle:
            questions = json.load(handle)

        for entry in questions:
            # Rows are only ever appended here, so the number of rows
            # collected so far plus one is the subject-wide serial number.
            serial = len(records) + 1
            records.append({
                "answer_id": f"{subject}_{serial:03d}",
                "subject": subject,
                "category": json_file.stem,  # e.g. JP01, MA03
                **{key: entry.get(key, "") for key in copied_fields},
                "usage_count": 0,
            })

        print(f" {json_file.name}: {len(questions)}問")

    return records
def main():
    """Convert all subjects' JSON question files into one CSV file.

    Calls ``process_subject`` for every entry of ``SUBJECT_DIRS``, writes the
    collected rows to ``OUTPUT_FILE`` (UTF-8, columns from ``CSV_HEADER``),
    then prints the total, the per-subject counts and the answer_id range
    assigned to each subject. Subjects that produced no rows are reported as
    empty instead of with an inverted range.
    """
    all_rows = []
    stats = {}
    for subject in SUBJECT_DIRS:
        print(f"\n【{subject}】")
        rows = process_subject(subject)
        all_rows.extend(rows)
        stats[subject] = len(rows)
        print(f" 小計: {len(rows)}問")

    # Write the CSV; newline="" leaves line-ending handling to the csv module.
    with open(OUTPUT_FILE, "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=CSV_HEADER)
        writer.writeheader()
        writer.writerows(all_rows)

    # Summary
    print("\n" + "=" * 50)
    print(f"出力: {OUTPUT_FILE}")
    print(f"総問題数: {len(all_rows)}問")
    print("\n教科別内訳:")
    for subject, count in stats.items():
        print(f" {subject}: {count}問")

    # answer_id ranges: numbering restarts at 001 for every subject, so the
    # last id of a subject equals its row count.
    print("\nanswer_id範囲:")
    for subject in SUBJECT_DIRS:
        count = stats[subject]
        if count == 0:
            # No ids were assigned; a "001 - 000" range would be misleading.
            print(f" {subject}: (0問)")
            continue
        print(f" {subject}: {subject}_{1:03d} - {subject}_{count:03d}")


if __name__ == "__main__":
    main()