Spaces:

leave-everything
/

ChoTensai_V3

Sleeping

File size: 3,137 Bytes

c71b90b

#!/usr/bin/env python3
"""
Convert the v1.8.1 JSON question files into the QuestionDatabase CSV.

Usage:
    python scripts/convert_to_csv.py

Output:
    knowledge/v1.8.1/QuestionDatabase.csv

answer_id conversion:
    JP01_001, JP02_001, ... → jp_001, jp_002, ... (numbered consecutively per subject)
"""

import json
import csv
from pathlib import Path

# Settings: the JSON inputs are read from, and the CSV is written into, this directory.
INPUT_DIR = Path("knowledge/v1.8.1")
OUTPUT_FILE = INPUT_DIR.joinpath("QuestionDatabase.csv")

# Subject codes; each one names a sub-directory of INPUT_DIR.
SUBJECT_DIRS = [
    "jp",
    "math",
    "sci",
    "soc",
]

# CSV header (matches the QuestionDatabase sheet).
CSV_HEADER = [
    "answer_id", "subject", "category", "answer",
    "question_hint", "difficulty", "source_context", "usage_count",
]


def process_subject(subject: str) -> list:
    """Turn every JSON file of one subject into CSV row dicts.

    The ``*.json`` files directly inside ``INPUT_DIR / subject`` are read in
    sorted filename order. Each question gets an ``answer_id`` of the form
    ``<subject>_NNN``, numbered from 1 and continuing across files, so the
    numbering is consecutive per subject.

    Args:
        subject: Subject code; used as the directory name and the id prefix.

    Returns:
        A list of row dicts with the answer_id, subject, category, answer,
        question_hint, difficulty, source_context and usage_count keys.
        Empty when the subject directory does not exist (a warning is
        printed in that case).
    """
    base_dir = INPUT_DIR / subject
    if not base_dir.exists():
        print(f"警告: {base_dir} が見つかりません")
        return []

    collected = []

    for path in sorted(base_dir.glob("*.json")):
        # The file stem doubles as the category (JP01, MA03, ...).
        category = path.stem

        with open(path, "r", encoding="utf-8") as handle:
            entries = json.load(handle)

        # Continue the per-subject serial number where the previous file stopped.
        first_serial = len(collected) + 1
        collected.extend(
            {
                "answer_id": f"{subject}_{serial:03d}",
                "subject": subject,
                "category": category,
                "answer": entry.get("answer", ""),
                "question_hint": entry.get("question_hint", ""),
                "difficulty": entry.get("difficulty", ""),
                "source_context": entry.get("source_context", ""),
                "usage_count": 0,
            }
            for serial, entry in enumerate(entries, start=first_serial)
        )

        print(f"  {path.name}: {len(entries)}問")

    return collected


def main():
    """Build the QuestionDatabase CSV from all subjects and print a summary.

    Runs ``process_subject`` for each entry of ``SUBJECT_DIRS``, writes the
    combined rows to ``OUTPUT_FILE`` as UTF-8 CSV with the ``CSV_HEADER``
    columns, then prints the total count, the per-subject counts and the
    answer_id range of every subject.
    """
    all_rows = []
    stats = {}

    for subject in SUBJECT_DIRS:
        print(f"\n【{subject}】")
        rows = process_subject(subject)
        all_rows.extend(rows)
        stats[subject] = len(rows)
        print(f"  小計: {len(rows)}問")

    # Write the CSV; newline="" leaves line-ending handling to the csv module.
    with open(OUTPUT_FILE, "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=CSV_HEADER)
        writer.writeheader()
        writer.writerows(all_rows)

    # Summary.
    print("\n" + "=" * 50)
    print(f"出力: {OUTPUT_FILE}")
    print(f"総問題数: {len(all_rows)}問")
    print("\n教科別内訳:")
    for subject, count in stats.items():
        print(f"  {subject}: {count}問")

    # answer_id ranges; numbering restarts at 1 for every subject.
    print("\nanswer_id範囲:")
    for subject in SUBJECT_DIRS:
        count = stats[subject]
        if count == 0:
            # No rows means no ids were issued; an inverted range such as
            # "jp_001 - jp_000" would be misleading.
            print(f"  {subject}: なし")
            continue
        print(f"  {subject}: {subject}_{1:03d} - {subject}_{count:03d}")


# Run the conversion only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()