ChoTensai_V3 / scripts /convert_to_csv.py
TOMOCHIN4
release: v2.0.0 STABLE リリース完了
c71b90b
#!/usr/bin/env python3
"""
v1.8.1 JSON → QuestionDatabase CSV 変換スクリプト
使用方法:
python scripts/convert_to_csv.py
出力:
knowledge/v1.8.1/QuestionDatabase.csv
answer_id変換:
JP01_001, JP02_001, ... → jp_001, jp_002, ...(教科ごと通し番号)
"""
import json
import csv
from pathlib import Path
# Settings: where the per-subject JSON lives and where the CSV is written.
INPUT_DIR = Path("knowledge/v1.8.1")
OUTPUT_FILE = INPUT_DIR.joinpath("QuestionDatabase.csv")

# Subject keys; each one is also the name of a sub-directory of INPUT_DIR.
SUBJECT_DIRS = [
    "jp",
    "math",
    "sci",
    "soc",
]

# CSV header (matches the columns of the QuestionDatabase sheet).
CSV_HEADER = [
    "answer_id", "subject", "category", "answer",
    "question_hint", "difficulty", "source_context", "usage_count",
]
def process_subject(subject: str, input_dir: "Path | str | None" = None) -> list:
    """Load one subject's JSON files and build CSV rows with sequential IDs.

    Every ``*.json`` file directly inside ``<input_dir>/<subject>`` is read in
    sorted filename order.  Each file is iterated as a sequence of question
    objects that support ``.get`` (i.e. a JSON array of objects).  The
    original per-file IDs are discarded: questions are renumbered with one
    running counter per subject, giving ``<subject>_001``, ``<subject>_002``
    and so on across all of that subject's files.

    Args:
        subject: Subject key; used both as the sub-directory name and as the
            prefix of the generated ``answer_id``.
        input_dir: Base directory that holds the per-subject sub-directories.
            Defaults to the module-level ``INPUT_DIR`` when omitted, which
            keeps existing single-argument callers working unchanged.

    Returns:
        A list of dicts with the keys ``answer_id``, ``subject``,
        ``category``, ``answer``, ``question_hint``, ``difficulty``,
        ``source_context`` and ``usage_count``.  The list is empty (and a
        warning is printed) when the subject directory does not exist.
    """
    base_dir = INPUT_DIR if input_dir is None else Path(input_dir)
    subject_dir = base_dir / subject
    if not subject_dir.exists():
        print(f"警告: {subject_dir} が見つかりません")
        return []

    rows = []
    # Sorted so the numbering is deterministic regardless of filesystem order.
    for json_file in sorted(subject_dir.glob("*.json")):
        category = json_file.stem  # file name without extension, e.g. JP01
        with open(json_file, "r", encoding="utf-8") as f:
            questions = json.load(f)
        for q in questions:
            rows.append({
                # Running number within the subject: rows collected so far + 1.
                "answer_id": f"{subject}_{len(rows) + 1:03d}",
                "subject": subject,
                "category": category,
                "answer": q.get("answer", ""),
                "question_hint": q.get("question_hint", ""),
                "difficulty": q.get("difficulty", ""),
                "source_context": q.get("source_context", ""),
                # Every exported question starts with a zero usage count.
                "usage_count": 0,
            })
        print(f" {json_file.name}: {len(questions)}問")
    return rows
def main():
    """Convert every subject's JSON files into a single CSV and print a summary.

    For each entry of ``SUBJECT_DIRS`` the rows returned by
    ``process_subject`` are collected, then all rows are written to
    ``OUTPUT_FILE`` with ``CSV_HEADER`` as the column order.  Afterwards the
    total count, the per-subject counts and the generated ``answer_id`` range
    of each subject are printed.
    """
    all_rows = []
    stats = {}
    for subject in SUBJECT_DIRS:
        print(f"\n【{subject}】")
        rows = process_subject(subject)
        all_rows.extend(rows)
        stats[subject] = len(rows)
        print(f" 小計: {len(rows)}問")

    # Write the CSV; newline="" lets the csv module control line endings.
    with open(OUTPUT_FILE, "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=CSV_HEADER)
        writer.writeheader()
        writer.writerows(all_rows)

    # Summary
    print("\n" + "=" * 50)
    print(f"出力: {OUTPUT_FILE}")
    print(f"総問題数: {len(all_rows)}問")
    print("\n教科別内訳:")
    for subject, count in stats.items():
        print(f" {subject}: {count}問")

    # answer_id range per subject.  Numbering restarts at 1 for every
    # subject, so the last ID number equals that subject's question count.
    print("\nanswer_id範囲:")
    start = 1
    for subject in SUBJECT_DIRS:
        end = stats[subject]
        if end < start:
            # No questions for this subject: avoid printing a reversed
            # range such as "jp_001 - jp_000".
            print(f" {subject}: なし")
            continue
        print(f" {subject}: {subject}_{start:03d} - {subject}_{end:03d}")
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()