Pozify / scripts /build_exercise_knowledge_base.py
tthhanh's picture
feat: ingest real exercise dataset into knowledge artifacts
7294c15
Raw
History Blame Contribute Delete
3.25 kB
from __future__ import annotations
import argparse
from pathlib import Path
import sys
ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(ROOT / "src"))
from pozify.knowledge_card_dataset_transformer import ( # noqa: E402
write_card_pack,
write_normalized_exercises,
)
DEFAULT_HF_DATASET = "DORTROX/Exercises-Data"
DEFAULT_HF_FILENAME = "dataset.json"
def _download_hf_dataset_file(repo_id: str, filename: str) -> Path:
try:
from huggingface_hub import hf_hub_download
except ImportError as exc: # pragma: no cover
raise RuntimeError("huggingface_hub is required to download Hugging Face datasets") from exc
return Path(hf_hub_download(repo_id=repo_id, repo_type="dataset", filename=filename))
def build_arg_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
description="Build a normalized exercise knowledge base and Pozify card pack from a real dataset export."
)
parser.add_argument(
"--input",
help="Local path to a JSON or JSONL exercise dataset export.",
)
parser.add_argument(
"--hf-dataset",
default=DEFAULT_HF_DATASET,
help="Hugging Face dataset repo id to download when --input is omitted.",
)
parser.add_argument(
"--hf-filename",
default=DEFAULT_HF_FILENAME,
help="Filename inside the Hugging Face dataset repo to download when --input is omitted.",
)
parser.add_argument(
"--normalized-output",
default=str(ROOT / "data/knowledge/exercises.json"),
help="Path for the normalized exercise-schema export.",
)
parser.add_argument(
"--card-pack-output",
default=str(ROOT / "data/knowledge_cards/external_exercise_dataset_pack.json"),
help="Path for the generated Pozify card-pack JSON.",
)
parser.add_argument(
"--source-dataset",
help="Optional source identifier to record in metadata. Defaults to the local file path or HF dataset id.",
)
return parser
def main(argv: list[str] | None = None) -> int:
parser = build_arg_parser()
args = parser.parse_args(argv)
if args.input:
input_path = Path(args.input).expanduser().resolve()
source_dataset = args.source_dataset or str(input_path)
else:
input_path = _download_hf_dataset_file(args.hf_dataset, args.hf_filename)
source_dataset = args.source_dataset or args.hf_dataset
normalized = write_normalized_exercises(
input_path=input_path,
output_path=Path(args.normalized_output),
source_dataset=source_dataset,
)
pack = write_card_pack(
input_path=input_path,
output_path=Path(args.card_pack_output),
source_dataset=source_dataset,
)
print(
"Built exercise knowledge base",
{
"source_dataset": source_dataset,
"input_path": str(input_path),
"normalized_output": args.normalized_output,
"card_pack_output": args.card_pack_output,
"exercise_count": normalized["exercise_count"],
"card_count": pack["card_count"],
},
)
return 0
if __name__ == "__main__":
raise SystemExit(main())