# Spaces: Running on Zero
# File size: 3,245 Bytes (revision 7294c15)
from __future__ import annotations
import argparse
from pathlib import Path
import sys
# ROOT is the parent of the directory that contains this (resolved) file.
ROOT = Path(__file__).resolve().parents[1]
# Put ROOT/src at the front of the import search path before the pozify import
# that follows (presumably pozify lives under src/ and is not installed — confirm).
sys.path.insert(0, str(ROOT / "src"))
from pozify.knowledge_card_dataset_transformer import ( # noqa: E402
write_card_pack,
write_normalized_exercises,
)
# Hugging Face dataset repo id used as the default value of --hf-dataset.
DEFAULT_HF_DATASET = "DORTROX/Exercises-Data"
# Filename inside the dataset repo used as the default value of --hf-filename.
DEFAULT_HF_FILENAME = "dataset.json"
def _download_hf_dataset_file(repo_id: str, filename: str) -> Path:
    """Download one file from a Hugging Face dataset repo and return its local path.

    Args:
        repo_id: Dataset repo id passed to ``hf_hub_download``.
        filename: Name of the file inside that repo.

    Returns:
        The value returned by ``hf_hub_download``, wrapped in a ``Path``.

    Raises:
        RuntimeError: If ``huggingface_hub`` cannot be imported; the original
            ``ImportError`` is chained as the cause.
    """
    # Imported lazily inside the function (presumably so that runs with a local
    # --input do not need huggingface_hub installed — confirm).
    try:
        from huggingface_hub import hf_hub_download
    except ImportError as import_error:  # pragma: no cover
        message = "huggingface_hub is required to download Hugging Face datasets"
        raise RuntimeError(message) from import_error

    local_file = hf_hub_download(
        repo_id=repo_id,
        repo_type="dataset",
        filename=filename,
    )
    return Path(local_file)
def build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for this script.

    Registers, in order: ``--input``, ``--hf-dataset``, ``--hf-filename``,
    ``--normalized-output``, ``--card-pack-output`` and ``--source-dataset``.
    All options are optional strings; the ones without an explicit default
    fall back to ``None``.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    # (flag, keyword arguments for add_argument), in registration order.
    option_table = (
        (
            "--input",
            {"help": "Local path to a JSON or JSONL exercise dataset export."},
        ),
        (
            "--hf-dataset",
            {
                "default": DEFAULT_HF_DATASET,
                "help": "Hugging Face dataset repo id to download when --input is omitted.",
            },
        ),
        (
            "--hf-filename",
            {
                "default": DEFAULT_HF_FILENAME,
                "help": "Filename inside the Hugging Face dataset repo to download when --input is omitted.",
            },
        ),
        (
            "--normalized-output",
            {
                "default": str(ROOT / "data/knowledge/exercises.json"),
                "help": "Path for the normalized exercise-schema export.",
            },
        ),
        (
            "--card-pack-output",
            {
                "default": str(ROOT / "data/knowledge_cards/external_exercise_dataset_pack.json"),
                "help": "Path for the generated Pozify card-pack JSON.",
            },
        ),
        (
            "--source-dataset",
            {
                "help": "Optional source identifier to record in metadata. Defaults to the local file path or HF dataset id.",
            },
        ),
    )

    cli = argparse.ArgumentParser(
        description="Build a normalized exercise knowledge base and Pozify card pack from a real dataset export."
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli
def main(argv: list[str] | None = None) -> int:
    """Run the CLI: resolve the dataset file, write both outputs, print a summary.

    Args:
        argv: Argument list handed to ``parse_args``; ``None`` lets argparse
            read the process arguments.

    Returns:
        Always ``0`` when the run completes without raising.
    """
    options = build_arg_parser().parse_args(argv)

    if not options.input:
        # No (or empty) --input: fetch the file from the Hugging Face dataset repo.
        dataset_file = _download_hf_dataset_file(options.hf_dataset, options.hf_filename)
        fallback_label = options.hf_dataset
    else:
        dataset_file = Path(options.input).expanduser().resolve()
        fallback_label = str(dataset_file)
    # An explicit --source-dataset wins over the derived label.
    source_label = options.source_dataset or fallback_label

    # Both writers receive the same input file and source label; the normalized
    # export is written first, then the card pack.
    normalized_result = write_normalized_exercises(
        input_path=dataset_file,
        output_path=Path(options.normalized_output),
        source_dataset=source_label,
    )
    pack_result = write_card_pack(
        input_path=dataset_file,
        output_path=Path(options.card_pack_output),
        source_dataset=source_label,
    )

    summary = {
        "source_dataset": source_label,
        "input_path": str(dataset_file),
        "normalized_output": options.normalized_output,
        "card_pack_output": options.card_pack_output,
        "exercise_count": normalized_result["exercise_count"],
        "card_count": pack_result["card_count"],
    }
    print("Built exercise knowledge base", summary)
    return 0
# Script entry point: exit the process with main()'s return value as the status.
if __name__ == "__main__":
    sys.exit(main())
|