#!/usr/bin/env python3
"""
Build qa_pairs-style JSON from the public PPMB QA dataset on Hugging Face.
Usage (from project root):
python scripts/build_ppmb_qa_pairs.py --lang en --out data/ppmb_qa_pairs_en.json
python scripts/build_ppmb_qa_pairs.py --lang id --out data/ppmb_qa_pairs_id.json
The dataset `suryaadhi/ppmb-qa-dataset` currently exposes generic `query` /
`answer` columns; language-suffixed fields such as `question_en` / `answer_en`
or `question_id` / `answer_id` are also handled as a fallback. Adjust
`_guess_fields` below if the upstream schema changes.
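
The output JSON has the following shape (sketch only; the values shown are
placeholders, real text comes from the dataset):

    {
      "capability_statement": "...",
      "pairs": [{"q": "...", "a": "..."}, ...],
      "_meta": {"language": "en", "source_dataset": "suryaadhi/ppmb-qa-dataset",
                "skipped_rows_without_text": 0, "total_pairs": 123}
    }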
"""
from __future__ import annotations
import argparse
import json
from pathlib import Path
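# Third-party dependency: Hugging Face `datasets` (pip install datasets).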
from datasets import load_dataset
def _guess_fields(ds, lang: str) -> tuple[str, str]:
"""
Try to guess which columns contain question/answer text.
Strategy:
- Prefer exact matches like question_<lang>, answer_<lang>.
- Otherwise, look for any column containing 'question' / 'answer'.
"""
cols = list(ds.column_names)
# 0) Known schema for this dataset (current version uses generic names)
if "query" in cols and "answer" in cols:
print("[build_ppmb_qa_pairs] Using columns: question='query', answer='answer'")
return "query", "answer"
# 1) Exact expected names
q_field = f"question_{lang}"
a_field = f"answer_{lang}"
if q_field in cols and a_field in cols:
return q_field, a_field
# 2) Fallback: any column with "question"/"answer" in its name
q_candidates = [c for c in cols if "question" in c.lower()]
a_candidates = [c for c in cols if "answer" in c.lower()]
if len(q_candidates) == 1 and len(a_candidates) == 1:
print(
f"[build_ppmb_qa_pairs] Using inferred columns: "
f"question='{q_candidates[0]}', answer='{a_candidates[0]}'"
)
return q_candidates[0], a_candidates[0]
# 3) Give up with a helpful error
raise SystemExit(
"Could not determine question/answer fields in suryaadhi/ppmb-qa-dataset.\n"
f"Available columns: {cols}\n"
"Expected e.g. 'question_en' / 'answer_en'. "
"If the schema is different, please edit scripts/build_ppmb_qa_pairs.py "
"to map the correct column names."
)
def build_pairs(lang: str) -> dict:
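    # Download (or reuse the local cache of) the public dataset from the Hugging Face Hub.
    # A 'train' split is assumed to exist.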
ds = load_dataset("suryaadhi/ppmb-qa-dataset")["train"]
q_field, a_field = _guess_fields(ds, lang)
pairs: list[dict[str, str]] = []
skipped = 0
for row in ds:
q = (row.get(q_field) or "").strip()
a = (row.get(a_field) or "").strip()
if not q or not a:
skipped += 1
continue
pairs.append({"q": q, "a": a})
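    # Scope/self-description statement shipped alongside the pairs so anything consuming
    # this JSON inherits the disclaimer together with the Q&A data.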
capability = (
"I am a research prototype trained first on a public PPMB Q&A dataset "
"about admissions at UPN 'Veteran' Jawa Timur (Indonesian context), then "
"further adapted to a small curated knowledge base. I am not the official "
"admissions office—please verify deadlines, quotas, and requirements on "
"the university's official channels."
)
return {
"capability_statement": capability,
"pairs": pairs,
"_meta": {
"language": lang,
"source_dataset": "suryaadhi/ppmb-qa-dataset",
"skipped_rows_without_text": skipped,
"total_pairs": len(pairs),
},
}
def main() -> None:
ap = argparse.ArgumentParser(
description="Convert suryaadhi/ppmb-qa-dataset into qa_pairs-style JSON."
)
ap.add_argument(
"--lang",
default="en",
help="Language suffix to use, e.g. 'en', 'id', or 'jv' (default: en).",
)
ap.add_argument(
"--out",
type=Path,
default=Path("data") / "ppmb_qa_pairs_en.json",
help="Output JSON path (default: data/ppmb_qa_pairs_en.json).",
)
args = ap.parse_args()
data = build_pairs(args.lang)
args.out.parent.mkdir(parents=True, exist_ok=True)
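    # ensure_ascii=False keeps non-ASCII text (e.g. Indonesian) readable in the output file.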
args.out.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
print(
f"Wrote {len(data['pairs'])} pairs "
f"(lang={args.lang}, skipped={data.get('_meta', {}).get('skipped_rows_without_text', 0)}) "
f"to {args.out}"
)
if __name__ == "__main__":
main()