#!/usr/bin/env python3
"""
Build qa_pairs-style JSON from the public PPMB QA dataset on Hugging Face.
Usage (from project root):
python scripts/build_ppmb_qa_pairs.py --lang en --out data/ppmb_qa_pairs_en.json
python scripts/build_ppmb_qa_pairs.py --lang id --out data/ppmb_qa_pairs_id.json
The dataset `suryaadhi/ppmb-qa-dataset` currently exposes generic `query` /
`answer` columns; language-suffixed fields such as `question_en` / `answer_en`
or `question_id` / `answer_id` are also handled as a fallback. Adjust
`_guess_fields` below if the upstream schema changes.
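
The output JSON has the following shape (sketch only; the values shown are
placeholders, real text comes from the dataset):

    {
      "capability_statement": "...",
      "pairs": [{"q": "...", "a": "..."}, ...],
      "_meta": {"language": "en", "source_dataset": "suryaadhi/ppmb-qa-dataset",
                "skipped_rows_without_text": 0, "total_pairs": 123}
    }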
"""
from __future__ import annotations
import argparse
import json
from pathlib import Path
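# Third-party dependency: Hugging Face `datasets` (pip install datasets).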
from datasets import load_dataset
def _guess_fields(ds, lang: str) -> tuple[str, str]:
"""
Try to guess which columns contain question/answer text.
Strategy:
- Prefer exact matches like question_<lang>, answer_<lang>.
- Otherwise, look for any column containing 'question' / 'answer'.
"""
cols = list(ds.column_names)
# 0) Known schema for this dataset (current version uses generic names)
if "query" in cols and "answer" in cols:
print("[build_ppmb_qa_pairs] Using columns: question='query', answer='answer'")
return "query", "answer"
# 1) Exact expected names
q_field = f"question_{lang}"
a_field = f"answer_{lang}"
if q_field in cols and a_field in cols:
return q_field, a_field
# 2) Fallback: any column with "question"/"answer" in its name
q_candidates = [c for c in cols if "question" in c.lower()]
a_candidates = [c for c in cols if "answer" in c.lower()]
if len(q_candidates) == 1 and len(a_candidates) == 1:
print(
f"[build_ppmb_qa_pairs] Using inferred columns: "
f"question='{q_candidates[0]}', answer='{a_candidates[0]}'"
)
return q_candidates[0], a_candidates[0]
# 3) Give up with a helpful error
raise SystemExit(
"Could not determine question/answer fields in suryaadhi/ppmb-qa-dataset.\n"
f"Available columns: {cols}\n"
"Expected e.g. 'question_en' / 'answer_en'. "
"If the schema is different, please edit scripts/build_ppmb_qa_pairs.py "
"to map the correct column names."
)
def build_pairs(lang: str) -> dict:
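    # Download (or reuse the local cache of) the public dataset from the Hugging Face Hub.
    # A 'train' split is assumed to exist.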
ds = load_dataset("suryaadhi/ppmb-qa-dataset")["train"]
q_field, a_field = _guess_fields(ds, lang)
pairs: list[dict[str, str]] = []
skipped = 0
for row in ds:
q = (row.get(q_field) or "").strip()
a = (row.get(a_field) or "").strip()
if not q or not a:
skipped += 1
continue
pairs.append({"q": q, "a": a})
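    # Scope/self-description statement shipped alongside the pairs so anything consuming
    # this JSON inherits the disclaimer together with the Q&A data.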
capability = (
"I am a research prototype trained first on a public PPMB Q&A dataset "
"about admissions at UPN 'Veteran' Jawa Timur (Indonesian context), then "
"further adapted to a small curated knowledge base. I am not the official "
"admissions office—please verify deadlines, quotas, and requirements on "
"the university's official channels."
)
return {
"capability_statement": capability,
"pairs": pairs,
"_meta": {
"language": lang,
"source_dataset": "suryaadhi/ppmb-qa-dataset",
"skipped_rows_without_text": skipped,
"total_pairs": len(pairs),
},
}
def main() -> None:
ap = argparse.ArgumentParser(
description="Convert suryaadhi/ppmb-qa-dataset into qa_pairs-style JSON."
)
ap.add_argument(
"--lang",
default="en",
help="Language suffix to use, e.g. 'en', 'id', or 'jv' (default: en).",
)
ap.add_argument(
"--out",
type=Path,
default=Path("data") / "ppmb_qa_pairs_en.json",
help="Output JSON path (default: data/ppmb_qa_pairs_en.json).",
)
args = ap.parse_args()
data = build_pairs(args.lang)
args.out.parent.mkdir(parents=True, exist_ok=True)
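    # ensure_ascii=False keeps non-ASCII text (e.g. Indonesian) readable in the output file.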
args.out.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
print(
f"Wrote {len(data['pairs'])} pairs "
f"(lang={args.lang}, skipped={data.get('_meta', {}).get('skipped_rows_without_text', 0)}) "
f"to {args.out}"
)
if __name__ == "__main__":
main()