Spaces:
Runtime error
Runtime error
| #!/usr/bin/env python3 | |
| """ | |
| Build qa_pairs-style JSON from the public PPMB QA dataset on Hugging Face. | |
| Usage (from project root): | |
| python scripts/build_ppmb_qa_pairs.py --lang en --out data/ppmb_qa_pairs_en.json | |
| python scripts/build_ppmb_qa_pairs.py --lang id --out data/ppmb_qa_pairs_id.json | |
| This expects the dataset `suryaadhi/ppmb-qa-dataset` to provide fields like | |
| `question_en`, `answer_en`, `question_id`, `answer_id`, etc. Adjust the field | |
| names below if the upstream dataset schema changes. | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| from pathlib import Path | |
| from datasets import load_dataset | |
| def _guess_fields(ds, lang: str) -> tuple[str, str]: | |
| """ | |
| Try to guess which columns contain question/answer text. | |
| Strategy: | |
| - Prefer exact matches like question_<lang>, answer_<lang>. | |
| - Otherwise, look for any column containing 'question' / 'answer'. | |
| """ | |
| cols = list(ds.column_names) | |
| # 0) Known schema for this dataset (current version uses generic names) | |
| if "query" in cols and "answer" in cols: | |
| print("[build_ppmb_qa_pairs] Using columns: question='query', answer='answer'") | |
| return "query", "answer" | |
| # 1) Exact expected names | |
| q_field = f"question_{lang}" | |
| a_field = f"answer_{lang}" | |
| if q_field in cols and a_field in cols: | |
| return q_field, a_field | |
| # 2) Fallback: any column with "question"/"answer" in its name | |
| q_candidates = [c for c in cols if "question" in c.lower()] | |
| a_candidates = [c for c in cols if "answer" in c.lower()] | |
| if len(q_candidates) == 1 and len(a_candidates) == 1: | |
| print( | |
| f"[build_ppmb_qa_pairs] Using inferred columns: " | |
| f"question='{q_candidates[0]}', answer='{a_candidates[0]}'" | |
| ) | |
| return q_candidates[0], a_candidates[0] | |
| # 3) Give up with a helpful error | |
| raise SystemExit( | |
| "Could not determine question/answer fields in suryaadhi/ppmb-qa-dataset.\n" | |
| f"Available columns: {cols}\n" | |
| "Expected e.g. 'question_en' / 'answer_en'. " | |
| "If the schema is different, please edit scripts/build_ppmb_qa_pairs.py " | |
| "to map the correct column names." | |
| ) | |
def build_pairs(lang: str) -> dict:
    """Download the PPMB QA dataset and convert it to qa_pairs-style JSON.

    Rows whose question or answer text is empty or missing are skipped; the
    skip count and totals are recorded in the returned ``_meta`` section.
    """
    train_split = load_dataset("suryaadhi/ppmb-qa-dataset")["train"]
    q_col, a_col = _guess_fields(train_split, lang)

    qa_pairs: list[dict[str, str]] = []
    empty_rows = 0
    for record in train_split:
        # `or ""` guards against None values before stripping whitespace.
        question = (record.get(q_col) or "").strip()
        answer = (record.get(a_col) or "").strip()
        if question and answer:
            qa_pairs.append({"q": question, "a": answer})
        else:
            empty_rows += 1

    capability = (
        "I am a research prototype trained first on a public PPMB Q&A dataset "
        "about admissions at UPN 'Veteran' Jawa Timur (Indonesian context), then "
        "further adapted to a small curated knowledge base. I am not the official "
        "admissions office—please verify deadlines, quotas, and requirements on "
        "the university's official channels."
    )
    return {
        "capability_statement": capability,
        "pairs": qa_pairs,
        "_meta": {
            "language": lang,
            "source_dataset": "suryaadhi/ppmb-qa-dataset",
            "skipped_rows_without_text": empty_rows,
            "total_pairs": len(qa_pairs),
        },
    }
def main() -> None:
    """CLI entry point: parse arguments, build QA pairs, write the JSON file."""
    ap = argparse.ArgumentParser(
        description="Convert suryaadhi/ppmb-qa-dataset into qa_pairs-style JSON."
    )
    ap.add_argument(
        "--lang",
        default="en",
        help="Language suffix to use, e.g. 'en', 'id', or 'jv' (default: en).",
    )
    ap.add_argument(
        "--out",
        type=Path,
        default=None,
        help=(
            "Output JSON path (default: data/ppmb_qa_pairs_<lang>.json, "
            "derived from --lang)."
        ),
    )
    args = ap.parse_args()

    # BUG FIX: the default was hard-coded to data/ppmb_qa_pairs_en.json, so
    # running `--lang id` without --out silently overwrote the English output.
    # Derive the default from --lang instead; an explicit --out still wins.
    out: Path = (
        args.out
        if args.out is not None
        else Path("data") / f"ppmb_qa_pairs_{args.lang}.json"
    )

    data = build_pairs(args.lang)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
    print(
        f"Wrote {len(data['pairs'])} pairs "
        f"(lang={args.lang}, skipped={data.get('_meta', {}).get('skipped_rows_without_text', 0)}) "
        f"to {out}"
    )


if __name__ == "__main__":
    main()