Buckets:
| """ | |
| Phase 1A: Collect Thai + English QA datasets from open sources. | |
| Sources: iapp-wiki-qa, thaiqa_squad, SQuAD, OpenHermes, TriviaQA | |
| """ | |
| import json | |
| from pathlib import Path | |
| from datasets import load_dataset | |
| from tqdm import tqdm | |
# Directory next to this script where every collected JSONL file is written.
RAW_DIR = Path(__file__).with_name("raw")
# Create it up front so the collectors can write without checking.
RAW_DIR.mkdir(exist_ok=True)
def save_jsonl(data: list[dict], path: Path):
    """Write *data* to *path* as JSON Lines and report the item count.

    Every item becomes one line of JSON. ``ensure_ascii=False`` keeps
    non-ASCII text as UTF-8 characters instead of ``\\uXXXX`` escapes.
    The file is opened in ``"w"`` mode, so existing content is replaced.

    Args:
        data: Records to serialize, one per output line.
        path: Destination file.
    """
    with open(path, "w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in data
        )
    print(f" saved {len(data):,} items → {path}")
| # ─── Thai Datasets ──────────────────────────────────────────────────────────── | |
def _iapp_first_answer(answers):
    """Return the first answer text from *answers*, or ``None`` if there is none.

    Accepts both a list of ``{"text": ...}`` mappings (nested layout) and a
    single ``{"text": [...]}`` mapping (flat SQuAD-style layout).
    """
    if not answers:
        return None
    if isinstance(answers, dict):
        texts = answers.get("text") or []
        if isinstance(texts, str):
            # A bare string must not be indexed character by character.
            return texts
        return str(texts[0]) if texts else None
    return str(dict(answers[0]).get("text", ""))


def collect_iapp_wiki_qa() -> list[dict]:
    """Collect Thai Wikipedia QA pairs from iapp-ai/iapp_wiki_qa_squad.

    Loads the ``train`` split and turns every question that has an answer
    into a record with the keys ``lang``, ``source``, ``context``,
    ``question`` and ``answer`` (only the first answer text is kept). The
    records are written to ``RAW_DIR / "thai_iapp_wiki_qa.jsonl"``.

    Two row layouts are accepted:

    * nested: the row holds ``context`` and a ``qas`` list whose entries
      carry ``question``, ``answers`` and optionally ``is_impossible``;
    * flat: the row itself holds ``context``, ``question`` and ``answers``.

    Previously only the nested layout was handled, so flat rows were
    silently dropped and the collector could return an empty list.
    NOTE(review): the published dataset card appears to use the flat
    layout — confirm against the actual repository.

    Returns:
        The collected records, in dataset order.
    """
    print("[Thai] iapp_wiki_qa_squad ...")
    # NOTE(review): trust_remote_code=True allows the Hub repository's loading
    # script to run locally — confirm the repo is trusted and still needs it.
    ds = load_dataset("iapp-ai/iapp_wiki_qa_squad", split="train", trust_remote_code=True)
    out: list[dict] = []
    for row in tqdm(ds):
        row = dict(row)
        qas = row.get("qas")
        if qas:
            if not isinstance(qas, list):
                continue
            entries = [dict(qa) for qa in qas]
        elif "question" in row:
            # Flat layout: the row itself is the single question/answer entry.
            entries = [row]
        else:
            continue
        context = str(row.get("context", ""))
        for qa in entries:
            if qa.get("is_impossible"):
                continue
            answer = _iapp_first_answer(qa.get("answers"))
            if answer is None:
                continue
            out.append({
                "lang": "th",
                "source": "iapp_wiki_qa",
                "context": context,
                "question": str(qa.get("question", "")),
                "answer": answer,
            })
    save_jsonl(out, RAW_DIR / "thai_iapp_wiki_qa.jsonl")
    return out
def collect_thaiqa() -> list[dict]:
    """Collect Thai QA pairs from pythainlp/thaiqa_squad.

    Loads the ``train`` split and emits one record per row that has at least
    one answer, with the keys ``lang``, ``source``, ``context``, ``question``
    and ``answer`` (only the first answer is kept). The records are written
    to ``RAW_DIR / "thai_thaiqa_squad.jsonl"``.

    The answer list is read from ``answers["text"]`` and, if that is missing
    or empty, from ``answers["answer"]``. Previously only ``"text"`` was
    read, so a dataset that uses ``"answer"`` produced no records at all.
    NOTE(review): the published thaiqa_squad schema appears to name the
    field ``answer`` — confirm against the dataset card.

    Returns:
        The collected records, in dataset order.
    """
    print("[Thai] thaiqa_squad ...")
    # NOTE(review): trust_remote_code=True allows the Hub repository's loading
    # script to run locally — confirm the repo is trusted and still needs it.
    ds = load_dataset("pythainlp/thaiqa_squad", split="train", trust_remote_code=True)
    out: list[dict] = []
    for row in tqdm(ds):
        row = dict(row)
        answers = dict(row.get("answers") or {})
        texts = answers.get("text") or answers.get("answer") or []
        if isinstance(texts, str):
            # A bare string must not be indexed character by character.
            texts = [texts]
        if not texts:
            continue
        out.append({
            "lang": "th",
            "source": "thaiqa_squad",
            "context": str(row.get("context", "")),
            "question": str(row.get("question", "")),
            "answer": str(texts[0]),
        })
    save_jsonl(out, RAW_DIR / "thai_thaiqa_squad.jsonl")
    return out
| # ─── English Datasets ───────────────────────────────────────────────────────── | |
def collect_squad() -> list[dict]:
    """Collect English QA pairs from the SQuAD v2 ``train`` split.

    Rows whose ``answers`` carry no text are skipped (presumably SQuAD v2's
    unanswerable questions). For the rest, only the first answer text is
    kept. The records are written to ``RAW_DIR / "en_squad_v2.jsonl"``.

    Returns:
        The collected records, in dataset order.
    """
    print("[EN] SQuAD v2 ...")
    dataset = load_dataset("rajpurkar/squad_v2", split="train")
    pairs: list[dict] = []
    for example in tqdm(dataset):
        record = dict(example)
        answer_texts = dict(record.get("answers") or {}).get("text") or []
        if answer_texts:
            pairs.append({
                "lang": "en",
                "source": "squad_v2",
                "context": str(record.get("context", "")),
                "question": str(record.get("question", "")),
                "answer": str(answer_texts[0]),
            })
    save_jsonl(pairs, RAW_DIR / "en_squad_v2.jsonl")
    return pairs
def collect_openhermes() -> list[dict]:
    """Collect up to 50,000 instruction/response pairs from OpenHermes-2.5.

    Streams the ``train`` split and, within each row's ``conversations``,
    pairs every ``human`` turn with the ``gpt`` turn that directly follows
    it. Turns from other roles are not emitted. ``context`` is left empty.
    The records are written to ``RAW_DIR / "en_openhermes.jsonl"``.

    The cap is enforced per pair. Previously it was checked only once per
    row, so a multi-turn conversation could push the result past 50,000.

    Returns:
        At most 50,000 records, in stream order.
    """
    max_pairs = 50_000
    print("[EN] OpenHermes-2.5 (first 50k) ...")
    ds = load_dataset("teknium/OpenHermes-2.5", split="train", streaming=True)
    out: list[dict] = []
    # The bar ticks once per row while the cap counts pairs, so `total` is
    # only an estimate of progress.
    for row in tqdm(ds, total=max_pairs):
        convs = [dict(turn) for turn in (dict(row).get("conversations") or [])]
        for prompt, reply in zip(convs, convs[1:]):
            if prompt.get("from") != "human" or reply.get("from") != "gpt":
                continue
            out.append({
                "lang": "en",
                "source": "openhermes",
                "context": "",
                "question": str(prompt.get("value", "")),
                "answer": str(reply.get("value", "")),
            })
            if len(out) >= max_pairs:
                break
        if len(out) >= max_pairs:
            # Stop before pulling another row from the stream.
            break
    save_jsonl(out, RAW_DIR / "en_openhermes.jsonl")
    return out
def collect_triviaqa() -> list[dict]:
    """Collect factual English QA pairs from mandarjoshi/trivia_qa (``rc``).

    Loads the ``train`` split and emits one record per row that has a usable
    answer. ``context`` is left empty. The records are written to
    ``RAW_DIR / "en_triviaqa.jsonl"``.

    The answer is ``answer["value"]`` when present, otherwise the first
    non-empty entry of ``answer["aliases"]``. Previously the first alias was
    always preferred, which can be an alternative spelling rather than the
    canonical answer.

    Returns:
        The collected records, in dataset order.
    """
    print("[EN] TriviaQA ...")
    # NOTE(review): no evidence text is used here (context is always ""), so
    # a lighter config without documents may be enough — confirm which
    # configs the dataset offers before changing this.
    ds = load_dataset("mandarjoshi/trivia_qa", "rc", split="train")
    out: list[dict] = []
    for row in tqdm(ds):
        row = dict(row)
        answer = dict(row.get("answer") or {})
        text = answer.get("value") or next(
            (alias for alias in (answer.get("aliases") or []) if alias), ""
        )
        if not text:
            continue
        out.append({
            "lang": "en",
            "source": "triviaqa",
            "context": "",
            "question": str(row.get("question", "")),
            "answer": str(text),
        })
    save_jsonl(out, RAW_DIR / "en_triviaqa.jsonl")
    return out
| # ─── Main ───────────────────────────────────────────────────────────────────── | |
if __name__ == "__main__":
    # Script entry point: run every collector in order, write the combined
    # JSONL file, then print a per-language summary.
    banner = "=" * 60
    print(banner)
    print("TinyMind — Data Collection Phase 1A")
    print(banner)

    collectors = (
        collect_iapp_wiki_qa,
        collect_thaiqa,
        collect_squad,
        collect_openhermes,
        collect_triviaqa,
    )
    all_data: list[dict] = []
    for collect in collectors:
        all_data.extend(collect())

    save_jsonl(all_data, RAW_DIR / "all_raw.jsonl")

    th = len([d for d in all_data if d["lang"] == "th"])
    en = len([d for d in all_data if d["lang"] == "en"])
    print(f"\nTotal: {len(all_data):,} pairs (Thai: {th:,} | EN: {en:,})")
Xet Storage Details
- Size:
- 5.79 kB
- Xet hash:
- be1a6b07e93e64ba46bc2c3de75f757097b79ee967200f7304e48216c76ce407
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.