bbkdevops's picture
download
raw
5.79 kB
"""
Phase 1A: Collect Thai + English QA datasets from open sources.
Sources: iapp_wiki_qa_squad, thaiqa_squad, SQuAD v2, OpenHermes-2.5, TriviaQA
"""
import json
from pathlib import Path
from datasets import load_dataset
from tqdm import tqdm
# Output directory for every JSONL file written by this script: a "raw"
# folder that sits next to this source file. It is created at import time
# (no error if it already exists).
RAW_DIR = Path(__file__).with_name("raw")
RAW_DIR.mkdir(exist_ok=True)
def save_jsonl(data: list[dict], path: Path):
    """Write *data* to *path* as JSON Lines and print how many items were saved.

    Every item becomes one line of UTF-8 JSON; non-ASCII characters are
    written verbatim rather than as ``\\uXXXX`` escapes. An existing file at
    *path* is overwritten.
    """
    with open(path, "w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in data
        )
    print(f" saved {len(data):,} items → {path}")
# ─── Thai Datasets ────────────────────────────────────────────────────────────
def collect_iapp_wiki_qa() -> list[dict]:
    """iapp-ai/iapp_wiki_qa_squad — Thai QA from Wikipedia.

    Loads the ``train`` split, converts it to the common record shape
    (``lang``, ``source``, ``context``, ``question``, ``answer``), writes the
    records to ``thai_iapp_wiki_qa.jsonl`` under ``RAW_DIR`` and returns them.

    Two row layouts are accepted:

    * nested — the row carries a ``qas`` list, each entry holding its own
      ``question`` / ``answers`` / optional ``is_impossible``;
    * flat SQuAD style — ``question`` and ``answers`` sit directly on the row.

    Only the first answer of each question is kept. Entries flagged
    ``is_impossible`` or without any answer are skipped.

    NOTE(review): which of the two layouts the hub dataset actually uses is
    not visible from this file — confirm against the dataset card. Rows in
    the flat layout were previously dropped silently.
    """
    print("[Thai] iapp_wiki_qa_squad ...")
    # NOTE(review): trust_remote_code=True lets the dataset repository run its
    # own loading script on this machine; keep it only if the repo is trusted
    # and the installed `datasets` version still needs it.
    ds = load_dataset("iapp-ai/iapp_wiki_qa_squad", split="train", trust_remote_code=True)
    out: list[dict] = []
    for row in tqdm(ds):
        row = dict(row)
        qas = row.get("qas") or []
        if not qas and "question" in row:
            # Flat layout: the row itself is the single question/answer entry.
            qas = [row]
        if not isinstance(qas, list):
            continue
        for qa in qas:
            qa = dict(qa)
            if qa.get("is_impossible"):
                continue
            answer = _iapp_first_answer(qa.get("answers"))
            if answer is None:
                continue
            out.append({
                "lang": "th",
                "source": "iapp_wiki_qa",
                "context": str(row.get("context", "")),
                "question": str(qa.get("question", "")),
                "answer": answer,
            })
    save_jsonl(out, RAW_DIR / "thai_iapp_wiki_qa.jsonl")
    return out


def _iapp_first_answer(answers):
    """Return the first answer text in *answers*, or ``None`` if there is none.

    *answers* may be a list of ``{"text": ...}`` mappings or a single mapping
    of parallel lists (``{"text": [...], ...}``).
    """
    if not answers:
        return None
    if isinstance(answers, dict):
        texts = answers.get("text") or []
        if isinstance(texts, str):
            # A lone string instead of a list: it is the answer itself.
            return texts
        return str(texts[0]) if texts else None
    # List-of-mappings form: take the text of the first entry.
    return str(dict(answers[0]).get("text", ""))
def collect_thaiqa() -> list[dict]:
    """pythainlp/thaiqa_squad — Thai QA (SQuAD style).

    Loads the ``train`` split, keeps the first answer of every row, writes the
    records to ``thai_thaiqa_squad.jsonl`` under ``RAW_DIR`` and returns them.
    Rows without any answer text are skipped.

    The answer strings are looked up under ``answers["text"]`` first and under
    ``answers["answer"]`` as a fallback.

    NOTE(review): the fallback exists because this dataset appears to publish
    its answers under ``answer`` rather than ``text`` — confirm against the
    dataset card. With only the ``text`` lookup every such row is skipped.
    """
    print("[Thai] thaiqa_squad ...")
    # NOTE(review): trust_remote_code=True lets the dataset repository run its
    # own loading script on this machine; keep it only if the repo is trusted.
    ds = load_dataset("pythainlp/thaiqa_squad", split="train", trust_remote_code=True)
    out: list[dict] = []
    for row in tqdm(ds):
        row = dict(row)
        answers = dict(row.get("answers") or {})
        texts = answers.get("text") or answers.get("answer") or []
        if isinstance(texts, str):
            # A bare string is one answer, not a sequence of characters.
            texts = [texts]
        if not texts:
            continue
        out.append({
            "lang": "th",
            "source": "thaiqa_squad",
            "context": str(row.get("context", "")),
            "question": str(row.get("question", "")),
            "answer": str(texts[0]),
        })
    save_jsonl(out, RAW_DIR / "thai_thaiqa_squad.jsonl")
    return out
# ─── English Datasets ─────────────────────────────────────────────────────────
def collect_squad() -> list[dict]:
    """SQuAD v2 — English QA.

    Reads the ``train`` split of ``rajpurkar/squad_v2`` and keeps, for every
    row that has at least one answer text, the first one. The resulting
    records are saved to ``en_squad_v2.jsonl`` under ``RAW_DIR`` and returned.
    """
    print("[EN] SQuAD v2 ...")
    dataset = load_dataset("rajpurkar/squad_v2", split="train")
    out: list[dict] = []
    for example in tqdm(dataset):
        record = dict(example)
        answer_texts = dict(record.get("answers") or {}).get("text") or []
        if answer_texts:
            # Rows with an empty answer list never reach this point.
            out.append(dict(
                lang="en",
                source="squad_v2",
                context=str(record.get("context", "")),
                question=str(record.get("question", "")),
                answer=str(answer_texts[0]),
            ))
    save_jsonl(out, RAW_DIR / "en_squad_v2.jsonl")
    return out
def collect_openhermes() -> list[dict]:
    """teknium/OpenHermes-2.5 — high-quality instruction/chat pairs.

    Streams the ``train`` split and turns every adjacent
    ``human`` → ``gpt`` turn into one record (``context`` is left empty).
    Collection stops as soon as exactly 50,000 pairs have been gathered; the
    records are saved to ``en_openhermes.jsonl`` under ``RAW_DIR`` and
    returned.

    The cap is enforced per pair, not per dataset row: a multi-turn
    conversation can contribute several pairs, so checking only between rows
    would let the result overshoot the limit. The progress bar likewise
    advances once per collected pair so that it matches its total.
    """
    limit = 50_000
    print("[EN] OpenHermes-2.5 (first 50k) ...")
    ds = load_dataset("teknium/OpenHermes-2.5", split="train", streaming=True)
    out: list[dict] = []
    with tqdm(total=limit) as progress:
        for row in ds:
            convs = dict(row).get("conversations") or []
            # Walk adjacent turns (turn i, turn i + 1).
            for first, second in zip(convs, convs[1:]):
                a, b = dict(first), dict(second)
                if a.get("from") != "human" or b.get("from") != "gpt":
                    continue
                out.append({
                    "lang": "en",
                    "source": "openhermes",
                    "context": "",
                    "question": str(a.get("value", "")),
                    "answer": str(b.get("value", "")),
                })
                progress.update(1)
                if len(out) >= limit:
                    break
            if len(out) >= limit:
                # Stop pulling further rows from the stream.
                break
    save_jsonl(out, RAW_DIR / "en_openhermes.jsonl")
    return out
def collect_triviaqa() -> list[dict]:
    """mandarjoshi/trivia_qa — factual QA.

    Loads the ``train`` split of the ``rc`` configuration and emits one
    record per question (``context`` is left empty). The answer is the
    ``answer["value"]`` field when it is non-empty; otherwise the first
    non-empty entry of ``answer["aliases"]`` is used. Rows with neither are
    skipped. Records are saved to ``en_triviaqa.jsonl`` under ``RAW_DIR`` and
    returned.

    ``value`` is preferred because the aliases are alternate acceptable
    spellings, and the first alias is not necessarily the canonical form.

    NOTE(review): no evidence/context field is read here — if the dataset
    offers a context-free configuration with the same questions, it would
    likely be a lighter download; confirm on the dataset card.
    """
    print("[EN] TriviaQA ...")
    ds = load_dataset("mandarjoshi/trivia_qa", "rc", split="train")
    out: list[dict] = []
    for row in tqdm(ds):
        row = dict(row)
        answer = dict(row.get("answer") or {})
        candidates = [answer.get("value"), *(answer.get("aliases") or [])]
        # First non-empty candidate wins; "" means the row has no usable answer.
        text = next((str(c) for c in candidates if c), "")
        if not text:
            continue
        out.append({
            "lang": "en",
            "source": "triviaqa",
            "context": "",
            "question": str(row.get("question", "")),
            "answer": text,
        })
    save_jsonl(out, RAW_DIR / "en_triviaqa.jsonl")
    return out
# ─── Main ─────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Script entry point: run every collector in order (Thai sources first,
    # then English), merge the results into one file and print a summary.
    banner = "=" * 60
    print(banner)
    print("TinyMind — Data Collection Phase 1A")
    print(banner)

    all_data: list[dict] = []
    for collect in (
        collect_iapp_wiki_qa,
        collect_thaiqa,
        collect_squad,
        collect_openhermes,
        collect_triviaqa,
    ):
        all_data.extend(collect())
    save_jsonl(all_data, RAW_DIR / "all_raw.jsonl")

    # Tally records per language tag for the closing summary line.
    langs = [d["lang"] for d in all_data]
    th = langs.count("th")
    en = langs.count("en")
    print(f"\nTotal: {len(all_data):,} pairs (Thai: {th:,} | EN: {en:,})")

Xet Storage Details

Size:
5.79 kB
·
Xet hash:
be1a6b07e93e64ba46bc2c3de75f757097b79ee967200f7304e48216c76ce407

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.