tinymind-native-8b-remote-handoff/bundle/data/collect.py · bbkdevops/unicosys-hypergraph-bucket

bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /data /collect.py

5.79 kB

	"""
	Phase 1A: Collect Thai + English QA datasets from open sources.
	Sources: iapp-wiki-qa, thaiqa_squad, SQuAD, OpenHermes, TriviaQA
	"""

	import json
	from pathlib import Path
	from datasets import load_dataset
	from tqdm import tqdm

	# Raw dataset dumps are written to a "raw" folder that sits beside this
	# script; the folder is created at import time if it does not exist yet.
	RAW_DIR = Path(__file__).parent.joinpath("raw")
	RAW_DIR.mkdir(exist_ok=True)


	def save_jsonl(data: list[dict], path: Path):
	    """Write ``data`` to ``path`` as JSON Lines and print a short summary.

	    Each item is serialised as one JSON document followed by a newline.
	    The file is opened in write mode as UTF-8, so existing contents are
	    replaced, and ``ensure_ascii=False`` keeps non-ASCII text (e.g. Thai)
	    readable instead of escaping it.

	    Args:
	        data: Records to serialise, one per output line.
	        path: Destination file.
	    """
	    with open(path, "w", encoding="utf-8") as sink:
	        sink.writelines(
	            json.dumps(record, ensure_ascii=False) + "\n" for record in data
	        )
	    print(f" saved {len(data):,} items → {path}")


	# ─── Thai Datasets ────────────────────────────────────────────────────────────

	def _first_answer_text(answers):
	    """Return the first answer text from a SQuAD-style ``answers`` value.

	    Two layouts are accepted: a list of ``{"text": ...}`` dicts, and a dict
	    of parallel lists (``{"text": [...], ...}``). Returns ``None`` when
	    there is no answer to take.
	    """
	    if isinstance(answers, dict):
	        texts = answers.get("text") or []
	        if isinstance(texts, str):
	            # A bare string would otherwise be indexed character-wise.
	            texts = [texts]
	        return str(texts[0]) if texts else None
	    if not answers:
	        return None
	    return str(dict(answers[0]).get("text", ""))


	def collect_iapp_wiki_qa() -> list[dict]:
	    """iapp-ai/iapp_wiki_qa_squad — Thai QA from Wikipedia.

	    Loads the ``train`` split, turns every answerable question into a
	    ``{"lang", "source", "context", "question", "answer"}`` record, writes
	    the records to ``thai_iapp_wiki_qa.jsonl`` under ``RAW_DIR`` and
	    returns them.

	    Two row layouts are handled:

	    * nested rows carrying a ``qas`` list of question dicts;
	    * flat rows where the row itself holds ``question`` / ``answers``.

	    NOTE(review): the published dataset card appears to use the flat layout
	    with ``answers`` as a dict of parallel lists, in which case reading only
	    ``qas`` would collect nothing — confirm against the loaded schema.
	    NOTE(review): ``trust_remote_code=True`` allows the dataset repository's
	    loading script to run locally — confirm it is still required.

	    Returns:
	        The collected question/answer records.
	    """
	    print("[Thai] iapp_wiki_qa_squad ...")
	    ds = load_dataset("iapp-ai/iapp_wiki_qa_squad", split="train", trust_remote_code=True)
	    out: list[dict] = []
	    for row in tqdm(ds):
	        row = dict(row)
	        qas = row.get("qas")
	        if not qas and row.get("question") is not None:
	            # Flat row: treat the row itself as its single QA entry.
	            qas = [row]
	        if not isinstance(qas, list):
	            continue
	        context = str(row.get("context", ""))
	        for qa in qas:
	            qa = dict(qa)
	            if qa.get("is_impossible"):
	                continue
	            answer = _first_answer_text(qa.get("answers"))
	            if answer is None:
	                continue
	            out.append({
	                "lang": "th",
	                "source": "iapp_wiki_qa",
	                "context": context,
	                "question": str(qa.get("question", "")),
	                "answer": answer,
	            })
	    save_jsonl(out, RAW_DIR / "thai_iapp_wiki_qa.jsonl")
	    return out


	def collect_thaiqa() -> list[dict]:
	    """pythainlp/thaiqa_squad — Thai QA (SQuAD style).

	    Loads the ``train`` split, keeps the first answer of every row that has
	    one, writes the resulting records to ``thai_thaiqa_squad.jsonl`` under
	    ``RAW_DIR`` and returns them.

	    The answer strings are looked up under ``answers["text"]`` first and
	    under ``answers["answer"]`` as a fallback.

	    NOTE(review): the dataset card appears to name the field ``answer``
	    (with ``answer_begin_position`` / ``answer_end_position``) rather than
	    ``text``; reading only ``text`` would then skip every row — confirm
	    against the loaded schema.
	    NOTE(review): ``trust_remote_code=True`` allows the dataset repository's
	    loading script to run locally — confirm it is still required.

	    Returns:
	        The collected question/answer records.
	    """
	    print("[Thai] thaiqa_squad ...")
	    ds = load_dataset("pythainlp/thaiqa_squad", split="train", trust_remote_code=True)
	    out: list[dict] = []
	    for row in tqdm(ds):
	        row = dict(row)
	        answers = dict(row.get("answers") or {})
	        texts = answers.get("text") or answers.get("answer") or []
	        if isinstance(texts, str):
	            # A bare string would otherwise be indexed character-wise.
	            texts = [texts]
	        if not texts:
	            continue
	        out.append({
	            "lang": "th",
	            "source": "thaiqa_squad",
	            "context": str(row.get("context", "")),
	            "question": str(row.get("question", "")),
	            "answer": str(texts[0]),
	        })
	    save_jsonl(out, RAW_DIR / "thai_thaiqa_squad.jsonl")
	    return out


	# ─── English Datasets ─────────────────────────────────────────────────────────

	def collect_squad() -> list[dict]:
	    """SQuAD v2 — English QA.

	    Iterates the ``train`` split of ``rajpurkar/squad_v2`` and keeps one
	    record per row, using the first entry of ``answers["text"]`` as the
	    answer. Rows whose answer text list is missing or empty are skipped.
	    The records are saved to ``en_squad_v2.jsonl`` under ``RAW_DIR``.

	    Returns:
	        The collected question/answer records.
	    """
	    print("[EN] SQuAD v2 ...")
	    dataset = load_dataset("rajpurkar/squad_v2", split="train")
	    pairs: list[dict] = []
	    for example in tqdm(dataset):
	        record = dict(example)
	        answer_texts = dict(record.get("answers") or {}).get("text") or []
	        if answer_texts:
	            pairs.append(
	                dict(
	                    lang="en",
	                    source="squad_v2",
	                    context=str(record.get("context", "")),
	                    question=str(record.get("question", "")),
	                    answer=str(answer_texts[0]),
	                )
	            )
	    save_jsonl(pairs, RAW_DIR / "en_squad_v2.jsonl")
	    return pairs


	def collect_openhermes() -> list[dict]:
	    """teknium/OpenHermes-2.5 — high-quality instruction/chat pairs.

	    Streams the ``train`` split and extracts every adjacent
	    ``human`` → ``gpt`` turn pair from each conversation, stopping as soon
	    as exactly 50,000 pairs have been collected. The pairs are saved to
	    ``en_openhermes.jsonl`` under ``RAW_DIR``.

	    Returns:
	        At most 50,000 question/answer records with an empty ``context``.
	    """
	    print("[EN] OpenHermes-2.5 (first 50k) ...")
	    ds = load_dataset("teknium/OpenHermes-2.5", split="train", streaming=True)
	    limit = 50_000
	    out: list[dict] = []
	    # The progress bar counts conversations, not pairs, so it is only an
	    # approximate indicator of how close the run is to the pair limit.
	    for row in tqdm(ds, total=limit):
	        if len(out) >= limit:
	            break
	        convs = [dict(turn) for turn in (dict(row).get("conversations") or [])]
	        for asker, replier in zip(convs, convs[1:]):
	            if asker.get("from") == "human" and replier.get("from") == "gpt":
	                out.append({
	                    "lang": "en",
	                    "source": "openhermes",
	                    "context": "",
	                    "question": str(asker.get("value", "")),
	                    "answer": str(replier.get("value", "")),
	                })
	                if len(out) >= limit:
	                    # Enforce the cap mid-conversation too; checking only
	                    # once per row lets multi-turn chats overshoot it.
	                    break
	    save_jsonl(out, RAW_DIR / "en_openhermes.jsonl")
	    return out


	def collect_triviaqa() -> list[dict]:
	    """mandarjoshi/trivia_qa — factual QA.

	    Iterates the ``train`` split of the ``rc`` configuration and emits one
	    context-free record per row. The answer is the first entry of
	    ``answer["aliases"]``; when that list is missing or empty,
	    ``answer["value"]`` is used instead. Rows with an empty answer are
	    skipped. The records are saved to ``en_triviaqa.jsonl`` under
	    ``RAW_DIR``.

	    NOTE(review): the first alias is preferred over ``value`` — confirm
	    this ordering is intended.

	    Returns:
	        The collected question/answer records.
	    """
	    print("[EN] TriviaQA ...")
	    dataset = load_dataset("mandarjoshi/trivia_qa", "rc", split="train")
	    pairs: list[dict] = []
	    for example in tqdm(dataset):
	        record = dict(example)
	        answer_info = dict(record.get("answer") or {})
	        candidates = answer_info.get("aliases")
	        if not candidates:
	            candidates = [answer_info.get("value", "")]
	        chosen = candidates[0]
	        if chosen:
	            pairs.append(
	                dict(
	                    lang="en",
	                    source="triviaqa",
	                    context="",
	                    question=str(record.get("question", "")),
	                    answer=str(chosen),
	                )
	            )
	    save_jsonl(pairs, RAW_DIR / "en_triviaqa.jsonl")
	    return pairs


	# ─── Main ─────────────────────────────────────────────────────────────────────

	if __name__ == "__main__":
	    # Script entry point: run every collector in turn (each one also writes
	    # its own per-source file), then write the combined set and print the
	    # per-language totals.
	    print("=" * 60)
	    print("TinyMind — Data Collection Phase 1A")
	    print("=" * 60)

	    all_data: list[dict] = []
	    for collect in (
	        collect_iapp_wiki_qa,
	        collect_thaiqa,
	        collect_squad,
	        collect_openhermes,
	        collect_triviaqa,
	    ):
	        all_data += collect()

	    save_jsonl(all_data, RAW_DIR / "all_raw.jsonl")
	    th = sum(1 for d in all_data if d["lang"] == "th")
	    en = sum(1 for d in all_data if d["lang"] == "en")
	    # Plain "|" separator: "\|" is an invalid escape sequence in a normal
	    # string literal and would print a stray backslash.
	    print(f"\nTotal: {len(all_data):,} pairs (Thai: {th:,} | EN: {en:,})")

Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Xet Storage Details