Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /data /scripts /collect_wikimedia_thai.py

bbkdevops

about 1 month ago

download

raw

5.22 kB

	#!/usr/bin/env python
	"""
	Download and extract a sample from Thai Wikimedia XML dumps.

	This is NOT part of the public-domain/CC0 pure core. Wikimedia text is generally
	CC BY-SA/GFDL and must remain in a separate open-licensed corpus with attribution
	tracking.
	"""

	from __future__ import annotations

	import argparse
	import bz2
	import hashlib
	import json
	import re
	import urllib.request
	import xml.etree.ElementTree as ET
	from pathlib import Path


	# Supported Wikimedia projects, keyed by dump name, mapped to the URL of the
	# latest "pages-articles" bzip2-compressed XML dump for that project.
	PROJECTS = dict(
	    thwiki="https://dumps.wikimedia.org/thwiki/latest/thwiki-latest-pages-articles.xml.bz2",
	    thwikisource="https://dumps.wikimedia.org/thwikisource/latest/thwikisource-latest-pages-articles.xml.bz2",
	)


	def sha256_text(text: str) -> str:
	    """Return the hexadecimal SHA-256 digest of *text* encoded as UTF-8.

	    Characters that cannot be encoded (for example lone surrogates) are
	    dropped before hashing instead of raising.
	    """
	    payload = text.encode("utf-8", errors="ignore")
	    hasher = hashlib.sha256()
	    hasher.update(payload)
	    return hasher.hexdigest()


	# Patterns used by clean_wikitext, compiled once at import time.
	_COMMENT_RE = re.compile(r"<!--.*?-->", re.DOTALL)
	# Innermost template: a {{...}} pair with no braces inside it.
	_INNER_TEMPLATE_RE = re.compile(r"\{\{[^{}]*\}\}")
	# Fallback for templates whose body contains stray single braces.
	_TEMPLATE_RE = re.compile(r"\{\{.*?\}\}", re.DOTALL)
	# File/Image links, including the Thai namespace names used on Thai projects.
	_MEDIA_PREFIX = r"\[\[(?:File|Image|ไฟล์|ภาพ):"
	# Media link whose caption may contain one level of nested [[links]].
	_MEDIA_NESTED_RE = re.compile(
	    _MEDIA_PREFIX + r"(?:[^\[\]]|\[\[[^\[\]]*\]\])*\]\]", re.IGNORECASE
	)
	# Fallback: everything up to the first "]]" (for unbalanced/odd markup).
	_MEDIA_RE = re.compile(_MEDIA_PREFIX + r".*?\]\]", re.IGNORECASE | re.DOTALL)
	# [[target|label]] -> label, [[target]] -> target.
	_WIKILINK_RE = re.compile(r"\[\[(?:[^|\]]*\|)?([^\]]+)\]\]")
	# [https://example.org label text] -> label text.
	_EXTLINK_RE = re.compile(r"\[https?://[^\s\]]+\s+([^\]]+)\]")
	_QUOTES_RE = re.compile(r"'{2,}")
	# <ref name="x" /> has no closing tag; it must go before the paired form so
	# the paired pattern cannot run from it to a later, unrelated </ref>.
	_SELF_CLOSING_REF_RE = re.compile(r"<ref\b[^>]*?/\s*>", re.IGNORECASE)
	_REF_RE = re.compile(r"<ref\b.*?</ref>", re.IGNORECASE | re.DOTALL)
	_TAG_RE = re.compile(r"<.*?>")
	# "== Heading ==" on its own line (any level >= 2).
	_HEADING_RE = re.compile(
	    r"^[ \t]*={2,}[ \t]*(.*?)[ \t]*={2,}[ \t]*$", re.MULTILINE
	)
	_SPACES_RE = re.compile(r"[ \t]+")
	_BLANK_LINES_RE = re.compile(r"\n{3,}")


	def clean_wikitext(text: str) -> str:
	    """Reduce MediaWiki markup to approximate plain text.

	    The steps run in a fixed order: HTML comments, templates, file/image
	    links, internal links, labelled external links, bold/italic quote
	    runs, ``<ref>`` footnotes, remaining tags, section headings, and
	    finally whitespace normalisation. This is a heuristic regex cleaner,
	    not a wikitext parser; tables and other constructs are left as-is.

	    Args:
	        text: Raw wikitext of one page revision.

	    Returns:
	        The cleaned text with surrounding whitespace stripped.
	    """
	    text = _COMMENT_RE.sub(" ", text)

	    # Peel templates from the inside out so nested ones such as
	    # "{{infobox|x={{nested}}}}" disappear completely instead of leaving
	    # the outer template's tail behind.
	    while True:
	        text, removed = _INNER_TEMPLATE_RE.subn(" ", text)
	        if not removed:
	            break
	    text = _TEMPLATE_RE.sub(" ", text)

	    # Drop media links before generic link handling, otherwise their
	    # "thumb|caption" parameters would be kept as if they were a link label.
	    text = _MEDIA_NESTED_RE.sub(" ", text)
	    text = _MEDIA_RE.sub(" ", text)

	    text = _WIKILINK_RE.sub(r"\1", text)
	    text = _EXTLINK_RE.sub(r"\1", text)
	    text = _QUOTES_RE.sub("", text)

	    text = _SELF_CLOSING_REF_RE.sub(" ", text)
	    text = _REF_RE.sub(" ", text)
	    text = _TAG_RE.sub(" ", text)

	    # Keep the heading text on its own line, without the "=" markers.
	    text = _HEADING_RE.sub(r"\n\1\n", text)

	    text = _SPACES_RE.sub(" ", text)
	    text = _BLANK_LINES_RE.sub("\n\n", text)
	    return text.strip()


	def download(url: str, path: Path) -> None:
	    """Fetch *url* into *path* unless a non-empty file is already there.

	    The body is streamed in 1 MiB chunks into a sibling ``<name>.part``
	    file that is renamed onto *path* only after the transfer finishes.
	    An interrupted download therefore never leaves a truncated file at
	    *path* that a later run would mistake for a complete one.

	    Args:
	        url: Location to fetch; opened with ``urllib.request.urlopen``.
	        path: Destination file. Parent directories are created as needed.

	    Raises:
	        OSError: On network or file-system failure (``urllib.error.URLError``
	            is an ``OSError`` subclass). The partial file is removed first.
	    """
	    if path.exists() and path.stat().st_size > 0:
	        return
	    path.parent.mkdir(parents=True, exist_ok=True)
	    partial = path.with_name(path.name + ".part")
	    chunk_size = 1024 * 1024
	    try:
	        with urllib.request.urlopen(url, timeout=120) as response, partial.open("wb") as f:
	            for chunk in iter(lambda: response.read(chunk_size), b""):
	                f.write(chunk)
	        # Same-directory rename: the final path appears only when complete.
	        partial.replace(path)
	    except BaseException:
	        # Also covers Ctrl-C: do not leave a half-written file behind.
	        if partial.exists():
	            partial.unlink()
	        raise


	def iter_pages(path: Path):
	    """Stream main-namespace pages from a bzip2-compressed MediaWiki XML dump.

	    The XML namespace is read from the document's root element rather than
	    hard-coded, so dumps using a different export schema version are parsed
	    the same way instead of silently yielding nothing.

	    Args:
	        path: Path to the ``.xml.bz2`` dump file.

	    Yields:
	        ``(title, text)`` tuples for every ``<page>`` whose ``<ns>`` is
	        ``"0"`` and whose ``<revision>/<text>`` is non-empty.
	    """
	    with bz2.open(path, "rb") as f:
	        root = None
	        ns = ""
	        for event, elem in ET.iterparse(f, events=("start", "end")):
	            if root is None:
	                # The first event is the start of the root element; its tag
	                # looks like "{namespace-uri}mediawiki" when namespaced.
	                root = elem
	                if elem.tag.startswith("{"):
	                    ns = elem.tag[: elem.tag.index("}") + 1]
	                continue
	            if event != "end" or elem.tag != f"{ns}page":
	                continue
	            title = elem.findtext(f"{ns}title") or ""
	            namespace = elem.findtext(f"{ns}ns") or ""
	            revision = elem.find(f"{ns}revision")
	            text = ""
	            if revision is not None:
	                text = revision.findtext(f"{ns}text") or ""
	            # Detach finished pages from the root as well; clearing only the
	            # page would leave one empty element per page attached to it.
	            root.clear()
	            if namespace == "0" and text:
	                yield title, text


	def main() -> int:
	    """Build a de-duplicated JSONL sample from one Wikimedia dump.

	    Parses the command line, downloads the selected dump into
	    ``<root>/raw/<project>/`` (skipped when already present), streams its
	    pages, cleans each one, drops pages shorter than ``--min-chars`` or
	    whose cleaned text was already seen, and writes:

	    * ``<root>/jsonl/<project>_open_sample.jsonl`` - full records,
	    * ``<root>/manifests/<project>_open_manifest.jsonl`` - the same
	      records without the ``text`` field,
	    * ``<root>/manifests/<project>_open_quality_report.json`` - run summary,
	      which is also printed to stdout.

	    Returns:
	        ``0`` on completion, intended as the process exit status.
	    """
	    parser = argparse.ArgumentParser()
	    # NOTE(review): the default root is a machine-specific Windows path;
	    # pass --root explicitly on other machines.
	    parser.add_argument("--root", default=r"D:\ad\tinymind\data")
	    parser.add_argument("--project", choices=sorted(PROJECTS), default="thwiki")
	    parser.add_argument("--limit-pages", type=int, default=1000)
	    parser.add_argument("--min-chars", type=int, default=700)
	    args = parser.parse_args()

	    root = Path(args.root)
	    raw_dir = root / "raw" / args.project
	    jsonl_dir = root / "jsonl"
	    manifest_dir = root / "manifests"
	    for folder in [raw_dir, jsonl_dir, manifest_dir]:
	        folder.mkdir(parents=True, exist_ok=True)

	    dump_path = raw_dir / f"{args.project}-latest-pages-articles.xml.bz2"
	    download(PROJECTS[args.project], dump_path)

	    out_jsonl = jsonl_dir / f"{args.project}_open_sample.jsonl"
	    out_manifest = manifest_dir / f"{args.project}_open_manifest.jsonl"

	    # A non-positive limit means "accept nothing". The limit is otherwise
	    # only checked after an accept, which would let one record through.
	    pages = iter_pages(dump_path) if args.limit_pages > 0 else iter(())

	    seen = set()  # SHA-256 digests of cleaned texts already written
	    accepted = 0
	    scanned = 0
	    with out_jsonl.open("w", encoding="utf-8") as data_f, out_manifest.open("w", encoding="utf-8") as manifest_f:
	        for title, wiki_text in pages:
	            scanned += 1
	            clean = clean_wikitext(wiki_text)
	            if len(clean) < args.min_chars:
	                continue
	            digest = sha256_text(clean)
	            if digest in seen:
	                continue
	            seen.add(digest)
	            record = {
	                "id": f"{args.project}:{title}",
	                "source": f"Wikimedia {args.project} dump",
	                "source_url": PROJECTS[args.project],
	                "license_family": "cc-by-sa/gfdl",
	                "title": title,
	                "language": "th",
	                "content_sha256": digest,
	                "text": clean,
	            }
	            # The manifest row is the record minus the (large) text body.
	            manifest = {k: v for k, v in record.items() if k != "text"}
	            data_f.write(json.dumps(record, ensure_ascii=False) + "\n")
	            manifest_f.write(json.dumps(manifest, ensure_ascii=False) + "\n")
	            accepted += 1
	            if accepted >= args.limit_pages:
	                break

	    report = {
	        "project": args.project,
	        "scanned": scanned,
	        "accepted": accepted,
	        "jsonl": str(out_jsonl),
	        "manifest": str(out_manifest),
	        "note": "Open-licensed Thai corpus, not public-domain/CC0 pure core.",
	    }
	    (manifest_dir / f"{args.project}_open_quality_report.json").write_text(
	        json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8"
	    )
	    print(json.dumps(report, indent=2, ensure_ascii=False))
	    return 0


	# Script entry point: run main() and use its return value as the exit status.
	if __name__ == "__main__":
	    exit_status = main()
	    raise SystemExit(exit_status)

Xet Storage Details

Size: 5.22 kB
Xet hash: 73d8c56a2336800317b0fc7272748fe8822a01c3e35fe8d84615eedf3f1a6025

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.