Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /data /scripts /collect_wikimedia_thai.py
| #!/usr/bin/env python | |
| """ | |
| Download and extract a sample from Thai Wikimedia XML dumps. | |
| This is NOT part of the public-domain/CC0 pure core. Wikimedia text is generally | |
| CC BY-SA/GFDL and must remain in a separate open-licensed corpus with attribution | |
| tracking. | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import bz2 | |
| import hashlib | |
| import json | |
| import re | |
| import urllib.request | |
| import xml.etree.ElementTree as ET | |
| from pathlib import Path | |
# Supported Wikimedia projects, keyed by dump name.  Each value is the URL of
# that project's "latest" pages-articles dump (bz2-compressed XML).
PROJECTS = dict(
    thwiki="https://dumps.wikimedia.org/thwiki/latest/thwiki-latest-pages-articles.xml.bz2",
    thwikisource="https://dumps.wikimedia.org/thwikisource/latest/thwikisource-latest-pages-articles.xml.bz2",
)
def sha256_text(text: str) -> str:
    """Return the hexadecimal SHA-256 digest of ``text`` encoded as UTF-8.

    Code points that cannot be encoded (for example lone surrogates) are
    dropped before hashing instead of raising ``UnicodeEncodeError``.
    """
    payload = text.encode("utf-8", errors="ignore")
    hasher = hashlib.sha256()
    hasher.update(payload)
    return hasher.hexdigest()
# An innermost ``{{...}}`` template: its body contains no further ``{{`` or
# ``}}`` (single braces are allowed).  Substituting this repeatedly peels
# nested templates from the inside out.
_INNERMOST_TEMPLATE_RE = re.compile(r"\{\{(?:[^{}]|\{(?!\{)|\}(?!\}))*\}\}")

# Media embeds.  Besides the English ``File:``/``Image:`` prefixes this also
# covers ``ไฟล์:`` and ``ภาพ:``, the Thai-language names used for the same
# namespace on Thai projects.
_MEDIA_PREFIX = r"(?:File|Image|ไฟล์|ภาพ)"

# Media embed whose caption may contain one level of ``[[links]]`` or
# ``[external links]``; the final ``]]`` is the one that closes the embed.
_MEDIA_LINK_RE = re.compile(
    r"\[\[" + _MEDIA_PREFIX + r":(?:[^\[\]]|\[\[[^\[\]]*\]\]|\[[^\[\]]*\])*\]\]",
    flags=re.IGNORECASE,
)

# Lazy fallback for embeds the pattern above cannot balance (deeper nesting,
# stray brackets): cut at the first ``]]``.
_MEDIA_LINK_FALLBACK_RE = re.compile(
    r"\[\[" + _MEDIA_PREFIX + r":.*?\]\]",
    flags=re.IGNORECASE | re.DOTALL,
)


def clean_wikitext(text: str) -> str:
    """Reduce MediaWiki markup to approximate plain text.

    This is a regex-based heuristic, not a wikitext parser.  In order, it:

    1. removes HTML comments;
    2. removes ``{{templates}}``, including nested ones;
    3. removes file/image embeds together with their captions;
    4. replaces ``[[target|label]]`` / ``[[target]]`` with the label/target
       and ``[http://url label]`` with the label;
    5. drops bold/italic quote runs;
    6. removes ``<ref>`` footnotes (self-closing and paired) and then any
       remaining tag;
    7. puts ``== heading ==`` text on its own line and normalises runs of
       spaces/tabs and blank lines.

    Args:
        text: Raw wikitext of one page revision.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    text = re.sub(r"<!--.*?-->", " ", text, flags=re.DOTALL)

    # A single lazy ``{{.*?}}`` stops at the first ``}}`` and leaves the tail
    # of any nested template behind, so strip innermost templates until no
    # more are found.  Each pass removes braces, so the loop terminates.
    replaced = 1
    while replaced:
        text, replaced = _INNERMOST_TEMPLATE_RE.subn(" ", text)

    text = _MEDIA_LINK_RE.sub(" ", text)
    text = _MEDIA_LINK_FALLBACK_RE.sub(" ", text)

    text = re.sub(r"\[\[(?:[^|\]]*\|)?([^\]]+)\]\]", r"\1", text)
    text = re.sub(r"\[https?://[^\s\]]+\s*([^\]]*)\]", r"\1", text)
    text = re.sub(r"'{2,}", "", text)

    # Self-closing refs must go first: otherwise the paired pattern would
    # start at ``<ref name="x" />`` and swallow all prose up to the next
    # ``</ref>``.  ``\b`` keeps ``<references />`` from being treated as the
    # start of a ``<ref>`` for the same reason.
    text = re.sub(r"<ref\b[^>]*/>", " ", text, flags=re.IGNORECASE)
    text = re.sub(r"<ref\b[^>]*>.*?</ref>", " ", text, flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r"<.*?>", " ", text)

    text = re.sub(r"={2,}\s*(.*?)\s*={2,}", r"\n\1\n", text)
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
def download(url: str, path: Path) -> None:
    """Fetch ``url`` into ``path`` unless a non-empty file is already there.

    The body is streamed in 1 MiB chunks into a sibling ``<name>.part`` file
    and renamed onto ``path`` only after the transfer finishes.  Because an
    existing non-empty ``path`` is treated as a finished download, writing
    directly to ``path`` would let an interrupted transfer masquerade as a
    complete dump on the next run.

    Args:
        url: Location to fetch; anything ``urllib.request.urlopen`` accepts.
        path: Final destination.  Parent directories are created as needed.

    Raises:
        Whatever ``urlopen`` or the file operations raise (for example
        ``urllib.error.URLError`` or ``OSError``).  The partial file is
        removed before the exception propagates.
    """
    if path.exists() and path.stat().st_size > 0:
        return
    path.parent.mkdir(parents=True, exist_ok=True)

    partial = path.with_name(path.name + ".part")
    try:
        with urllib.request.urlopen(url, timeout=120) as response, partial.open("wb") as f:
            for chunk in iter(lambda: response.read(1024 * 1024), b""):
                f.write(chunk)
        # Same-directory rename; replaces an existing (empty) ``path``.
        partial.replace(path)
    except BaseException:
        # Also covers KeyboardInterrupt: never leave a truncated file behind.
        try:
            partial.unlink()
        except FileNotFoundError:
            pass
        raise
def iter_pages(path: Path):
    """Stream ``(title, text)`` pairs for main-namespace pages of a dump.

    The bz2-compressed XML file is parsed incrementally.  The XML namespace
    is taken from the document's root element rather than hard-coded, so the
    function does not silently yield nothing when the export schema version
    in the namespace URI differs from the one the script was written against.

    Args:
        path: Path to a bz2-compressed MediaWiki XML export.

    Yields:
        ``(title, text)`` for every ``<page>`` whose ``<ns>`` is ``"0"`` and
        whose first ``<revision>`` has non-empty ``<text>``.
    """
    with bz2.open(path, "rb") as f:
        # "start" events are requested only to get hold of the root element
        # before any page has been parsed.
        context = ET.iterparse(f, events=("start", "end"))
        _, root = next(context, (None, None))
        if root is None:
            return

        # Tags look like "{namespace-uri}local"; reuse whatever the root has.
        ns = root.tag[: root.tag.find("}") + 1] if root.tag.startswith("{") else ""
        page_tag = f"{ns}page"

        for event, elem in context:
            if event != "end" or elem.tag != page_tag:
                continue
            title = elem.findtext(f"{ns}title") or ""
            namespace = elem.findtext(f"{ns}ns") or ""
            revision = elem.find(f"{ns}revision")
            text = ""
            if revision is not None:
                text = revision.findtext(f"{ns}text") or ""
            elem.clear()
            # elem.clear() empties the page but the root still references the
            # emptied element; drop those references so memory stays flat
            # over the whole dump.
            root.clear()
            if namespace == "0" and text:
                yield title, text
def main(argv: list[str] | None = None) -> int:
    """Download a Wikimedia dump and write a cleaned, de-duplicated sample.

    Outputs, all under ``--root``:

    * ``raw/<project>/<project>-latest-pages-articles.xml.bz2`` - the dump
      (downloaded only if not already present);
    * ``jsonl/<project>_open_sample.jsonl`` - one record per accepted page,
      including the cleaned text;
    * ``manifests/<project>_open_manifest.jsonl`` - the same records without
      the ``text`` field;
    * ``manifests/<project>_open_quality_report.json`` - scan/accept counts,
      also printed to stdout.

    A page is accepted when its cleaned text has at least ``--min-chars``
    characters and its SHA-256 has not been seen earlier in this run.
    Scanning stops once ``--limit-pages`` pages have been accepted.

    Args:
        argv: Command-line arguments without the program name.  ``None``
            (the default) makes argparse read ``sys.argv[1:]``.

    Returns:
        Process exit status, always ``0``.
    """
    parser = argparse.ArgumentParser()
    # NOTE(review): the default root is an absolute Windows path, presumably
    # specific to the author's machine; pass --root elsewhere.
    parser.add_argument("--root", default=r"D:\ad\tinymind\data")
    parser.add_argument("--project", choices=list(PROJECTS), default="thwiki")
    parser.add_argument("--limit-pages", type=int, default=1000)
    parser.add_argument("--min-chars", type=int, default=700)
    args = parser.parse_args(argv)

    root = Path(args.root)
    raw_dir = root / "raw" / args.project
    jsonl_dir = root / "jsonl"
    manifest_dir = root / "manifests"
    for folder in [raw_dir, jsonl_dir, manifest_dir]:
        folder.mkdir(parents=True, exist_ok=True)

    dump_path = raw_dir / f"{args.project}-latest-pages-articles.xml.bz2"
    download(PROJECTS[args.project], dump_path)

    out_jsonl = jsonl_dir / f"{args.project}_open_sample.jsonl"
    out_manifest = manifest_dir / f"{args.project}_open_manifest.jsonl"

    # The limit is otherwise only tested after a page has been accepted, so a
    # limit of zero (or less) would still write one record.  Skip the scan
    # entirely in that case.
    pages = iter_pages(dump_path) if args.limit_pages > 0 else iter(())

    seen = set()
    accepted = 0
    scanned = 0
    with out_jsonl.open("w", encoding="utf-8") as data_f, out_manifest.open("w", encoding="utf-8") as manifest_f:
        for title, wiki_text in pages:
            scanned += 1
            clean = clean_wikitext(wiki_text)
            if len(clean) < args.min_chars:
                continue
            digest = sha256_text(clean)
            if digest in seen:
                continue
            seen.add(digest)
            record = {
                "id": f"{args.project}:{title}",
                "source": f"Wikimedia {args.project} dump",
                "source_url": PROJECTS[args.project],
                "license_family": "cc-by-sa/gfdl",
                "title": title,
                "language": "th",
                "content_sha256": digest,
                "text": clean,
            }
            # The manifest row is the record minus the (large) text body.
            manifest = {k: v for k, v in record.items() if k != "text"}
            data_f.write(json.dumps(record, ensure_ascii=False) + "\n")
            manifest_f.write(json.dumps(manifest, ensure_ascii=False) + "\n")
            accepted += 1
            if accepted >= args.limit_pages:
                break

    report = {
        "project": args.project,
        "scanned": scanned,
        "accepted": accepted,
        "jsonl": str(out_jsonl),
        "manifest": str(out_manifest),
        "note": "Open-licensed Thai corpus, not public-domain/CC0 pure core.",
    }
    (manifest_dir / f"{args.project}_open_quality_report.json").write_text(
        json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8"
    )
    print(json.dumps(report, indent=2, ensure_ascii=False))
    return 0
| if __name__ == "__main__": | |
| raise SystemExit(main()) | |
Xet Storage Details
- Size:
- 5.22 kB
- Xet hash:
- 73d8c56a2336800317b0fc7272748fe8822a01c3e35fe8d84615eedf3f1a6025
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.