# Web-viewer residue captured with the file (not part of the script):
# bbkdevops's picture · download · raw · 5.22 kB
#!/usr/bin/env python
"""
Download and extract a sample from Thai Wikimedia XML dumps.
This is NOT part of the public-domain/CC0 pure core. Wikimedia text is generally
CC BY-SA/GFDL and must remain in a separate open-licensed corpus with attribution
tracking.
"""
from __future__ import annotations
import argparse
import bz2
import hashlib
import json
import re
import urllib.request
import xml.etree.ElementTree as ET
from pathlib import Path
# Supported Wikimedia projects, keyed by project name; each value is the URL of
# that project's "latest" pages-articles XML dump (bzip2-compressed).
PROJECTS = dict(
    thwiki="https://dumps.wikimedia.org/thwiki/latest/thwiki-latest-pages-articles.xml.bz2",
    thwikisource="https://dumps.wikimedia.org/thwikisource/latest/thwikisource-latest-pages-articles.xml.bz2",
)
def sha256_text(text: str) -> str:
    """Return the hex SHA-256 digest of *text* encoded as UTF-8.

    Characters that cannot be encoded (e.g. lone surrogates) are dropped
    before hashing rather than raising.
    """
    payload = text.encode("utf-8", errors="ignore")
    hasher = hashlib.sha256()
    hasher.update(payload)
    return hasher.hexdigest()
def clean_wikitext(text: str) -> str:
    """Reduce raw wikitext to approximate plain text.

    This is a regex-based heuristic, not a wikitext parser. In order, it
    removes HTML comments, templates, media links, link markup (keeping the
    visible label), bold/italic quote runs, ``<ref>`` footnotes and any other
    tags, then puts section headings on their own line and normalises
    whitespace.

    Args:
        text: Raw wikitext of one page revision.

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
    """
    # HTML comments may span several lines.
    text = re.sub(r"<!--.*?-->", " ", text, flags=re.DOTALL)

    # Templates nest ({{a|{{b}}}}). Peel them innermost-first so no "}}"
    # debris is left behind; every pass shortens the text, so this terminates.
    innermost_template = re.compile(r"\{\{[^{}]*\}\}")
    while True:
        text, removed = innermost_template.subn(" ", text)
        if not removed:
            break
    # Fallback for templates whose body contains stray single braces.
    text = re.sub(r"\{\{.*?\}\}", " ", text, flags=re.DOTALL)

    # Media links. "ไฟล์" / "ภาพ" are the Thai names of the File namespace.
    # Captions frequently contain links, so first match spans whose inner
    # [[...]] / [...] are balanced, then fall back to "up to the first ]]".
    media_prefix = r"\[\[(?:File|Image|ไฟล์|ภาพ):"
    text = re.sub(
        media_prefix + r"(?:[^\[\]]|\[\[[^\[\]]*\]\]|\[[^\[\]]*\])*\]\]",
        " ",
        text,
        flags=re.IGNORECASE,
    )
    text = re.sub(media_prefix + r".*?\]\]", " ", text, flags=re.IGNORECASE | re.DOTALL)

    # [[target|label]] -> label, [[target]] -> target.
    text = re.sub(r"\[\[(?:[^|\]]*\|)?([^\]]+)\]\]", r"\1", text)
    # [http://example.org label] -> label.
    text = re.sub(r"\[https?://[^\s\]]+\s*([^\]]*)\]", r"\1", text)
    # Bold/italic markers ('' and ''').
    text = re.sub(r"'{2,}", "", text)

    # Self-closing refs (<ref name="x" />) must go first; otherwise the paired
    # pattern would start there and swallow article text up to the next </ref>.
    text = re.sub(r"<ref\b[^>]*/>", " ", text, flags=re.IGNORECASE)
    text = re.sub(r"<ref\b[^>]*>.*?</ref>", " ", text, flags=re.IGNORECASE | re.DOTALL)
    # Any remaining single-line tag.
    text = re.sub(r"<.*?>", " ", text)

    # == Heading == -> heading text on its own line.
    text = re.sub(r"={2,}\s*(.*?)\s*={2,}", r"\n\1\n", text)
    # Collapse horizontal whitespace and runs of blank lines.
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
def download(url: str, path: Path) -> None:
    """Download *url* to *path* unless a non-empty file is already there.

    The body is streamed into a sibling ``<name>.part`` file and renamed onto
    *path* only after the transfer finishes. An interrupted download therefore
    never leaves a truncated file at *path* that a later run would mistake for
    a complete cached copy.

    Args:
        url: Location to fetch; anything ``urllib.request.urlopen`` accepts.
        path: Destination file. Parent directories are created as needed.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        OSError: If the destination cannot be written.
    """
    import shutil  # local import keeps this function self-contained

    # An existing non-empty file is treated as a completed earlier download.
    if path.exists() and path.stat().st_size > 0:
        return
    path.parent.mkdir(parents=True, exist_ok=True)

    partial = path.with_name(path.name + ".part")
    try:
        with urllib.request.urlopen(url, timeout=120) as response, partial.open("wb") as f:
            # Stream in 1 MiB chunks so large dumps never sit in memory.
            shutil.copyfileobj(response, f, 1024 * 1024)
        partial.replace(path)
    except BaseException:
        # Also covers Ctrl-C: drop the incomplete file, then re-raise.
        if partial.exists():
            partial.unlink()
        raise
def iter_pages(path: Path):
    """Yield ``(title, wikitext)`` for main-namespace pages in a bz2 XML dump.

    The dump is parsed incrementally. The XML namespace is read from the root
    element instead of being hard-coded, so dumps using a different export
    schema version are handled as well. Pages whose ``<ns>`` is not ``"0"`` or
    whose revision text is empty are skipped.

    Args:
        path: Path to a ``*.xml.bz2`` MediaWiki export file.

    Yields:
        Tuples of ``(title, text)`` as strings.
    """
    with bz2.open(path, "rb") as f:
        root = None
        ns = ""
        page_tag = "page"
        for event, elem in ET.iterparse(f, events=("start", "end")):
            if root is None:
                # The first "start" event is always the document root; its tag
                # carries the export namespace as "{uri}mediawiki".
                root = elem
                if root.tag.startswith("{"):
                    ns = root.tag[: root.tag.index("}") + 1]
                page_tag = f"{ns}page"
                continue
            if event != "end" or elem.tag != page_tag:
                continue
            title = elem.findtext(f"{ns}title") or ""
            namespace = elem.findtext(f"{ns}ns") or ""
            revision = elem.find(f"{ns}revision")
            text = ""
            if revision is not None:
                text = revision.findtext(f"{ns}text") or ""
            # Free the finished page, and detach it from the root so emptied
            # <page> elements do not pile up while scanning a full dump.
            elem.clear()
            root.clear()
            if namespace == "0" and text:
                yield title, text
def main() -> int:
    """Build a deduplicated JSONL sample from one Wikimedia project dump.

    Steps: parse the command line, download the dump if it is not cached,
    clean each main-namespace page, keep pages with at least ``--min-chars``
    cleaned characters whose content hash has not been seen yet, and stop
    after ``--limit-pages`` accepted pages. Three files are written under
    ``--root``: the sample JSONL, a manifest JSONL (the same records without
    the text) and a JSON quality report, which is also printed.

    A ``--limit-pages`` of zero or less produces empty sample and manifest
    files instead of writing one page before the limit is checked.

    Returns:
        ``0`` on success (used as the process exit status).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--root", default=r"D:\ad\tinymind\data", help="data directory holding raw/, jsonl/ and manifests/"
    )
    parser.add_argument(
        "--project", choices=PROJECTS.keys(), default="thwiki", help="Wikimedia project to sample"
    )
    parser.add_argument("--limit-pages", type=int, default=1000, help="maximum number of pages to accept")
    parser.add_argument("--min-chars", type=int, default=700, help="minimum cleaned length of an accepted page")
    args = parser.parse_args()

    root = Path(args.root)
    raw_dir = root / "raw" / args.project
    jsonl_dir = root / "jsonl"
    manifest_dir = root / "manifests"
    for folder in (raw_dir, jsonl_dir, manifest_dir):
        folder.mkdir(parents=True, exist_ok=True)

    dump_path = raw_dir / f"{args.project}-latest-pages-articles.xml.bz2"
    download(PROJECTS[args.project], dump_path)

    out_jsonl = jsonl_dir / f"{args.project}_open_sample.jsonl"
    out_manifest = manifest_dir / f"{args.project}_open_manifest.jsonl"

    seen = set()  # content hashes already written, for exact-duplicate removal
    accepted = 0
    scanned = 0
    # With a non-positive limit there is nothing to collect, so the dump is
    # not scanned at all; the output files are still (re)created empty.
    pages = iter_pages(dump_path) if args.limit_pages > 0 else ()
    with out_jsonl.open("w", encoding="utf-8") as data_f, out_manifest.open("w", encoding="utf-8") as manifest_f:
        for title, wiki_text in pages:
            scanned += 1
            clean = clean_wikitext(wiki_text)
            if len(clean) < args.min_chars:
                continue
            digest = sha256_text(clean)
            if digest in seen:
                continue
            seen.add(digest)
            record = {
                "id": f"{args.project}:{title}",
                "source": f"Wikimedia {args.project} dump",
                "source_url": PROJECTS[args.project],
                "license_family": "cc-by-sa/gfdl",
                "title": title,
                "language": "th",
                "content_sha256": digest,
                "text": clean,
            }
            # The manifest carries attribution metadata only, not the text.
            manifest = {k: v for k, v in record.items() if k != "text"}
            data_f.write(json.dumps(record, ensure_ascii=False) + "\n")
            manifest_f.write(json.dumps(manifest, ensure_ascii=False) + "\n")
            accepted += 1
            if accepted >= args.limit_pages:
                break

    report = {
        "project": args.project,
        "scanned": scanned,
        "accepted": accepted,
        "jsonl": str(out_jsonl),
        "manifest": str(out_manifest),
        "note": "Open-licensed Thai corpus, not public-domain/CC0 pure core.",
    }
    report_json = json.dumps(report, indent=2, ensure_ascii=False)
    (manifest_dir / f"{args.project}_open_quality_report.json").write_text(report_json, encoding="utf-8")
    print(report_json)
    return 0
if __name__ == "__main__":
    # Run only when executed as a script; main()'s return value becomes the
    # process exit status.
    exit_status = main()
    raise SystemExit(exit_status)

# Web-viewer residue captured with the file (not part of the script):
# Xet Storage Details
#
# Size: 5.22 kB · Xet hash: 73d8c56a2336800317b0fc7272748fe8822a01c3e35fe8d84615eedf3f1a6025
#
# Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.