Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

30 days ago

1.7 kB

	#!/usr/bin/env python
	from __future__ import annotations

	import argparse
	import json
	from collections import Counter
	from pathlib import Path


	def main(argv: list[str] | None = None) -> int:
	    """Audit the JSONL dataset under ``--root`` and write a summary manifest.

	    Every ``*.jsonl`` file in ``<root>/jsonl`` is read line by line. Blank
	    lines are skipped; each remaining line is parsed as one JSON record.
	    Per-file and overall record/character counts are collected together
	    with tallies of the ``source`` and ``license_family`` fields (missing
	    values are counted as ``"unknown"``). The summary is written to
	    ``<root>/manifests/dataset_audit.json`` and echoed to stdout.

	    Args:
	        argv: Command-line arguments to parse. ``None`` (the default) makes
	            argparse read ``sys.argv[1:]``, which is the original behavior.

	    Returns:
	        ``0`` on success.

	    Raises:
	        json.JSONDecodeError: If a non-blank line is not valid JSON. The
	            message is prefixed with the offending file path and line number.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--root", default=r"D:\ad\tinymind\data")
	    args = parser.parse_args(argv)

	    root = Path(args.root)
	    jsonl_dir = root / "jsonl"
	    manifests = root / "manifests"

	    sources: Counter = Counter()
	    licenses: Counter = Counter()
	    per_file = []
	    total_records = 0
	    total_chars = 0

	    # glob() yields paths in arbitrary order; sort so that the manifest is
	    # reproducible from one run to the next.
	    for jsonl_path in sorted(jsonl_dir.glob("*.jsonl")):
	        file_records = 0
	        file_chars = 0
	        with jsonl_path.open("r", encoding="utf-8") as handle:
	            for lineno, line in enumerate(handle, start=1):
	                if not line.strip():
	                    continue
	                try:
	                    item = json.loads(line)
	                except json.JSONDecodeError as err:
	                    # Same exception type, but say where the bad record is.
	                    raise json.JSONDecodeError(
	                        f"{jsonl_path}:{lineno}: {err.msg}", err.doc, err.pos
	                    ) from err
	                text = item.get("text", "")
	                file_records += 1
	                # "text" may be null or a non-string value; only real
	                # strings contribute to the character count.
	                if isinstance(text, str):
	                    file_chars += len(text)
	                sources[item.get("source", "unknown")] += 1
	                licenses[item.get("license_family", "unknown")] += 1
	        per_file.append(
	            {"path": str(jsonl_path), "records": file_records, "chars": file_chars}
	        )
	        total_records += file_records
	        total_chars += file_chars

	    # Key order is kept as in the original manifest layout.
	    stats = {
	        "root": str(root),
	        "jsonl_files": per_file,
	        "records": total_records,
	        "chars": total_chars,
	        "sources": dict(sources),
	        "licenses": dict(licenses),
	    }

	    rendered = json.dumps(stats, indent=2, ensure_ascii=False)
	    out = manifests / "dataset_audit.json"
	    manifests.mkdir(parents=True, exist_ok=True)
	    out.write_text(rendered, encoding="utf-8")
	    print(rendered)
	    return 0


	# Run the audit only when executed as a script; main()'s return value
	# becomes the process exit status.
	if __name__ == "__main__":
	    raise SystemExit(main())

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.