Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /data /scripts /audit_dataset.py
| #!/usr/bin/env python | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| from collections import Counter | |
| from pathlib import Path | |
def main(argv: list[str] | None = None) -> int:
    """Audit the JSONL dataset under ``--root`` and write a summary manifest.

    Every ``*.jsonl`` file in ``<root>/jsonl`` is read line by line; blank
    lines are skipped and every other line must hold one JSON object. Per-file
    and overall record/character counts are collected together with tallies of
    the ``source`` and ``license_family`` fields (``"unknown"`` when a field
    is absent). The report is written to
    ``<root>/manifests/dataset_audit.json`` and echoed to stdout.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) makes
            argparse fall back to ``sys.argv[1:]``.

    Returns:
        ``0`` once the manifest has been written.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        ValueError: If a line parses to something other than a JSON object.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--root", default=r"D:\ad\tinymind\data")
    args = parser.parse_args(argv)

    root = Path(args.root)
    jsonl_dir = root / "jsonl"
    manifests = root / "manifests"

    sources = Counter()
    licenses = Counter()
    jsonl_files = []
    total_records = 0
    total_chars = 0

    # glob() yields entries in arbitrary, filesystem-dependent order; sorting
    # keeps the manifest reproducible across runs and machines.
    for file in sorted(jsonl_dir.glob("*.jsonl")):
        file_records = 0
        file_chars = 0
        with file.open("r", encoding="utf-8") as f:
            for line_no, line in enumerate(f, start=1):
                if not line.strip():
                    continue
                item = json.loads(line)
                if not isinstance(item, dict):
                    raise ValueError(
                        f"{file}:{line_no}: expected a JSON object, "
                        f"got {type(item).__name__}"
                    )
                text = item.get("text", "")
                file_records += 1
                # "text" may be null or a non-string value; such records are
                # still counted but contribute no characters.
                if isinstance(text, str):
                    file_chars += len(text)
                sources[item.get("source", "unknown")] += 1
                licenses[item.get("license_family", "unknown")] += 1
        jsonl_files.append(
            {"path": str(file), "records": file_records, "chars": file_chars}
        )
        total_records += file_records
        total_chars += file_chars

    # Counters are converted to plain dicts so the report is ordinary JSON data.
    stats = {
        "root": str(root),
        "jsonl_files": jsonl_files,
        "records": total_records,
        "chars": total_chars,
        "sources": dict(sources),
        "licenses": dict(licenses),
    }

    report = json.dumps(stats, indent=2, ensure_ascii=False)
    manifests.mkdir(parents=True, exist_ok=True)
    out = manifests / "dataset_audit.json"
    out.write_text(report, encoding="utf-8")
    print(report)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
Xet Storage Details
- Size:
- 1.7 kB
- Xet hash:
- 0f5b2828185199f93d9d2908d6f5c2fd37eafe28bda703f9707a3f6c62f6fc89
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.