bbkdevops's picture
download
raw
1.7 kB
#!/usr/bin/env python
from __future__ import annotations
import argparse
import json
from collections import Counter
from pathlib import Path
def main(argv: list[str] | None = None) -> int:
    """Audit the JSONL dataset under ``--root`` and write a summary manifest.

    Every ``*.jsonl`` file directly inside ``<root>/jsonl`` is read line by
    line; blank lines are skipped and each remaining line is parsed as one
    JSON record. Per-file and overall record/character counts are collected,
    together with tallies of the ``source`` and ``license_family`` fields
    (a record lacking a field is counted under ``"unknown"``). The summary is
    written to ``<root>/manifests/dataset_audit.json`` and echoed to stdout.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) lets
            argparse read ``sys.argv[1:]``, which is the original behavior.

    Returns:
        ``0`` once the manifest has been written and printed.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--root", default=r"D:\ad\tinymind\data")
    args = parser.parse_args(argv)

    root = Path(args.root)
    jsonl_dir = root / "jsonl"
    manifests = root / "manifests"

    sources: Counter = Counter()
    licenses: Counter = Counter()
    jsonl_files = []
    total_records = 0
    total_chars = 0

    # glob() yields entries in arbitrary, filesystem-dependent order; sorting
    # keeps the manifest identical across runs over the same data.
    for file in sorted(jsonl_dir.glob("*.jsonl")):
        file_records = 0
        file_chars = 0
        with file.open("r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                item = json.loads(line)
                text = item.get("text", "")
                file_records += 1
                # A record may carry ``"text": null`` or a non-string value;
                # count it as a record but contribute no characters instead
                # of crashing on len().
                if isinstance(text, str):
                    file_chars += len(text)
                sources[item.get("source", "unknown")] += 1
                licenses[item.get("license_family", "unknown")] += 1
        jsonl_files.append(
            {"path": str(file), "records": file_records, "chars": file_chars}
        )
        total_records += file_records
        total_chars += file_chars

    # Counters are converted to plain dicts for the JSON manifest; key order
    # matches the original output layout.
    stats = {
        "root": str(root),
        "jsonl_files": jsonl_files,
        "records": total_records,
        "chars": total_chars,
        "sources": dict(sources),
        "licenses": dict(licenses),
    }

    out = manifests / "dataset_audit.json"
    manifests.mkdir(parents=True, exist_ok=True)
    rendered = json.dumps(stats, indent=2, ensure_ascii=False)
    out.write_text(rendered, encoding="utf-8")
    print(rendered)
    return 0
# Script entry point: run the audit and hand its return value to the
# interpreter as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)

Xet Storage Details

Size:
1.7 kB
·
Xet hash:
0f5b2828185199f93d9d2908d6f5c2fd37eafe28bda703f9707a3f6c62f6fc89

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.