from __future__ import annotations import hashlib import json from collections.abc import Iterable from datetime import UTC, datetime from pathlib import Path from typing import Any def write_corpus(nodes: Iterable[dict[str, Any]], out_path: Path) -> Path: """Write nodes as sorted JSONL.""" out_path.parent.mkdir(parents=True, exist_ok=True) sorted_nodes = sorted(nodes, key=lambda n: str(n["id"])) with out_path.open("w", encoding="utf-8") as f: for node in sorted_nodes: f.write(json.dumps(node, default=str)) f.write("\n") return out_path def compute_corpus_sha256(corpus_path: Path) -> str: """SHA256 hex digest of the corpus file.""" return hashlib.sha256(corpus_path.read_bytes()).hexdigest() def write_manifest( *, corpus_path: Path, sources: list[dict[str, str]], scraped_files: int, skipped_files: int, total_nodes: int, errors: list[dict[str, str]], ) -> Path: """Write a manifest JSON alongside the corpus.""" digest = compute_corpus_sha256(corpus_path) manifest = { "generated_at": datetime.now(UTC).isoformat(timespec="seconds"), "sources": sources, "scraped_files": scraped_files, "skipped_files": skipped_files, "total_nodes": total_nodes, "errors": errors, "corpus_sha256": digest, } manifest_path = corpus_path.with_suffix(".manifest.json") manifest_path.parent.mkdir(parents=True, exist_ok=True) manifest_path.write_text( json.dumps(manifest, indent=2), encoding="utf-8" ) return manifest_path