DGX_AI / codeforge /scraper /writer.py
vasiuuu's picture
Initial commit for CodeForge GRPO training
acf77ab
from __future__ import annotations
import hashlib
import json
from collections.abc import Iterable
from datetime import UTC, datetime
from pathlib import Path
from typing import Any
def write_corpus(nodes: Iterable[dict[str, Any]], out_path: Path) -> Path:
    """Write nodes to ``out_path`` as JSON Lines, ordered by node id.

    Args:
        nodes: Node mappings to serialise. Each must contain an ``"id"``
            key; ordering uses ``str(node["id"])``, so ids compare as text
            (e.g. ``10`` sorts before ``9``).
        out_path: Destination file. Missing parent directories are created
            and an existing file is overwritten.

    Returns:
        ``out_path``, unchanged.

    Raises:
        KeyError: If a node has no ``"id"`` key.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    sorted_nodes = sorted(nodes, key=lambda n: str(n["id"]))
    # newline="\n" turns off platform newline translation; without it the
    # file would contain "\r\n" on Windows, so the same nodes would produce
    # different bytes (and a different corpus hash) depending on the OS.
    with out_path.open("w", encoding="utf-8", newline="\n") as f:
        # default=str: values json cannot encode natively are written as
        # their str() form instead of raising TypeError.
        f.writelines(
            f"{json.dumps(node, default=str)}\n" for node in sorted_nodes
        )
    return out_path
def compute_corpus_sha256(corpus_path: Path) -> str:
    """Return the SHA-256 hex digest of the bytes in ``corpus_path``.

    The file is hashed in fixed-size chunks, so memory use does not grow
    with the size of the corpus.

    Args:
        corpus_path: File to hash; it is opened in binary mode.

    Returns:
        The digest as a lowercase hexadecimal string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    hasher = hashlib.sha256()
    with corpus_path.open("rb") as f:
        # Read 1 MiB at a time; read() returns b"" at end of file, which
        # ends the loop.
        while chunk := f.read(1024 * 1024):
            hasher.update(chunk)
    return hasher.hexdigest()
def write_manifest(
    *,
    corpus_path: Path,
    sources: list[dict[str, str]],
    scraped_files: int,
    skipped_files: int,
    total_nodes: int,
    errors: list[dict[str, str]],
) -> Path:
    """Write a JSON manifest describing a corpus file, next to that file.

    The manifest path is ``corpus_path`` with its final suffix replaced by
    ``.manifest.json`` (``corpus.jsonl`` -> ``corpus.manifest.json``).

    Args:
        corpus_path: Existing corpus file; its SHA-256 digest is recorded
            under ``"corpus_sha256"``.
        sources: Stored verbatim under ``"sources"``.
        scraped_files: Stored verbatim under ``"scraped_files"``.
        skipped_files: Stored verbatim under ``"skipped_files"``.
        total_nodes: Stored verbatim under ``"total_nodes"``.
        errors: Stored verbatim under ``"errors"``.

    Returns:
        Path of the manifest file that was written.
    """
    # Hash first: if the corpus cannot be read, fail before writing anything.
    checksum = compute_corpus_sha256(corpus_path)
    # Timezone-aware UTC timestamp, truncated to whole seconds.
    stamp = datetime.now(UTC).isoformat(timespec="seconds")
    payload = dict(
        generated_at=stamp,
        sources=sources,
        scraped_files=scraped_files,
        skipped_files=skipped_files,
        total_nodes=total_nodes,
        errors=errors,
        corpus_sha256=checksum,
    )
    destination = corpus_path.with_suffix(".manifest.json")
    destination.parent.mkdir(parents=True, exist_ok=True)
    rendered = json.dumps(payload, indent=2)
    destination.write_text(rendered, encoding="utf-8")
    return destination