"""hf-loader — Hugging Face dataset loader STUB behind the SAME contract seam.

Non-negotiable #3: a future loader drops in behind the identical signature

    load(path) -> {'events': [Event...], 'turns': [Turn...], 'session': {...}}

so the engine, which consumes only the normalized contract shapes, needs ZERO
changes when traces come from HF instead of local JSONL.

This module is a STUB: it defines the contract and the safety policy and raises
NotImplementedError on any operation that would actually touch the network with
trace content.

SAFETY POLICY (Non-negotiable #2 — 100% local, no egress of trace content, ever):

  1. DEFAULT-PRIVATE. Any repo this loader would ever create is private by
     default. Publishing is an explicit, separate, opt-in step — never a side
     effect of load.
  2. SCAN-BEFORE-PUBLISH. No trace may leave the machine until a secret/source
     scan has run and passed. `scan_before_publish()` is the placeholder gate;
     until a real scanner is wired it refuses (raises) rather than allowing
     egress. `publish()` additionally refuses when the gate returns anything
     other than a passing (truthy) result.
  3. NO NETWORK EGRESS OF TRACE CONTENT. This stub performs no fetch and no
     upload. `load()` and `publish()` raise NotImplementedError. Wiring a real
     fetch is a future, reviewed change that must keep these guarantees.

Once a real fetch exists it MUST normalize HF rows into the exact same
Event/Turn shapes (see engine.contract) — ideally by reusing the JSONL
normalization once an HF record yields the per-line objects — so the engine
stays loader-agnostic.
"""

from __future__ import annotations

from typing import Any

# --------------------------------------------------------------------------- #
# safety policy constants (documented contract, enforced by the helpers below)
# --------------------------------------------------------------------------- #

#: Repos this loader would create are private unless an explicit publish step
#: flips this AFTER a passing scan. Load never publishes.
DEFAULT_PRIVATE = True

#: Until a real scanner is wired, egress is refused.
#: "No scanner" == "do not send".
SCANNER_AVAILABLE = False


class HFEgressBlocked(RuntimeError):
    """Raised when an operation would move trace content off-machine without a
    passing scan, or before the default-private/scan-before-publish gates exist.
    """


# --------------------------------------------------------------------------- #
# the seam — identical signature to engine.loaders.jsonl_loader.load
# --------------------------------------------------------------------------- #


def load(path: str) -> dict[str, Any]:
    """Load a trace from a Hugging Face dataset reference, returning the contract.

    Same signature and same return shape as the jsonl-loader, so the engine is
    loader-agnostic:

        {'events': [Event...], 'turns': [Turn...], 'session': {...}}

    STUB: an actual fetch would pull trace content over the network. That is
    not implemented here and must be a separate, reviewed change that preserves
    the default-private / scan-before-publish / no-egress guarantees. A real
    implementation should normalize HF records into the SAME Event/Turn shapes
    (reuse the JSONL normalization where possible) so nothing downstream
    changes.

    Args:
        path: Reference to the trace to load. Unused by this stub.

    Raises:
        NotImplementedError: Always; this stub never fetches anything.
    """
    raise NotImplementedError(
        "hf_loader.load is a stub: fetching trace content over the network is not "
        "implemented. Use engine.loaders.jsonl_loader.load for local sessions. A "
        "real HF fetch must normalize into the same Event/Turn contract and respect "
        "the default-private / scan-before-publish / no-egress policy."
    )


# --------------------------------------------------------------------------- #
# publish-path safety placeholders (no-op stubs; egress is refused by default)
# --------------------------------------------------------------------------- #


def scan_before_publish(path: str) -> bool:
    """Placeholder secret/source scanner gate. MUST pass before any publish.

    Returns True only if a real scanner ran and found nothing that must not
    leave the machine. Until a scanner is wired (`SCANNER_AVAILABLE` is False),
    this refuses by raising — "no scanner" means "do not send", never "send
    anyway".

    Args:
        path: Location of the trace that would be scanned. Unused by this stub.

    Returns:
        Nothing yet: every code path in this stub raises.

    Raises:
        NotImplementedError: Always. One message covers the case where no
            scanner is wired, the other the case where the flag says a scanner
            is available but the scan itself is still unimplemented.
    """
    if not SCANNER_AVAILABLE:
        raise NotImplementedError(
            "scan_before_publish is a stub: no secret/source scanner is wired. "
            "Trace content must not leave the machine until a real scan passes."
        )
    # Flipping SCANNER_AVAILABLE alone must not open the gate: there is still
    # no scanner implementation behind it.
    raise NotImplementedError("scan_before_publish: real scanner not implemented.")


def publish(path: str, *, private: bool = DEFAULT_PRIVATE) -> None:
    """Publish a trace to HF. STUB — performs NO network egress.

    Even once implemented, publishing must be default-private and gated on a
    passing `scan_before_publish`. This stub raises before any bytes could
    leave.

    Args:
        path: Location of the trace that would be published.
        private: Must stay truthy; a non-private publish is refused outright.

    Raises:
        HFEgressBlocked: If `private` is falsy, or if `scan_before_publish`
            returns without reporting a passing scan.
        NotImplementedError: Propagated from `scan_before_publish` while no
            scanner exists, or raised here because uploading is unimplemented.
    """
    if not private:
        raise HFEgressBlocked(
            "Refusing a non-private publish: traces carry secrets and source; "
            "publishing must be default-private (Non-negotiable #2)."
        )
    # Gate on the scan even though the fetch/upload is unimplemented, so the
    # ordering of guarantees is explicit in code, not just in docs. The result
    # is checked, not just the absence of an exception: a scanner that returns
    # False (or None) must block egress rather than fall through.
    if not scan_before_publish(path):
        raise HFEgressBlocked(
            "Refusing to publish: scan_before_publish did not pass; trace "
            "content must not leave the machine."
        )
    raise NotImplementedError(
        "hf_loader.publish is a stub: uploading trace content is not implemented "
        "and must remain default-private + scan-gated when it is."
    )