Spaces:
Running on Zero
Running on Zero
"""hf-loader — Hugging Face dataset loader STUB behind the SAME contract seam.

Non-negotiable #3: a future loader drops in behind the identical signature

    load(path) -> {'events': [Event...], 'turns': [Turn...], 'session': {...}}

so the engine, which consumes only the normalized contract shapes, needs ZERO
changes when traces come from HF instead of local JSONL. This module is a STUB:
it defines the contract and the safety policy and raises NotImplementedError on any
operation that would actually touch the network with trace content.

SAFETY POLICY (Non-negotiable #2 — 100% local, no egress of trace content, ever):

1. DEFAULT-PRIVATE. Any repo this loader would ever create is private by default.
   Publishing is an explicit, separate, opt-in step — never a side effect of load.
2. SCAN-BEFORE-PUBLISH. No trace may leave the machine until a secret/source scan
   has run and passed. `scan_before_publish()` is the placeholder gate; until a
   real scanner is wired it refuses (raises) rather than allowing egress.
3. NO NETWORK EGRESS OF TRACE CONTENT. This stub performs no fetch and no upload.
   `load()` and `publish()` raise NotImplementedError. Wiring a real fetch is a
   future, reviewed change that must keep these guarantees.

Once a real fetch exists it MUST normalize HF rows into the exact same Event/Turn
shapes (see engine.contract) — ideally by reusing the JSONL normalization once an
HF record yields the per-line objects — so the engine stays loader-agnostic.
"""
| from __future__ import annotations | |
| from typing import Any | |
# --------------------------------------------------------------------------- #
# safety policy constants (documented contract, enforced by the helpers below)
# --------------------------------------------------------------------------- #

#: Repos this loader would create are private unless an explicit publish step
#: flips this AFTER a passing scan. Load never publishes.
DEFAULT_PRIVATE: bool = True

#: Until a real scanner is wired, egress is refused. "No scanner" == "do not send".
SCANNER_AVAILABLE: bool = False
class HFEgressBlocked(RuntimeError):
    """Signal that trace content was about to leave the machine unsafely.

    Raised when an operation would move trace content off-machine without a
    passing scan, or before the default-private / scan-before-publish gates
    exist. It derives from ``RuntimeError``, so handlers written for
    ``RuntimeError`` also catch it.
    """
# --------------------------------------------------------------------------- #
# the seam — identical signature to engine.loaders.jsonl_loader.load
# --------------------------------------------------------------------------- #

def load(path: str) -> dict[str, Any]:
    """Load a trace from a Hugging Face dataset reference, returning the contract.

    Same signature and same return shape as the jsonl-loader, so the engine is
    loader-agnostic:

        {'events': [Event...], 'turns': [Turn...], 'session': {...}}

    STUB: an actual fetch would pull trace content over the network. That is not
    implemented here and must be a separate, reviewed change that preserves the
    default-private / scan-before-publish / no-egress guarantees. A real
    implementation should normalize HF records into the SAME Event/Turn shapes
    (reuse the JSONL normalization where possible) so nothing downstream changes.

    Args:
        path: Reference to the trace to load. Currently unused: the stub raises
            before looking at it.

    Returns:
        Never returns in this stub; the annotation documents the contract a
        real implementation must honor.

    Raises:
        NotImplementedError: Always, because no network fetch is implemented.
    """
    raise NotImplementedError(
        "hf_loader.load is a stub: fetching trace content over the network is not "
        "implemented. Use engine.loaders.jsonl_loader.load for local sessions. A "
        "real HF fetch must normalize into the same Event/Turn contract and respect "
        "the default-private / scan-before-publish / no-egress policy."
    )
# --------------------------------------------------------------------------- #
# publish-path safety placeholders (no-op stubs; egress is refused by default)
# --------------------------------------------------------------------------- #

def scan_before_publish(path: str) -> bool:
    """Placeholder secret/source scanner gate. MUST pass before any publish.

    Returns True only if a real scanner ran and found nothing that must not leave
    the machine. Until a scanner is wired (`SCANNER_AVAILABLE` is False), this
    refuses by raising — "no scanner" means "do not send", never "send anyway".

    Args:
        path: Reference to the trace that would be scanned. Currently unused:
            both code paths raise before inspecting it.

    Returns:
        Never returns in this stub; the ``bool`` annotation documents the
        contract a real scanner must honor.

    Raises:
        NotImplementedError: Always. One message covers the "no scanner wired"
            case; a second covers the case where `SCANNER_AVAILABLE` has been
            flipped but no scanner logic exists yet.
    """
    if not SCANNER_AVAILABLE:
        raise NotImplementedError(
            "scan_before_publish is a stub: no secret/source scanner is wired. "
            "Trace content must not leave the machine until a real scan passes."
        )
    # Flipping the flag alone is not enough: there is still no scanner to run,
    # so keep refusing rather than report a pass that never happened.
    raise NotImplementedError("scan_before_publish: real scanner not implemented.")
def publish(path: str, *, private: bool = DEFAULT_PRIVATE) -> None:
    """Publish a trace to HF. STUB — performs NO network egress.

    Even once implemented, publishing must be default-private and gated on a
    passing `scan_before_publish`. This stub raises before any bytes could leave.

    Args:
        path: Reference to the trace that would be published; it is passed to
            `scan_before_publish`.
        private: Keyword-only. Must be truthy; a falsy value is refused outright.

    Raises:
        HFEgressBlocked: If ``private`` is falsy, or if `scan_before_publish`
            returns a falsy (non-passing) result.
        NotImplementedError: Propagated from `scan_before_publish` while it is a
            stub, and raised here otherwise because upload is not implemented.
    """
    if not private:
        raise HFEgressBlocked(
            "Refusing a non-private publish: traces carry secrets and source; "
            "publishing must be default-private (Non-negotiable #2)."
        )
    # Gate on the scan even though the fetch/upload is unimplemented, so the
    # ordering of guarantees is explicit in code, not just in docs. The result
    # is checked rather than discarded: a scanner that reports failure by
    # returning False must block egress just like one that raises.
    if not scan_before_publish(path):
        raise HFEgressBlocked(
            "Refusing to publish: scan_before_publish did not pass; trace "
            "content must not leave the machine without a passing scan."
        )
    raise NotImplementedError(
        "hf_loader.publish is a stub: uploading trace content is not implemented "
        "and must remain default-private + scan-gated when it is."
    )