# File: her/engine/loaders/hf_loader.py
# Author (page avatar): geekwrestler
# Commit message: Squash history (purge pre-scrub demo session blobs)
# Commit: 5f43c7d
"""hf-loader — Hugging Face dataset loader STUB behind the SAME contract seam.

Non-negotiable #3: a future loader drops in behind the identical signature

    load(path) -> {'events': [Event...], 'turns': [Turn...], 'session': {...}}

so the engine, which consumes only the normalized contract shapes, needs ZERO
changes when traces come from HF instead of local JSONL. This module is a STUB:
it defines the contract and the safety policy and raises NotImplementedError on any
operation that would actually touch the network with trace content.

SAFETY POLICY (Non-negotiable #2 — 100% local, no egress of trace content, ever):

  1. DEFAULT-PRIVATE. Any repo this loader would ever create is private by default.
     Publishing is an explicit, separate, opt-in step — never a side effect of load.
  2. SCAN-BEFORE-PUBLISH. No trace may leave the machine until a secret/source scan
     has run and passed. `scan_before_publish()` is the placeholder gate; until a
     real scanner is wired it refuses (raises) rather than allowing egress.
  3. NO NETWORK EGRESS OF TRACE CONTENT. This stub performs no fetch and no upload.
     `load()` and `publish()` raise NotImplementedError. Wiring a real fetch is a
     future, reviewed change that must keep these guarantees.

Once a real fetch exists it MUST normalize HF rows into the exact same Event/Turn
shapes (see engine.contract) — ideally by reusing the JSONL normalization once an
HF record yields the per-line objects — so the engine stays loader-agnostic.
"""
from __future__ import annotations
from typing import Any
# --------------------------------------------------------------------------- #
# safety policy constants (documented contract, enforced by the helpers below)
# --------------------------------------------------------------------------- #
#: Privacy default for any repo this loader would create; used as the default
#: value of ``publish(private=...)``. Repos are private unless an explicit
#: publish step flips this AFTER a passing scan. Load never publishes.
DEFAULT_PRIVATE: bool = True
#: Whether a secret/source scanner is wired. While this is False,
#: ``scan_before_publish`` refuses by raising: "no scanner" == "do not send".
SCANNER_AVAILABLE: bool = False
class HFEgressBlocked(RuntimeError):
    """Signal that an off-machine transfer of trace content was refused.

    Raised when an operation would send trace content off the machine without
    a passing scan, or while the default-private / scan-before-publish gates
    are not yet in place.
    """
# --------------------------------------------------------------------------- #
# the seam — identical signature to engine.loaders.jsonl_loader.load
# --------------------------------------------------------------------------- #
def load(path: str) -> dict[str, Any]:
    """Stub seam for loading a trace from a Hugging Face dataset reference.

    The signature and return shape are intended to match the jsonl-loader so
    the engine stays loader-agnostic:

        {'events': [Event...], 'turns': [Turn...], 'session': {...}}

    No fetch is implemented. Pulling trace content over the network must be a
    separate, reviewed change that keeps the default-private /
    scan-before-publish / no-egress guarantees, and that normalizes HF records
    into the same Event/Turn shapes (reusing the JSONL normalization where
    possible) so nothing downstream changes.

    Args:
        path: Dataset reference to load. Not inspected by this stub.

    Raises:
        NotImplementedError: Always; this stub never touches the network.
    """
    stub_message = (
        "hf_loader.load is a stub: fetching trace content over the network is not "
        "implemented. Use engine.loaders.jsonl_loader.load for local sessions. A "
        "real HF fetch must normalize into the same Event/Turn contract and respect "
        "the default-private / scan-before-publish / no-egress policy."
    )
    raise NotImplementedError(stub_message)
# --------------------------------------------------------------------------- #
# publish-path safety placeholders (stubs that raise; egress is refused by default)
# --------------------------------------------------------------------------- #
def scan_before_publish(path: str) -> bool:
    """Placeholder secret/source scan that must pass before any publish.

    Intended contract: return True only when a real scanner ran and found
    nothing that must stay on the machine. No scanner exists yet, so every
    call raises. "No scanner" means "do not send", never "send anyway".

    Args:
        path: Trace to scan. Not inspected by this placeholder.

    Returns:
        Nothing at present; both code paths raise.

    Raises:
        NotImplementedError: Always. The message distinguishes "no scanner
            wired" (`SCANNER_AVAILABLE` is False) from "flag set but the
            scanner itself is not implemented".
    """
    if SCANNER_AVAILABLE:
        # The flag was flipped, but there is still no scanner code behind it.
        raise NotImplementedError("scan_before_publish: real scanner not implemented.")
    raise NotImplementedError(
        "scan_before_publish is a stub: no secret/source scanner is wired. "
        "Trace content must not leave the machine until a real scan passes."
    )
def publish(path: str, *, private: bool = DEFAULT_PRIVATE) -> None:
    """Publish a trace to HF. STUB — performs NO network egress.

    Even once implemented, publishing must be default-private and gated on a
    passing `scan_before_publish`. Every path through this stub raises before
    any bytes could leave the machine.

    Args:
        path: Trace that would be published.
        private: Must be truthy; a non-private publish is refused outright.

    Raises:
        HFEgressBlocked: If `private` is falsy, or if `scan_before_publish`
            returns a non-passing (falsy) result.
        NotImplementedError: From `scan_before_publish` while no scanner is
            wired, or from this stub itself because uploading is not
            implemented.
    """
    if not private:
        raise HFEgressBlocked(
            "Refusing a non-private publish: traces carry secrets and source; "
            "publishing must be default-private (Non-negotiable #2)."
        )
    # Gate on the scan RESULT, not merely on the call completing: a scanner
    # that reports failure by returning False must block egress just as one
    # that raises does. The check sits before the (unimplemented) upload so
    # the ordering of guarantees is explicit in code, not just in docs.
    if not scan_before_publish(path):
        raise HFEgressBlocked(
            "Refusing to publish: scan_before_publish did not pass; trace "
            "content must not leave the machine without a passing scan."
        )
    raise NotImplementedError(
        "hf_loader.publish is a stub: uploading trace content is not implemented "
        "and must remain default-private + scan-gated when it is."
    )