| """Secret scrubbing for session trajectories before upload. |
| |
| Users frequently paste HF / API / GitHub tokens into the chat, or scripts echo |
| them via env dumps. This module applies regex-based redaction to any string |
| value found recursively in a trajectory payload. The goal is best-effort — |
| strict formats are matched; we won't catch free-form leaks like "my password |
| is hunter2". |
| """ |
|
|
| from __future__ import annotations |
|
|
| import re |
| from typing import Any |
|
|
| |
| |
| |
# Ordered (pattern, replacement) pairs for credentials with a fixed,
# recognisable shape. Most patterns are deliberately unanchored:
# over-redacting a look-alike is preferable to leaking a real key.
_PATTERNS: list[tuple[re.Pattern, str]] = [
    # Hugging Face tokens: "hf_" followed by 30+ alphanumerics.
    (re.compile(r"hf_[A-Za-z0-9]{30,}"), "[REDACTED_HF_TOKEN]"),
    # Anthropic keys ("sk-ant-..."); handled before the generic "sk-" rule.
    (re.compile(r"sk-ant-[A-Za-z0-9_\-]{20,}"), "[REDACTED_ANTHROPIC_KEY]"),
    # Any other "sk-" key; the lookahead leaves "sk-ant-" to the rule above.
    (re.compile(r"sk-(?!ant-)[A-Za-z0-9_\-]{40,}"), "[REDACTED_OPENAI_KEY]"),
    # GitHub tokens with a ghp_/gho_/ghu_/ghs_/ghr_ prefix.
    (re.compile(r"gh[pousr]_[A-Za-z0-9]{36,}"), "[REDACTED_GITHUB_TOKEN]"),
    # GitHub "github_pat_" personal access tokens.
    (re.compile(r"github_pat_[A-Za-z0-9_]{36,}"), "[REDACTED_GITHUB_TOKEN]"),
    # AWS access key IDs: AKIA/ASIA prefix plus exactly 16 characters.
    (re.compile(r"\b(?:AKIA|ASIA)[A-Z0-9]{16}\b"), "[REDACTED_AWS_KEY_ID]"),
    # HTTP bearer credentials. The character class covers the full b64token
    # alphabet (including "~", "+" and "/") so standard-base64 tokens are
    # not skipped or only partially redacted.
    (re.compile(r"(?i)bearer\s+[A-Za-z0-9_\-\.~+/=]{20,}"), "Bearer [REDACTED]"),
]


# "NAME = value" style assignments whose name looks secret-ish.
#   group 1: the name, plus any "_SUFFIX" parts (e.g. SECRET_KEY). There is
#            no leading word boundary, so prefixed names such as DB_PASSWORD
#            or CLIENT_SECRET are matched from the known name onwards.
#   group 2: optional closing quote of a quoted key, then ":" or "=" with
#            surrounding whitespace (covers env files, YAML and JSON text).
#   group 3: the value: a double- or single-quoted string on one line
#            (backslash escapes honoured), or a bare run of non-space,
#            non-quote characters.
_SECRETY_NAMES = re.compile(
    r"(?i)((?:HF_TOKEN|HUGGINGFACEHUB_API_TOKEN|ANTHROPIC_API_KEY|OPENAI_API_KEY|"
    r"GITHUB_TOKEN|AWS_SECRET_ACCESS_KEY|AWS_ACCESS_KEY_ID|PASSWORD|SECRET|API_KEY)"
    r"(?:_[A-Z0-9]+)*)"
    r"([\"']?\s*[:=]\s*)"
    r"(\"(?:[^\"\\\r\n]|\\.)+\""
    r"|'(?:[^'\\\r\n]|\\.)+'"
    r"|[^\s\"']+)"
)


def _redact_assignment(match: re.Match) -> str:
    """Return the matched assignment with only its value replaced.

    The name, separator and any quotes around the value are kept verbatim so
    the surrounding text (JSON, YAML, shell, ...) keeps its shape.
    """
    name, separator, value = match.group(1, 2, 3)
    # A value that starts with a quote was matched by one of the quoted
    # alternatives, so it is guaranteed to end with the same quote.
    quote = value[0] if value[0] in "\"'" else ""
    return f"{name}{separator}{quote}[REDACTED]{quote}"


def scrub_string(s: str) -> str:
    """Apply all redaction patterns to a single string. Safe on non-strings.

    Fixed-format credentials are replaced first (``_PATTERNS``), then the
    values of secret-looking ``NAME=value`` / ``NAME: value`` assignments
    (``_SECRETY_NAMES``), including quoted values.

    Args:
        s: Text to scrub. Non-string or empty input is returned unchanged.

    Returns:
        The text with every recognised secret replaced by a ``[REDACTED...]``
        marker.
    """
    if not isinstance(s, str) or not s:
        return s
    out = s
    for pat, repl in _PATTERNS:
        out = pat.sub(repl, out)
    return _SECRETY_NAMES.sub(_redact_assignment, out)
|
|
|
|
def scrub(obj: Any) -> Any:
    """Return a scrubbed copy of *obj*, walking nested containers.

    Strings are passed through ``scrub_string``. Dicts, lists and tuples are
    rebuilt as new plain ``dict`` / ``list`` / ``tuple`` objects with their
    values scrubbed recursively; dict keys are kept as they are. Anything
    else is returned unchanged. The input is never mutated.
    """
    if isinstance(obj, dict):
        cleaned = {}
        for key, value in obj.items():
            cleaned[key] = scrub(value)
        return cleaned
    if isinstance(obj, (list, tuple)):
        items = [scrub(item) for item in obj]
        # Lists come back as lists, tuples as tuples.
        return items if isinstance(obj, list) else tuple(items)
    return scrub_string(obj) if isinstance(obj, str) else obj
|
|