"""
Trace Commons ingestion server.
Receives anonymous donations from the donate-trace skill, re-runs the same
deterministic scrubber as a backstop, and opens a pull request to the dataset
under a single project-owned token. Contributors need no Hugging Face account.
Designed to run as a Hugging Face Space (Docker SDK) or any host that can keep
a secret. Set these as Space secrets / environment variables:
HF_TOKEN write-scoped token for the project bot account (required)
DATASET_REPO e.g. "trace-commons/agent-traces" (required)
MAX_BYTES max accepted payload size (optional, default 5_000_000)
RATE_PER_HOUR donations allowed per IP per hour (optional, default 20)
This is intentionally small. The skill already scrubbed and the user already
reviewed; the server's job is to never trust the client, re-scrub as a
backstop, refuse anything that still trips the scrubber, and submit.
"""
import io
import os
import re
import time
import json
import uuid
import shutil
import pathlib
import tempfile
import subprocess
from collections import defaultdict, deque
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, FileResponse, HTMLResponse
from fastapi.middleware.cors import CORSMiddleware
from scrub import scrub_text # the exact same scrubber the skill runs
# --- configuration (read once, at import time) ------------------------------
# Write-scoped token for the project bot account. While this or DATASET_REPO
# is unset, /donate validates donations but does not publish them.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Target dataset repository, e.g. "trace-commons/agent-traces".
DATASET_REPO = os.environ.get("DATASET_REPO")
# Largest request body, in bytes, that /donate accepts.
MAX_BYTES = int(os.environ.get("MAX_BYTES", "5000000"))
# Donations allowed per client IP per hour.
RATE_PER_HOUR = int(os.environ.get("RATE_PER_HOUR", "20"))
# Accepted values of the "harness" field; the value is also used as the
# directory name under sessions/ in the dataset.
VALID_HARNESS = {"claude_code", "codex", "pi", "opencode", "cursor"}
# Client-supplied filenames: 1-200 characters drawn from [A-Za-z0-9._-].
# Anchored with \Z rather than $, because $ also matches just before a
# trailing newline and would let "name.jsonl\n" through. The lookahead rejects
# the special path components "." and "..", which the character class alone
# would accept.
SAFE_FILENAME = re.compile(r"^(?!\.{1,2}\Z)[A-Za-z0-9._\-]{1,200}\Z")
# The ASGI application; every route in this module is registered on it.
app = FastAPI(title="Trace Commons ingestion")

# CORS policy: any origin may issue GET or POST requests with any headers.
_CORS_OPTIONS = {
    "allow_origins": ["*"],
    "allow_methods": ["POST", "GET"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_CORS_OPTIONS)
# --- simple in-memory rate limiting (per IP, sliding hour) ------------------
# For a single-process Space this is enough. Behind multiple replicas, move
# this to a shared store.
_WINDOW_SECONDS = 3600
# ip -> time.monotonic() timestamps of that IP's counted hits, oldest first.
_hits = defaultdict(deque)
# Monotonic time from which the next sweep of stale IPs is allowed to run.
_next_sweep = 0.0


def _evict_stale(now):
    """Forget every IP whose most recent hit is older than the window."""
    stale = [
        addr for addr, stamps in _hits.items()
        if not stamps or now - stamps[-1] > _WINDOW_SECONDS
    ]
    for addr in stale:
        del _hits[addr]


def _rate_ok(ip):
    """Record a hit for *ip* and report whether it is within the hourly limit.

    Returns True (and counts the hit) when *ip* has made fewer than
    RATE_PER_HOUR counted hits in the last hour, False otherwise. Rejected
    hits are not counted.
    """
    global _next_sweep
    # Monotonic clock: the window must not stretch or shrink when the wall
    # clock is adjusted. The timestamps never leave this process.
    now = time.monotonic()

    # At most once per window, drop IPs that have gone quiet so the table
    # cannot grow without bound over the lifetime of the process.
    if now >= _next_sweep:
        _evict_stale(now)
        _next_sweep = now + _WINDOW_SECONDS

    window = _hits[ip]
    while window and now - window[0] > _WINDOW_SECONDS:
        window.popleft()
    if len(window) >= RATE_PER_HOUR:
        if not window:
            # Only reachable when RATE_PER_HOUR <= 0: don't keep the empty
            # entry that the defaultdict lookup just created.
            del _hits[ip]
        return False
    window.append(now)
    return True
# Static assets served by the web routes; both sit in this module's directory.
SITE_FILE = pathlib.Path(__file__).with_name("index.html")
OG_FILE = pathlib.Path(__file__).with_name("og.png")
# Path of the `trufflehog` executable found on PATH, or None when not installed.
TRUFFLEHOG = shutil.which("trufflehog")


def trufflehog_findings(text):
    """Authoritative secret-detection backstop.

    Runs TruffleHog (hundreds of maintained detectors) over the already-scrubbed
    trace and returns the names of the detectors it flags. Detection only:
    `--no-verification` means candidate secrets are NEVER sent to third parties
    to validate them.

    Args:
        text: The trace content to scan.

    Returns:
        A sorted list of distinct detector names (the "DetectorName" field of
        each JSON line TruffleHog prints, falling back to "DetectorType").
        Returns [] when the binary isn't installed, so local/dev runs fall
        back to the regex pass in scrub.py, and also when the scanner times
        out or cannot be run.
    """
    if not TRUFFLEHOG:
        return []
    findings = set()
    tmp_path = None
    try:
        # delete=False because the scanner opens the file by path after we
        # have closed it; the `finally` below removes it. UTF-8 with
        # errors="replace" keeps the write independent of the locale and
        # tolerant of lone surrogates, which JSON-decoded text can contain.
        with tempfile.NamedTemporaryFile(
            "w", suffix=".jsonl", delete=False,
            encoding="utf-8", errors="replace",
        ) as tf:
            # Record the path before writing so a failed write still gets
            # cleaned up instead of leaking the temp file.
            tmp_path = tf.name
            tf.write(text)
        proc = subprocess.run(
            [TRUFFLEHOG, "filesystem", tmp_path,
             "--json", "--no-verification", "--no-update"],
            capture_output=True, text=True,
            encoding="utf-8", errors="replace", timeout=120,
        )
        for line in proc.stdout.splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(obj, dict):
                # Only JSON objects can carry a detector name.
                continue
            name = obj.get("DetectorName") or obj.get("DetectorType")
            if name:
                findings.add(str(name))
    except (subprocess.TimeoutExpired, OSError):
        # Best effort: a scanner failure reports no findings. The caller has
        # already run the regex backstop, and that result stands on its own.
        return []
    finally:
        if tmp_path:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
    return sorted(findings)
@app.get("/", response_class=HTMLResponse)
def home():
    """Serve the Trace Commons website.

    Returns index.html from this module's directory when it exists, otherwise
    a small placeholder page (still with status 200).
    """
    if not SITE_FILE.exists():
        placeholder = "<h1>Trace Commons</h1><p>Site file not found.</p>"
        return HTMLResponse(placeholder, status_code=200)
    return FileResponse(str(SITE_FILE))
@app.get("/og.png")
def og_image():
    """Serve the social-preview image referenced by the page's Open Graph tags.

    Responds with a JSON 404 when og.png is not present next to this module.
    """
    if not OG_FILE.exists():
        return JSONResponse({"error": "not_found"}, status_code=404)
    return FileResponse(str(OG_FILE), media_type="image/png")
@app.get("/health")
def health():
    """Report the service name, whether it is configured, and the target dataset.

    "configured" is true only when both HF_TOKEN and DATASET_REPO are set.
    """
    status = {"service": "trace-commons-ingestion"}
    status["configured"] = bool(HF_TOKEN) and bool(DATASET_REPO)
    status["dataset"] = DATASET_REPO if DATASET_REPO else "(unset)"
    return status
async def _read_body_capped(request, limit):
    """Read the request body, giving up as soon as it exceeds *limit* bytes.

    Returns the body as bytes, or None when it is larger than *limit*. An
    oversized upload is rejected from its Content-Length header when one is
    declared, and otherwise as soon as the streamed bytes pass the limit,
    rather than being buffered in full first.
    """
    try:
        declared = int(request.headers.get("content-length", ""))
    except ValueError:
        declared = None
    if declared is not None and declared > limit:
        return None
    received = bytearray()
    async for chunk in request.stream():
        received += chunk
        if len(received) > limit:
            return None
    return bytes(received)


@app.post("/donate")
async def donate(request: Request):
    """Validate, re-scrub and publish one donated trace.

    Expects a JSON object with the keys "harness", "trace", "consent" and,
    optionally, "filename".

    Responses:
        200  submitted: a PR was opened ("pr_url", "path", optional "warnings").
        400  bad_json / bad_harness / empty_trace / no_consent.
        413  too_large: the body exceeds MAX_BYTES.
        422  secrets_found: the backstop scrubber still found secrets.
        429  rate_limited.
        502  publish_failed: opening the PR raised.
        503  validated_not_published: HF_TOKEN or DATASET_REPO is unset.
    """
    # Imported here so the block carries its own dependency; FastAPI re-exports
    # Starlette's helper for running blocking callables off the event loop.
    from fastapi.concurrency import run_in_threadpool

    # NOTE(review): behind a reverse proxy request.client.host may be the
    # proxy's address, which would put every contributor in one bucket.
    # Confirm how the hosting platform forwards client addresses.
    ip = request.client.host if request.client else "unknown"
    if not _rate_ok(ip):
        return JSONResponse(
            {"error": "rate_limited", "detail": "Too many donations from this address this hour."},
            status_code=429,
        )

    body = await _read_body_capped(request, MAX_BYTES)
    if body is None:
        return JSONResponse(
            {"error": "too_large", "detail": f"Payload exceeds {MAX_BYTES} bytes."},
            status_code=413,
        )

    try:
        data = json.loads(body)
    except (ValueError, RecursionError):
        # ValueError covers both JSONDecodeError and the UnicodeDecodeError
        # raised for bodies that are not valid UTF-8; RecursionError covers
        # pathologically nested input.
        return JSONResponse({"error": "bad_json"}, status_code=400)
    if not isinstance(data, dict):
        return JSONResponse(
            {"error": "bad_json", "detail": "Request body must be a JSON object."},
            status_code=400,
        )

    harness = data.get("harness")
    filename = data.get("filename")
    consent = data.get("consent")
    trace = data.get("trace")

    # --- validation ---------------------------------------------------------
    # The isinstance check comes first: an unhashable value (list/dict) would
    # make the set-membership test itself raise.
    if not isinstance(harness, str) or harness not in VALID_HARNESS:
        return JSONResponse({"error": "bad_harness", "detail": f"harness must be one of {sorted(VALID_HARNESS)}"}, status_code=400)
    if not isinstance(trace, str) or not trace.strip():
        return JSONResponse({"error": "empty_trace"}, status_code=400)
    if consent is not True:
        return JSONResponse({"error": "no_consent", "detail": "consent must be true; the contributor must agree to open publication."}, status_code=400)

    filename_ok = (
        isinstance(filename, str)
        # fullmatch: the whole name must match, with no trailing newline.
        and SAFE_FILENAME.fullmatch(filename) is not None
        and filename not in (".", "..")
    )
    if not filename_ok:
        # generate a safe one rather than trusting client input
        filename = f"{uuid.uuid4().hex}.jsonl"

    # --- backstop scrub: never trust the client ----------------------------
    cleaned, report = scrub_text(trace, harness)
    # The skill should have already removed everything. If the backstop still
    # finds high-confidence secrets, refuse: something slipped through.
    secret_kinds = {k: v for k, v in report["redactions"].items()
                    if k not in ("home_path", "email", "private_ip")}
    if secret_kinds:
        return JSONResponse(
            {
                "error": "secrets_found",
                "detail": "The server's backstop scrubber found secrets the client should have removed. Donation rejected.",
                "found": secret_kinds,
            },
            status_code=422,
        )

    # --- TruffleHog soft-warn pass over the scrubbed trace ------------------
    # Catches what the regex pass cannot (vendor tokens with no fixed prefix).
    # Run WITHOUT verification so it never transmits candidate secrets — which
    # means occasional false positives (e.g. a 32-char hash read as a "Box"
    # token). It therefore does NOT auto-reject: findings are surfaced to the
    # contributor in the response and recorded on the PR for the maintainer to
    # review, on top of the human review every donation already gets. The regex
    # pass above stays the hard block for crisp, high-confidence secret formats.
    # The scan waits on a subprocess (up to its 120 s timeout), so it runs in
    # the threadpool instead of stalling the event loop for other requests.
    th_detectors = await run_in_threadpool(trufflehog_findings, cleaned)

    if not HF_TOKEN or not DATASET_REPO:
        # Not yet configured — accept-validate but don't pretend to publish.
        return JSONResponse(
            {
                "status": "validated_not_published",
                "detail": "Server is not yet configured with a dataset target. Trace passed all checks but was not published.",
                "redactions": report["redactions"],
            },
            status_code=503,
        )

    # --- open the PR on the contributor's behalf ---------------------------
    try:
        # Blocking upload to the Hub: also kept off the event loop.
        pr_url = await run_in_threadpool(
            _open_pr, cleaned, harness, filename, warnings=th_detectors
        )
    except Exception as e:  # noqa: BLE001 — surface a clean message to the skill
        return JSONResponse({"error": "publish_failed", "detail": str(e)}, status_code=502)

    resp = {"status": "submitted", "pr_url": pr_url, "path": f"sessions/{harness}/{filename}"}
    if th_detectors:
        resp["warnings"] = {
            "trufflehog_unverified": th_detectors,
            "note": "TruffleHog flagged these without verification — often false positives "
                    "on high-entropy strings, so the donation was NOT blocked. Please confirm "
                    "none is a real secret; a maintainer will also review before merging.",
        }
    return resp
def _open_pr(cleaned_text, harness, filename, warnings=None):
    """Open a PR to the dataset with the cleaned trace, under the project token.

    Args:
        cleaned_text: Scrubbed trace content; uploaded as UTF-8.
        harness: Harness identifier, used as the directory under sessions/.
        filename: File name to create inside that directory.
        warnings: Optional detector names to list in the PR description so a
            maintainer reviews them before merging.

    Returns:
        The commit result's pr_url when it has one, otherwise the string form
        of the commit result.
    """
    from huggingface_hub import HfApi, CommitOperationAdd

    hub = HfApi(token=HF_TOKEN)

    target_path = f"sessions/{harness}/{filename}"
    payload = io.BytesIO(cleaned_text.encode("utf-8"))
    add_file = CommitOperationAdd(path_in_repo=target_path, path_or_fileobj=payload)

    description_parts = ["Anonymous donation via Trace Commons ingestion server."]
    if warnings:
        description_parts.append(
            "\n\n⚠️ **Maintainer review needed.** TruffleHog (unverified) flagged the "
            "following detector(s). These are frequently false positives on high-entropy "
            "strings (hashes, IDs, base64), but confirm none is a real secret before merging:\n- "
        )
        description_parts.append("\n- ".join(warnings))
    description = "".join(description_parts)

    result = hub.create_commit(
        repo_id=DATASET_REPO,
        repo_type="dataset",
        operations=[add_file],
        commit_message=f"Donate {harness} trace ({filename})",
        commit_description=description,
        create_pr=True,
    )
    # create_commit returns an object whose pr_url is set when create_pr=True
    pr_link = getattr(result, "pr_url", None)
    return pr_link if pr_link else str(result)
|