File size: 10,332 Bytes
52b30dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d57148
52b30dc
6d57148
 
52b30dc
 
 
 
 
 
 
 
 
 
 
 
 
71b2968
52b30dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bfddb00
52b30dc
6d57148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52b30dc
 
 
 
 
 
 
 
 
bfddb00
 
 
 
 
 
 
 
52b30dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e7dbfc
52b30dc
 
 
 
 
 
 
 
 
 
30eb032
 
 
 
 
 
 
 
6d57148
 
52b30dc
 
 
 
 
 
 
 
 
 
 
 
 
30eb032
52b30dc
 
 
30eb032
 
 
 
 
 
 
 
 
52b30dc
 
30eb032
52b30dc
 
 
 
 
 
 
 
30eb032
 
 
 
 
 
 
 
52b30dc
 
 
 
 
30eb032
52b30dc
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
"""
Trace Commons ingestion server.

Receives anonymous donations from the donate-trace skill, re-runs the same
deterministic scrubber as a backstop, and opens a pull request to the dataset
under a single project-owned token. Contributors need no Hugging Face account.

Designed to run as a Hugging Face Space (Docker SDK) or any host that can keep
a secret. Set these as Space secrets / environment variables:

  HF_TOKEN        write-scoped token for the project bot account (required)
  DATASET_REPO    e.g. "trace-commons/agent-traces" (required)
  MAX_BYTES       max accepted payload size (optional, default 5_000_000)
  RATE_PER_HOUR   donations allowed per IP per hour (optional, default 20)

This is intentionally small. The skill already scrubbed and the user already
reviewed; the server's job is to never trust the client, re-scrub as a
backstop, refuse anything that still trips the scrubber, and submit.
"""

import io
import os
import re
import time
import json
import uuid
import shutil
import pathlib
import tempfile
import subprocess
from collections import defaultdict, deque

from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, FileResponse, HTMLResponse
from fastapi.middleware.cors import CORSMiddleware

from scrub import scrub_text  # the exact same scrubber the skill runs

# --- configuration (read once, at import time, from the environment) --------
# HF_TOKEN / DATASET_REPO are None when unset.
HF_TOKEN = os.environ.get("HF_TOKEN")
DATASET_REPO = os.environ.get("DATASET_REPO")
# Both limits must parse as integers; a malformed value raises ValueError here,
# at import, rather than on the first request.
MAX_BYTES = int(os.environ.get("MAX_BYTES", "5000000"))
RATE_PER_HOUR = int(os.environ.get("RATE_PER_HOUR", "20"))

VALID_HARNESS = {"claude_code", "codex", "pi", "opencode", "cursor"}
# Client-supplied file names: 1-200 characters from [A-Za-z0-9._-].
#   * `(?!\.)` rejects a leading dot, which rules out "." and ".." (and hidden
#     dot-files) ending up as the last component of a repository path.
#   * `\Z` instead of `$`: `$` also matches just before a trailing newline, so
#     "name.jsonl\n" would otherwise be accepted by `.match()`.
SAFE_FILENAME = re.compile(r"^(?!\.)[A-Za-z0-9._\-]{1,200}\Z")

# The single ASGI application object. CORS is left open to any origin and any
# request header, for the GET and POST methods only.
app = FastAPI(title="Trace Commons ingestion")
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["POST", "GET"],
    allow_headers=["*"],
)

# --- simple in-memory rate limiting (per IP, sliding hour) ------------------
# For a single-process Space this is enough. Behind multiple replicas, move
# this to a shared store.
_WINDOW_SECONDS = 3600  # length of the sliding window
_SWEEP_INTERVAL = 600   # minimum seconds between stale-entry sweeps
# ip -> monotonic timestamps of counted requests, oldest first.
_hits = defaultdict(deque)
_last_sweep = time.monotonic()


def _sweep_stale(now):
    """Forget every address whose newest recorded hit has left the window."""
    stale = [ip for ip, window in _hits.items()
             if not window or now - window[-1] > _WINDOW_SECONDS]
    for ip in stale:
        del _hits[ip]


def _rate_ok(ip):
    """Record a request from `ip` and report whether it is within the limit.

    Returns True (and counts the request) when fewer than RATE_PER_HOUR
    requests from `ip` were counted in the last hour; returns False, without
    counting, otherwise.

    Timestamps come from time.monotonic() so that wall-clock adjustments cannot
    stretch or shrink the window. Addresses that stop sending requests are
    removed by a sweep that runs at most once per _SWEEP_INTERVAL seconds, so
    the table does not grow without bound.
    """
    global _last_sweep
    now = time.monotonic()
    if now - _last_sweep >= _SWEEP_INTERVAL:
        _last_sweep = now
        _sweep_stale(now)
    window = _hits[ip]
    # Expire hits older than the window from the front (oldest) end.
    while window and now - window[0] > _WINDOW_SECONDS:
        window.popleft()
    if len(window) >= RATE_PER_HOUR:
        return False
    window.append(now)
    return True


# Static assets are looked up in the same directory as this module.
SITE_FILE = pathlib.Path(__file__).with_name("index.html")
OG_FILE = pathlib.Path(__file__).with_name("og.png")

# Path of the `trufflehog` executable found on PATH, or None when absent.
TRUFFLEHOG = shutil.which("trufflehog")


def trufflehog_findings(text):
    """Authoritative secret-detection backstop.

    Runs TruffleHog (hundreds of maintained detectors) over the already-scrubbed
    trace and returns the sorted list of distinct detector names it flags.
    Detection only: `--no-verification` means candidate secrets are NEVER sent
    to third parties to validate them. No-ops (returns []) when the binary
    isn't installed, so local/dev runs gracefully fall back to the regex pass
    in scrub.py.

    Also returns [] when the scan exceeds its 120-second timeout or the
    temporary file / subprocess cannot be created (OSError). The scanner's
    exit status is not inspected; only its JSON-lines stdout is read.
    """
    if not TRUFFLEHOG:
        return []
    findings = set()
    tmp_path = None
    try:
        # Written as bytes in explicit UTF-8 so the result does not depend on
        # the process locale; characters that cannot be encoded (e.g. lone
        # surrogates from JSON escapes) are replaced instead of raising.
        with tempfile.NamedTemporaryFile("wb", suffix=".jsonl", delete=False) as tf:
            # Remember the path before writing: the file already exists on
            # disk, so a failed write must still reach the cleanup below.
            tmp_path = tf.name
            tf.write(text.encode("utf-8", errors="replace"))
        proc = subprocess.run(
            [TRUFFLEHOG, "filesystem", tmp_path,
             "--json", "--no-verification", "--no-update"],
            capture_output=True, encoding="utf-8", errors="replace", timeout=120,
        )
        for line in proc.stdout.splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(obj, dict):
                # Valid JSON but not a finding object; nothing to read from it.
                continue
            name = obj.get("DetectorName") or obj.get("DetectorType")
            if name:
                findings.add(str(name))
    except (subprocess.TimeoutExpired, OSError):
        # A scanner failure must not silently pass a donation, but the regex
        # backstop already ran; surface nothing here and let that stand.
        return []
    finally:
        if tmp_path:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
    return sorted(findings)


@app.get("/", response_class=HTMLResponse)
def home():
    """Return index.html from beside this module, or a placeholder page if it is missing."""
    if not SITE_FILE.exists():
        # Still answered with 200: a small stand-in page rather than an error.
        return HTMLResponse("<h1>Trace Commons</h1><p>Site file not found.</p>", status_code=200)
    return FileResponse(str(SITE_FILE))


@app.get("/og.png")
def og_image():
    """Return og.png (the Open Graph preview image) as image/png, or a JSON 404 if absent."""
    if not OG_FILE.exists():
        return JSONResponse({"error": "not_found"}, status_code=404)
    return FileResponse(str(OG_FILE), media_type="image/png")


@app.get("/health")
def health():
    """Report the service name, whether both publish settings are present, and the dataset id.

    "configured" is true only when HF_TOKEN and DATASET_REPO are both
    non-empty; "dataset" falls back to "(unset)". The token itself is never
    included in the response.
    """
    is_configured = bool(HF_TOKEN) and bool(DATASET_REPO)
    dataset_label = DATASET_REPO if DATASET_REPO else "(unset)"
    return {
        "service": "trace-commons-ingestion",
        "configured": is_configured,
        "dataset": dataset_label,
    }


@app.post("/donate")
async def donate(request: Request):
    """Validate, re-scrub and publish one donated trace.

    The request body must be a JSON object with:
      harness   a string, one of VALID_HARNESS
      trace     a non-empty string holding the trace text
      consent   the literal JSON value true
      filename  optional; replaced by a generated "<uuid>.jsonl" unless it is a
                string that fully matches SAFE_FILENAME and has no leading "."

    Responses:
      200  {"status": "submitted", "pr_url", "path"} plus "warnings" when
           TruffleHog flagged anything
      400  bad_json / bad_harness / empty_trace / no_consent
      413  too_large (body longer than MAX_BYTES)
      422  secrets_found (the backstop scrubber reported redactions other than
           home_path / email / private_ip)
      429  rate_limited
      502  publish_failed
      503  validated_not_published (HF_TOKEN or DATASET_REPO is unset)
    """
    # fastapi is already a dependency of this module; imported locally so the
    # handler carries its own requirement.
    from fastapi.concurrency import run_in_threadpool

    ip = request.client.host if request.client else "unknown"
    if not _rate_ok(ip):
        return JSONResponse(
            {"error": "rate_limited", "detail": "Too many donations from this address this hour."},
            status_code=429,
        )

    # NOTE(review): the whole body is buffered before the size check, so
    # MAX_BYTES bounds what is processed, not what is received.
    body = await request.body()
    if len(body) > MAX_BYTES:
        return JSONResponse(
            {"error": "too_large", "detail": f"Payload exceeds {MAX_BYTES} bytes."},
            status_code=413,
        )

    try:
        data = json.loads(body)
    except (ValueError, RecursionError):
        # ValueError covers json.JSONDecodeError and the UnicodeDecodeError
        # raised for bytes that are not valid UTF-8/16/32; RecursionError is
        # what pathologically deep nesting produces.
        return JSONResponse({"error": "bad_json"}, status_code=400)
    if not isinstance(data, dict):
        # Valid JSON, but a list/string/number has no fields to read.
        return JSONResponse(
            {"error": "bad_json", "detail": "Request body must be a JSON object."},
            status_code=400,
        )

    harness = data.get("harness")
    filename = data.get("filename")
    consent = data.get("consent")
    trace = data.get("trace")

    # --- validation ---------------------------------------------------------
    # The isinstance check comes first: an unhashable value (list/dict) would
    # make the set membership test raise TypeError.
    if not isinstance(harness, str) or harness not in VALID_HARNESS:
        return JSONResponse({"error": "bad_harness", "detail": f"harness must be one of {sorted(VALID_HARNESS)}"}, status_code=400)
    if not isinstance(trace, str) or not trace.strip():
        return JSONResponse({"error": "empty_trace"}, status_code=400)
    if consent is not True:
        return JSONResponse({"error": "no_consent", "detail": "consent must be true; the contributor must agree to open publication."}, status_code=400)
    if (
        not isinstance(filename, str)
        or filename.startswith(".")               # ".", ".." and dot-files
        or not SAFE_FILENAME.fullmatch(filename)  # fullmatch: no trailing "\n"
    ):
        # generate a safe one rather than trusting client input
        filename = f"{uuid.uuid4().hex}.jsonl"

    # --- backstop scrub: never trust the client ----------------------------
    # assumes scrub_text returns (cleaned_text, report) where
    # report["redactions"] maps redaction kind -> value — TODO confirm in scrub.py
    cleaned, report = scrub_text(trace, harness)
    # The skill should have already removed everything. If the backstop still
    # finds high-confidence secrets, refuse: something slipped through.
    secret_kinds = {k: v for k, v in report["redactions"].items()
                    if k not in ("home_path", "email", "private_ip")}
    if secret_kinds:
        return JSONResponse(
            {
                "error": "secrets_found",
                "detail": "The server's backstop scrubber found secrets the client should have removed. Donation rejected.",
                "found": secret_kinds,
            },
            status_code=422,
        )

    # Checked before the TruffleHog pass: that scan never rejects a donation
    # and its findings are not part of this response, so running it for an
    # unconfigured server would be wasted work.
    if not HF_TOKEN or not DATASET_REPO:
        # Not yet configured — accept-validate but don't pretend to publish.
        return JSONResponse(
            {
                "status": "validated_not_published",
                "detail": "Server is not yet configured with a dataset target. Trace passed all checks but was not published.",
                "redactions": report["redactions"],
            },
            status_code=503,
        )

    # --- TruffleHog soft-warn pass over the scrubbed trace ------------------
    # Catches what the regex pass cannot (vendor tokens with no fixed prefix).
    # Run WITHOUT verification so it never transmits candidate secrets — which
    # means occasional false positives (e.g. a 32-char hash read as a "Box"
    # token). It therefore does NOT auto-reject: findings are surfaced to the
    # contributor in the response and recorded on the PR for the maintainer to
    # review, on top of the human review every donation already gets. The regex
    # pass above stays the hard block for crisp, high-confidence secret formats.
    # The scan is a blocking subprocess (up to 120 s), so it runs in the
    # threadpool to keep the event loop serving other requests.
    th_detectors = await run_in_threadpool(trufflehog_findings, cleaned)

    # --- open the PR on the contributor's behalf ---------------------------
    try:
        # Blocking network call; kept off the event loop for the same reason.
        pr_url = await run_in_threadpool(
            _open_pr, cleaned, harness, filename, warnings=th_detectors
        )
    except Exception as e:  # noqa: BLE001 — surface a clean message to the skill
        return JSONResponse({"error": "publish_failed", "detail": str(e)}, status_code=502)

    resp = {"status": "submitted", "pr_url": pr_url, "path": f"sessions/{harness}/{filename}"}
    if th_detectors:
        resp["warnings"] = {
            "trufflehog_unverified": th_detectors,
            "note": "TruffleHog flagged these without verification — often false positives "
                    "on high-entropy strings, so the donation was NOT blocked. Please confirm "
                    "none is a real secret; a maintainer will also review before merging.",
        }
    return resp


def _open_pr(cleaned_text, harness, filename, warnings=None):
    """Commit the cleaned trace to the dataset as a pull request and return its URL.

    The file is added at sessions/<harness>/<filename> in DATASET_REPO using
    HF_TOKEN. When `warnings` is non-empty, each entry is listed as a bullet
    in the PR description under a maintainer-review notice. Returns the
    commit result's `pr_url` attribute, or str() of the result when that
    attribute is missing or empty.
    """
    from huggingface_hub import HfApi, CommitOperationAdd

    api = HfApi(token=HF_TOKEN)
    repo_path = f"sessions/{harness}/{filename}"
    payload = io.BytesIO(cleaned_text.encode("utf-8"))
    add_file = CommitOperationAdd(path_in_repo=repo_path, path_or_fileobj=payload)

    # Description = fixed intro, then (only with warnings) the review notice
    # immediately followed by the detector names as a "- " bullet list.
    parts = ["Anonymous donation via Trace Commons ingestion server."]
    if warnings:
        parts.append(
            "\n\n⚠️ **Maintainer review needed.** TruffleHog (unverified) flagged the "
            "following detector(s). These are frequently false positives on high-entropy "
            "strings (hashes, IDs, base64), but confirm none is a real secret before merging:\n- "
        )
        parts.append("\n- ".join(warnings))

    result = api.create_commit(
        repo_id=DATASET_REPO,
        repo_type="dataset",
        operations=[add_file],
        commit_message=f"Donate {harness} trace ({filename})",
        commit_description="".join(parts),
        create_pr=True,
    )
    # create_commit returns an object whose pr_url is set when create_pr=True
    pr_url = getattr(result, "pr_url", None)
    return pr_url if pr_url else str(result)