monsimas committed on
Commit
30eb032
·
verified ·
1 Parent(s): 4ada902

TruffleHog backstop: soft-warn (flag for review) instead of hard-reject; drop the false-positive-prone Twilio regex

Browse files
Files changed (1) hide show
  1. app.py +28 -16
app.py CHANGED
@@ -197,19 +197,15 @@ async def donate(request: Request):
197
  status_code=422,
198
  )
199
 
200
- # --- authoritative backstop: TruffleHog over the scrubbed trace ---------
201
- # Catches what the regex pass cannot (vendor tokens with no fixed prefix,
202
- # newly-rotated formats, etc.). Any flagged detector rejects the donation.
 
 
 
 
 
203
  th_detectors = trufflehog_findings(cleaned)
204
- if th_detectors:
205
- return JSONResponse(
206
- {
207
- "error": "secrets_found",
208
- "detail": "TruffleHog (server backstop) flagged likely secrets that survived the client scrub. Donation rejected — clean the flagged values and retry.",
209
- "detectors": th_detectors,
210
- },
211
- status_code=422,
212
- )
213
 
214
  if not HF_TOKEN or not DATASET_REPO:
215
  # Not yet configured — accept-validate but don't pretend to publish.
@@ -224,14 +220,22 @@ async def donate(request: Request):
224
 
225
  # --- open the PR on the contributor's behalf ---------------------------
226
  try:
227
- pr_url = _open_pr(cleaned, harness, filename)
228
  except Exception as e: # noqa: BLE001 — surface a clean message to the skill
229
  return JSONResponse({"error": "publish_failed", "detail": str(e)}, status_code=502)
230
 
231
- return {"status": "submitted", "pr_url": pr_url, "path": f"sessions/{harness}/{filename}"}
 
 
 
 
 
 
 
 
232
 
233
 
234
- def _open_pr(cleaned_text, harness, filename):
235
  """Open a PR to the dataset with the cleaned trace, under the project token."""
236
  from huggingface_hub import HfApi, CommitOperationAdd
237
 
@@ -240,12 +244,20 @@ def _open_pr(cleaned_text, harness, filename):
240
  path_in_repo=f"sessions/{harness}/{filename}",
241
  path_or_fileobj=io.BytesIO(cleaned_text.encode("utf-8")),
242
  )
 
 
 
 
 
 
 
 
243
  commit = api.create_commit(
244
  repo_id=DATASET_REPO,
245
  repo_type="dataset",
246
  operations=[op],
247
  commit_message=f"Donate {harness} trace ({filename})",
248
- commit_description="Anonymous donation via Trace Commons ingestion server.",
249
  create_pr=True,
250
  )
251
  # create_commit returns an object whose pr_url is set when create_pr=True
 
197
  status_code=422,
198
  )
199
 
200
+ # --- TruffleHog soft-warn pass over the scrubbed trace ------------------
201
+ # Catches what the regex pass cannot (vendor tokens with no fixed prefix).
202
+ # Run WITHOUT verification so it never transmits candidate secrets — which
203
+ # means occasional false positives (e.g. a 32-char hash read as a "Box"
204
+ # token). It therefore does NOT auto-reject: findings are surfaced to the
205
+ # contributor in the response and recorded on the PR for the maintainer to
206
+ # review, on top of the human review every donation already gets. The regex
207
+ # pass above stays the hard block for crisp, high-confidence secret formats.
208
  th_detectors = trufflehog_findings(cleaned)
 
 
 
 
 
 
 
 
 
209
 
210
  if not HF_TOKEN or not DATASET_REPO:
211
  # Not yet configured — accept-validate but don't pretend to publish.
 
220
 
221
  # --- open the PR on the contributor's behalf ---------------------------
222
  try:
223
+ pr_url = _open_pr(cleaned, harness, filename, warnings=th_detectors)
224
  except Exception as e: # noqa: BLE001 — surface a clean message to the skill
225
  return JSONResponse({"error": "publish_failed", "detail": str(e)}, status_code=502)
226
 
227
+ resp = {"status": "submitted", "pr_url": pr_url, "path": f"sessions/{harness}/{filename}"}
228
+ if th_detectors:
229
+ resp["warnings"] = {
230
+ "trufflehog_unverified": th_detectors,
231
+ "note": "TruffleHog flagged these without verification — often false positives "
232
+ "on high-entropy strings, so the donation was NOT blocked. Please confirm "
233
+ "none is a real secret; a maintainer will also review before merging.",
234
+ }
235
+ return resp
236
 
237
 
238
+ def _open_pr(cleaned_text, harness, filename, warnings=None):
239
  """Open a PR to the dataset with the cleaned trace, under the project token."""
240
  from huggingface_hub import HfApi, CommitOperationAdd
241
 
 
244
  path_in_repo=f"sessions/{harness}/{filename}",
245
  path_or_fileobj=io.BytesIO(cleaned_text.encode("utf-8")),
246
  )
247
+ description = "Anonymous donation via Trace Commons ingestion server."
248
+ if warnings:
249
+ description += (
250
+ "\n\n⚠️ **Maintainer review needed.** TruffleHog (unverified) flagged the "
251
+ "following detector(s). These are frequently false positives on high-entropy "
252
+ "strings (hashes, IDs, base64), but confirm none is a real secret before merging:\n- "
253
+ + "\n- ".join(warnings)
254
+ )
255
  commit = api.create_commit(
256
  repo_id=DATASET_REPO,
257
  repo_type="dataset",
258
  operations=[op],
259
  commit_message=f"Donate {harness} trace ({filename})",
260
+ commit_description=description,
261
  create_pr=True,
262
  )
263
  # create_commit returns an object whose pr_url is set when create_pr=True