lost-found-desk / scripts /build_codex_trace_dataset.py
JacobLinCool's picture
Use official Codex agent trace format
be3b0fa verified
Raw
History Blame Contribute Delete
22.8 kB
from __future__ import annotations
import argparse
import hashlib
import json
import re
import shlex
import shutil
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from urllib.parse import urlparse
PROJECT_NAME = "Lost & Found Desk"
PROJECT_REPO = "https://github.com/JacobLinCool/Lost-n-Found-Desk"
SPACE_REPO = "https://huggingface.co/spaces/build-small-hackathon/lost-found-desk"
APP_URL = "https://build-small-hackathon-lost-found-desk.hf.space"
ARTICLE_URL = "https://huggingface.co/spaces/build-small-hackathon/lost-found-desk/blob/main/docs/article.md"
DEMO_URL = "https://youtu.be/AsOM7K0tL-s"
X_POST_URL = "https://x.com/JacobLinCool/status/2066147773481951378"
DATASET_URL = "https://huggingface.co/datasets/build-small-hackathon/lost-found-desk-codex-traces"
COLLECTION_URL = "https://huggingface.co/collections/build-small-hackathon/lost-and-found-desk-6a2ec0551c48861e92dd8443"
HF_AGENT_TRACES_DOCS = "https://huggingface.co/docs/hub/en/agent-traces"
HF_AGENT_TRACE_CHANGELOG = "https://huggingface.co/changelog/agent-trace-viewer"
MODELS = [
"openbmb/MiniCPM-V-4.6",
"openbmb/MiniCPM5-1B",
"nvidia/llama-nemotron-embed-vl-1b-v2",
]
TASK_SUMMARIES = {
"我們準備要提交目前這個專案到 hackathon": {
"task_id": "submission_hardening",
"summary": "Audit official Build Small requirements, harden runtime defaults, deploy the Space, publish GitHub, add documentation, and verify submission links.",
"public_outputs": [
SPACE_REPO,
APP_URL,
PROJECT_REPO,
],
},
"should use the youtube link for demo video": {
"task_id": "youtube_demo_link",
"summary": "Replace repository and Space demo-video references with the public YouTube demo link.",
"public_outputs": [DEMO_URL],
},
"for social media post, we are going to post on X": {
"task_id": "x_post_draft",
"summary": "Shorten the social-media draft into an X-ready post within the platform character budget.",
"public_outputs": [],
},
"here's the link of the post": {
"task_id": "published_x_post",
"summary": "Add the published X post URL to README, submission notes, and Space materials.",
"public_outputs": [X_POST_URL],
},
"did you write article on huggingface": {
"task_id": "article_publication_status",
"summary": "Clarify that the article exists as repository and Space markdown rather than a separate Hugging Face Community Article.",
"public_outputs": [ARTICLE_URL],
},
"i think for article, we should put the yt link": {
"task_id": "article_demo_link_position",
"summary": "Move the YouTube demo link near the top of the article files.",
"public_outputs": [DEMO_URL, ARTICLE_URL],
},
"consider to use": {
"task_id": "academic_article_rewrite",
"summary": "Rewrite the public article with a clearer argument, scope, privacy boundary, evidence, and limitation structure.",
"public_outputs": [ARTICLE_URL],
},
"looks good, and i think our last step": {
"task_id": "trace_dataset_and_collection",
"summary": "Create a sanitized Codex trace dataset and a Hugging Face collection that groups the Space, models, article link, and dataset.",
"public_outputs": [DATASET_URL, COLLECTION_URL],
},
}
PUBLIC_ARTIFACTS = [
{
"artifact_type": "space",
"name": "Lost & Found Desk Space",
"url": SPACE_REPO,
"repo_id": "build-small-hackathon/lost-found-desk",
},
{
"artifact_type": "app",
"name": "Lost & Found Desk live app",
"url": APP_URL,
"repo_id": "build-small-hackathon/lost-found-desk",
},
{
"artifact_type": "repository",
"name": "GitHub repository",
"url": PROJECT_REPO,
"repo_id": "JacobLinCool/Lost-n-Found-Desk",
},
{
"artifact_type": "article",
"name": "Project article",
"url": ARTICLE_URL,
"repo_id": "build-small-hackathon/lost-found-desk",
},
{
"artifact_type": "demo_video",
"name": "YouTube demo video",
"url": DEMO_URL,
"repo_id": "",
},
{
"artifact_type": "social_post",
"name": "X post",
"url": X_POST_URL,
"repo_id": "",
},
{
"artifact_type": "dataset",
"name": "Codex trace dataset",
"url": DATASET_URL,
"repo_id": "build-small-hackathon/lost-found-desk-codex-traces",
},
{
"artifact_type": "collection",
"name": "Lost & Found Desk Hugging Face collection",
"url": COLLECTION_URL,
"repo_id": "build-small-hackathon/lost-and-found-desk-6a2ec0551c48861e92dd8443",
},
*[
{
"artifact_type": "model",
"name": model,
"url": f"https://huggingface.co/{model}",
"repo_id": model,
}
for model in MODELS
],
]
TOKEN_PATTERNS = [
re.compile(r"hf_[A-Za-z0-9]{20,}"),
re.compile(r"(?<![A-Za-z0-9])sk-[A-Za-z0-9_-]{20,}"),
re.compile(r"github_pat_[A-Za-z0-9_]{20,}"),
]
SENSITIVE_LABELS = [
re.compile(r"github_pat_"),
re.compile(r"OPENAI_API_KEY"),
re.compile(r"HF_TOKEN"),
re.compile(r"Authorization"),
re.compile(r"BEGIN[^\n\r]{0,120}PRIVATE"),
]
ABS_PATH = re.compile(r"/Users/jacoblincool(?:/[^\s\"'`<>)]*)?")
def redact_text(text: str, counts: Counter[str] | None = None) -> str:
text, replacements = ABS_PATH.subn("$LOCAL_PATH", text)
if counts is not None:
counts["local_path"] += replacements
for pattern in TOKEN_PATTERNS:
text, replacements = pattern.subn("[REDACTED_TOKEN]", text)
if counts is not None:
counts["token_shaped_string"] += replacements
for pattern in SENSITIVE_LABELS:
text, replacements = pattern.subn("[REDACTED_SECRET_LABEL]", text)
if counts is not None:
counts["secret_label"] += replacements
return text
def redact_value(value: Any, counts: Counter[str]) -> Any:
if isinstance(value, str):
return redact_text(value, counts)
if isinstance(value, list):
return [redact_value(item, counts) for item in value]
if isinstance(value, dict):
return {
redact_text(key, counts) if isinstance(key, str) else key: redact_value(item, counts)
for key, item in value.items()
}
return value
def short_hash(value: str) -> str:
return hashlib.sha256(value.encode("utf-8")).hexdigest()[:16]
def read_jsonl(path: Path) -> list[dict[str, Any]]:
rows: list[dict[str, Any]] = []
for line in path.read_text(encoding="utf-8").splitlines():
if line.strip():
rows.append(json.loads(line))
return rows
def classify_task(message: str) -> dict[str, Any]:
for needle, task in TASK_SUMMARIES.items():
if needle in message:
return task
return {
"task_id": f"task_{short_hash(message)}",
"summary": redact_text(message[:240]).strip(),
"public_outputs": [],
}
def tool_summary(name: str, arguments: str) -> tuple[str, str]:
if name == "exec_command":
try:
args = json.loads(arguments)
cmd = args.get("cmd", "")
except Exception:
cmd = arguments
redacted = redact_text(cmd)
sensitive_markers = (
"github_pat_",
"ghp_",
"gho_",
"hf_",
"sk-",
"OPENAI_API_KEY",
"HF_TOKEN",
"Authorization",
"PRIVATE KEY",
"password\\s*=",
"token\\s*=",
)
if any(marker.lower() in redacted.lower() for marker in sensitive_markers):
return "shell", "secret scan command"
if ".env.example" in redacted:
return "shell", "inspect example environment file"
try:
parts = shlex.split(redacted)
except ValueError:
parts = redacted.split()
if not parts:
return "shell", ""
executable = parts[0]
if executable in {"git", "gh", "hf", "curl", "uv", "rg", "sed", "sqlite3", "python3"}:
return "shell", " ".join(parts[:6])
return "shell", executable
if name == "apply_patch":
return "edit", "apply_patch"
if name in {"web_search_call", "web_search_end"}:
return "web", name
return "tool", name
def extract_exit_code(output: str) -> int | None:
match = re.search(r"Process exited with code (-?\d+)", output)
return int(match.group(1)) if match else None
def is_relevant_public_ref(ref: str) -> bool:
if "\\" in ref or "${" in ref:
return False
parsed = urlparse(ref)
host = parsed.netloc.lower()
path = parsed.path
if host == "build-small-hackathon-lost-found-desk.hf.space":
return True
if host == "build-small-hackathon-field-guide.hf.space":
return path in {"/", "/submit"}
if host == "youtu.be" and path == "/AsOM7K0tL-s":
return True
if host == "x.com" and (ref == X_POST_URL or path == "/buildsmall"):
return True
if host == "github.com":
return path.startswith("/JacobLinCool/Lost-n-Found-Desk")
if host == "huggingface.co":
return (
path.startswith("/spaces/build-small-hackathon/lost-found-desk")
or path.startswith("/datasets/build-small-hackathon/lost-found-desk-codex-traces")
or path.startswith("/collections/build-small-hackathon/lost-and-found-desk-6a2ec0551c48861e92dd8443")
or path in {"/openbmb/MiniCPM-V-4.6", "/openbmb/MiniCPM5-1B", "/nvidia/llama-nemotron-embed-vl-1b-v2"}
or path.startswith("/docs/")
or path == "/BuildSmall"
)
return False
def extract_public_refs(text: str) -> list[str]:
refs: set[str] = set()
for raw in re.findall(r"https?://[^\s<>\]\)\"'`]+", text):
ref = raw.rstrip(".,;:,。\\")
if is_relevant_public_ref(ref):
refs.add(ref)
return sorted(refs)
def build_dataset(rollout_path: Path, out_dir: Path, collection_url: str | None = None) -> None:
rows = read_jsonl(rollout_path)
source_digest = hashlib.sha256(rollout_path.read_bytes()).hexdigest()
session_meta = next((r.get("payload", {}) for r in rows if r.get("type") == "session_meta"), {})
thread_id = session_meta.get("id") or rollout_path.stem.rsplit("-", 1)[-1]
created_at = session_meta.get("timestamp")
turns: list[dict[str, Any]] = []
tool_events: list[dict[str, Any]] = []
task_by_sequence: dict[int, str] = {}
current_task = ""
task_sequence = -1
tool_call_by_id: dict[str, dict[str, Any]] = {}
counters: Counter[str] = Counter()
for event_index, row in enumerate(rows):
timestamp = row.get("timestamp")
payload = row.get("payload") or {}
payload_type = payload.get("type") if isinstance(payload, dict) else None
if payload_type == "user_message":
message = str(payload.get("message") or payload.get("content") or "")
task_sequence += 1
task = classify_task(message)
current_task = task["task_id"]
task_by_sequence[task_sequence] = current_task
turns.append(
{
"project": PROJECT_NAME,
"thread_id_hash": short_hash(thread_id),
"source_session_digest": source_digest[:16],
"turn_index": task_sequence,
"task_id": current_task,
"timestamp": timestamp,
"role": "user",
"summary": task["summary"],
"message_sha256": hashlib.sha256(message.encode("utf-8")).hexdigest(),
"public_outputs": task["public_outputs"],
"privacy_level": "derived_summary",
}
)
elif payload_type == "task_complete" and current_task:
final = str(payload.get("last_agent_message") or "")
public_refs = extract_public_refs(final)
turns.append(
{
"project": PROJECT_NAME,
"thread_id_hash": short_hash(thread_id),
"source_session_digest": source_digest[:16],
"turn_index": task_sequence,
"task_id": current_task,
"timestamp": timestamp,
"role": "assistant",
"summary": redact_text(final[:360]).strip(),
"message_sha256": hashlib.sha256(final.encode("utf-8")).hexdigest(),
"public_outputs": public_refs,
"privacy_level": "redacted_excerpt",
}
)
elif payload_type in {"function_call", "custom_tool_call"}:
name = str(payload.get("name") or "unknown")
arguments = str(payload.get("arguments") or payload.get("input") or "")
category, summary = tool_summary(name, arguments)
call_id = str(payload.get("call_id") or f"event-{event_index}")
counters[f"tool.{name}"] += 1
event = {
"project": PROJECT_NAME,
"thread_id_hash": short_hash(thread_id),
"source_session_digest": source_digest[:16],
"event_index": event_index,
"task_id": current_task,
"timestamp": timestamp,
"event_type": "tool_call",
"tool_name": name,
"tool_category": category,
"summary": summary,
"call_id_hash": short_hash(call_id),
"exit_code": None,
"success": None,
"public_refs": extract_public_refs(arguments),
"privacy_level": "sanitized_metadata",
}
tool_call_by_id[call_id] = event
tool_events.append(event)
elif payload_type in {"function_call_output", "custom_tool_call_output", "patch_apply_end"}:
call_id = str(payload.get("call_id") or f"event-{event_index}")
output = str(payload.get("output") or payload.get("stdout") or "")
success = payload.get("success")
exit_code = extract_exit_code(output)
if success is None and exit_code is not None:
success = exit_code == 0
source = tool_call_by_id.get(call_id)
event = {
"project": PROJECT_NAME,
"thread_id_hash": short_hash(thread_id),
"source_session_digest": source_digest[:16],
"event_index": event_index,
"task_id": current_task,
"timestamp": timestamp,
"event_type": "tool_result",
"tool_name": source["tool_name"] if source else str(payload_type),
"tool_category": source["tool_category"] if source else "tool",
"summary": "result captured; raw output omitted",
"call_id_hash": short_hash(call_id),
"exit_code": exit_code,
"success": success,
"public_refs": extract_public_refs(output),
"privacy_level": "sanitized_metadata",
}
tool_events.append(event)
session_summary = {
"project": PROJECT_NAME,
"thread_id_hash": short_hash(thread_id),
"source_session_digest": source_digest[:16],
"created_at": created_at,
"source_event_count": len(rows),
"derived_turn_rows": len(turns),
"derived_tool_event_rows": len(tool_events),
"task_count": len({t["task_id"] for t in turns}),
"tool_counts": dict(sorted(counters.items())),
"raw_log_policy": "Raw Codex logs were not published. This dataset contains derived, redacted metadata and short summaries only.",
}
out_dir.mkdir(parents=True, exist_ok=True)
for generated_child in ("data", "metadata", "traces"):
child = out_dir / generated_child
if child.exists():
shutil.rmtree(child)
schema_path = out_dir / "schema.json"
if schema_path.exists():
schema_path.unlink()
redaction_report = write_official_codex_trace(rollout_path, out_dir)
session_summary["official_trace_path"] = redaction_report["trace_path"]
session_summary["official_trace_format"] = redaction_report["format"]
session_summary["raw_log_policy"] = "The published trace preserves Codex JSONL event schema for the official Hugging Face Agent Trace Viewer, with public redaction applied to local paths, token-shaped strings, and secret-label strings."
metadata_dir = out_dir / "metadata"
metadata_dir.mkdir(parents=True, exist_ok=True)
write_json(metadata_dir / "session_summary.json", session_summary)
write_json(metadata_dir / "public_artifacts.json", PUBLIC_ARTIFACTS)
write_json(metadata_dir / "derived_turns.json", turns)
write_json(metadata_dir / "redaction_report.json", redaction_report)
(out_dir / "README.md").write_text(dataset_card(session_summary, collection_url), encoding="utf-8")
def write_jsonl(path: Path, rows: list[dict[str, Any]]) -> None:
path.write_text(
"".join(json.dumps(row, ensure_ascii=False, sort_keys=True) + "\n" for row in rows),
encoding="utf-8",
)
def write_json(path: Path, value: Any) -> None:
path.write_text(json.dumps(value, ensure_ascii=False, indent=2, sort_keys=True) + "\n", encoding="utf-8")
def codex_trace_relative_path(rollout_path: Path) -> Path:
parts = rollout_path.parts
if ".codex" in parts and "sessions" in parts:
session_root_index = parts.index("sessions")
return Path("traces", *parts[session_root_index + 1 :])
return Path("traces", rollout_path.name)
def write_official_codex_trace(rollout_path: Path, out_dir: Path) -> dict[str, Any]:
destination = out_dir / codex_trace_relative_path(rollout_path)
destination.parent.mkdir(parents=True, exist_ok=True)
counts: Counter[str] = Counter()
written = 0
with rollout_path.open("r", encoding="utf-8") as source, destination.open("w", encoding="utf-8") as target:
for line in source:
if not line.strip():
continue
event = json.loads(line)
target.write(json.dumps(redact_value(event, counts), ensure_ascii=False, sort_keys=False) + "\n")
written += 1
return {
"format": "codex-session-jsonl",
"hub_support": "Official Hugging Face Agent Traces viewer format.",
"trace_path": str(destination.relative_to(out_dir)),
"source_sha256": hashlib.sha256(rollout_path.read_bytes()).hexdigest(),
"released_sha256": hashlib.sha256(destination.read_bytes()).hexdigest(),
"source_lines": sum(1 for line in rollout_path.read_text(encoding="utf-8").splitlines() if line.strip()),
"released_lines": written,
"redaction_policy": "Preserve Codex JSONL event schema; redact local absolute paths, token-shaped values, and secret-label strings before public release.",
"redaction_counts": dict(sorted(counts.items())),
"official_docs": HF_AGENT_TRACES_DOCS,
"official_changelog": HF_AGENT_TRACE_CHANGELOG,
}
def dataset_card(summary: dict[str, Any], collection_url: str | None = None) -> str:
generated_at = datetime.now(timezone.utc).isoformat()
collection_url = collection_url or COLLECTION_URL
collection_line = f"- Collection: {collection_url}\n" if collection_url else ""
return f"""---
license: mit
pretty_name: Lost & Found Desk Codex Trace Dataset
tags:
- agent-traces
- codex
- build-small-hackathon
- lost-found-desk
- gradio
language:
- en
- zh
size_categories:
- n<1K
---
# Lost & Found Desk Codex Trace Dataset
This dataset is an official-format Codex trace artifact for the Codex-assisted Build Small hackathon submission of **Lost & Found Desk**.
It follows the Hugging Face Agent Traces guidance: Codex sessions are published as JSONL files under `traces/`, preserving the Codex session event schema so the Hub trace viewer can open the session. For public release, the trace is redacted in-place: local absolute paths, token-shaped strings, and secret-label strings are replaced while keeping the JSONL structure intact.
## Public Project Links
- Space: {SPACE_REPO}
- Live app: {APP_URL}
- Article: {ARTICLE_URL}
- Demo video: {DEMO_URL}
- GitHub: {PROJECT_REPO}
- X post: {X_POST_URL}
- Trace dataset: {DATASET_URL}
{collection_line}
## Files
- `{summary["official_trace_path"]}`: Codex-compatible JSONL session trace for the Hugging Face Agent Trace Viewer.
- `metadata/session_summary.json`: aggregate counts and source digest.
- `metadata/public_artifacts.json`: public resources connected to the submission.
- `metadata/derived_turns.json`: compact task-level summaries for quick review.
- `metadata/redaction_report.json`: source/released digests and redaction counts.
## Provenance
- Generated at: `{generated_at}`
- Source digest prefix: `{summary["source_session_digest"]}`
- Source event count: `{summary["source_event_count"]}`
- Derived turn rows: `{summary["derived_turn_rows"]}`
- Derived tool-event rows: `{summary["derived_tool_event_rows"]}`
- Raw log policy: {summary["raw_log_policy"]}
- Official trace docs: {HF_AGENT_TRACES_DOCS}
- Changelog: {HF_AGENT_TRACE_CHANGELOG}
## Intended Use
The dataset provides viewer-compatible public evidence of the Codex-assisted submission workflow: requirement audit, deployment hardening, documentation, social/demo link updates, article revision, trace publication, collection creation, and verification. It is suitable for judging, reproducibility review, and lightweight study of agent-assisted release preparation.
## Limitations
This is a public redacted trace, not a private forensic archive. It should not be used to reconstruct local machine state or secrets. The redaction preserves the Codex event schema but may replace private path and secret-marker text inside prompts, tool inputs, or tool outputs.
"""
def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("--rollout", type=Path, required=True)
parser.add_argument("--out", type=Path, default=Path("codex_trace_dataset"))
parser.add_argument("--collection-url", default=None)
args = parser.parse_args()
build_dataset(args.rollout.expanduser(), args.out, args.collection_url)
if __name__ == "__main__":
main()