Spaces:
Running on Zero
Running on Zero
| import json | |
| from pathlib import Path | |
| from scripts.publish_codex_trace_dataset import RedactionResult, TextCaps, build_dataset | |
class FakePrivacyRedactor:
    """Deterministic stand-in for a privacy redactor, for use in tests.

    Only two literals are recognised: the name ``Alice Smith`` and the
    address ``alice@example.com``.
    """

    def redact_many(self, texts: list[str]) -> list[RedactionResult]:
        """Redact each entry of *texts* and return one result per entry, in order.

        For every text, ``count`` is the number of occurrences of the two
        known literals in the unmodified input, and ``labels`` is
        ``{"PRIVATE": count}`` when anything matched, otherwise an empty dict.
        """

        def redact_one(raw: str) -> RedactionResult:
            # Count on the raw input, before any placeholder is substituted.
            hits = sum(raw.count(needle) for needle in ("Alice Smith", "alice@example.com"))
            scrubbed = raw.replace("Alice Smith", "[PRIVATE_PERSON]").replace(
                "alice@example.com", "[PRIVATE_EMAIL]"
            )
            return RedactionResult(
                text=scrubbed,
                count=hits,
                labels={"PRIVATE": hits} if hits else {},
            )

        return [redact_one(raw) for raw in texts]
def write_jsonl(path: Path, records: list[dict]) -> None:
    """Write *records* to *path* as UTF-8 JSON Lines.

    Each record is serialised with ``ensure_ascii=False`` and followed by a
    single newline, so the file ends with a trailing newline whenever it has
    content. An empty *records* list produces an empty file instead of a file
    holding one blank line, which would not be a parseable JSONL row.

    Args:
        path: Destination file; it is created or overwritten.
        records: JSON-serialisable dictionaries, written in order.
    """
    # Terminating every record (rather than joining and appending a newline)
    # is what keeps the empty-input case from emitting a stray blank line.
    payload = "".join(
        json.dumps(record, ensure_ascii=False) + "\n" for record in records
    )
    path.write_text(payload, encoding="utf-8")
def test_codex_trace_dataset_selects_minimizes_and_redacts(tmp_path: Path) -> None:
    """Run ``build_dataset`` on one synthetic session and check the published output.

    An eight-record session file is written under ``tmp_path``. Its user
    prompt carries a person name, an e-mail address, a path under the current
    home directory and an ``hf_`` token; its tool output is 120 characters
    long while ``tool_output`` is capped at 80. The test then checks the
    manifest counters and verifies which strings do and do not appear in
    ``codex_sessions.jsonl``.
    """
    project_root = tmp_path / "hackathon-advisor"
    project_root.mkdir()
    session_root = tmp_path / "sessions"
    session_root.mkdir()
    session_file = session_root / "rollout-test.jsonl"
    home_secret_path = str(Path.home() / "Documents" / "private-note.txt")
    token = "hf_" + "a" * 24

    root_str = str(project_root)
    # The same prompt is sent both as an event message and as a response item.
    user_prompt = (
        f"Help Alice Smith at alice@example.com using {home_secret_path} "
        f"and HF_TOKEN={token}"
    )

    session_records = [
        {
            "type": "session_meta",
            "timestamp": "2026-06-08T00:00:00Z",
            "payload": {
                "id": "session-1",
                "cwd": root_str,
                "originator": "Codex Desktop",
                "base_instructions": {"do_not_publish": True},
                "dynamic_tools": ["internal"],
                "git": {"repository_url": "https://github.com/example/hackathon-advisor.git"},
            },
        },
        {
            "type": "turn_context",
            "timestamp": "2026-06-08T00:00:01Z",
            "payload": {
                "turn_id": "turn-1",
                "cwd": root_str,
                "workspace_roots": [root_str],
                "collaboration_mode": {"mode": "default", "settings": "internal"},
            },
        },
        {
            "type": "event_msg",
            "timestamp": "2026-06-08T00:00:02Z",
            "payload": {
                "type": "user_message",
                "turn_id": "turn-1",
                "message": user_prompt,
            },
        },
        {
            "type": "response_item",
            "timestamp": "2026-06-08T00:00:03Z",
            "payload": {
                "type": "message",
                "role": "developer",
                "content": [{"type": "input_text", "text": "internal prompt"}],
            },
        },
        {
            "type": "response_item",
            "timestamp": "2026-06-08T00:00:04Z",
            "payload": {
                "type": "message",
                "role": "user",
                "content": [{"type": "input_text", "text": user_prompt}],
            },
        },
        {
            "type": "response_item",
            "timestamp": "2026-06-08T00:00:05Z",
            "payload": {
                "type": "function_call",
                "name": "exec_command",
                "arguments": json.dumps({"cmd": "pytest", "workdir": root_str}),
                "call_id": "call-1",
            },
        },
        {
            "type": "response_item",
            "timestamp": "2026-06-08T00:00:06Z",
            "payload": {
                "type": "function_call_output",
                "call_id": "call-1",
                # 120 characters, longer than the tool_output cap used below.
                "output": "0123456789" * 12,
            },
        },
        {
            "type": "compacted",
            "timestamp": "2026-06-08T00:00:07Z",
            "payload": {"replacement_history": ["internal"]},
        },
    ]
    write_jsonl(session_file, session_records)

    out_dir = tmp_path / "dataset"
    caps = TextCaps(message=200, tool_argument=200, tool_output=80, other=200)
    manifest = build_dataset(
        project_root=project_root,
        session_roots=[session_root],
        include_terms=[],
        out_dir=out_dir,
        redactor=FakePrivacyRedactor(),
        privacy_model_id="openai/privacy-filter",
        privacy_model_revision="test",
        privacy_device="test",
        min_score=0.5,
        record_batch_size=2,
        text_caps=caps,
    )

    published_lines = (out_dir / "codex_sessions.jsonl").read_text(encoding="utf-8").splitlines()
    rows = list(map(json.loads, published_lines))
    # Re-serialise without ASCII escaping so substring checks see raw text.
    dataset_text = "\n".join(json.dumps(row, ensure_ascii=False) for row in rows)

    expected_counters = {
        "selected_session_count": 1,
        "published_record_count": 5,
        "dropped_record_count": 3,
        "redaction_count": 2,
        "truncated_field_count": 1,
    }
    for counter, expected in expected_counters.items():
        assert manifest[counter] == expected, counter
    assert manifest["truncated_char_count"] > 0
    assert len(manifest["sessions"][0]["source_sha256"]) == 64
    assert all(row["session_id"] == "session-1" for row in rows)

    must_be_absent = (
        root_str,
        str(Path.home()),
        token,
        "base_instructions",
        "dynamic_tools",
        "internal prompt",
        "replacement_history",
        "role",
        "alice@example.com",
        "Alice Smith",
    )
    for leaked in must_be_absent:
        assert leaked not in dataset_text, leaked

    must_be_present = (
        "$PROJECT_ROOT",
        "[PRIVATE_EMAIL]",
        "[PRIVATE_PERSON]",
        "[truncated",
    )
    for marker in must_be_present:
        assert marker in dataset_text, marker
def test_build_dataset_redacts_caller_home_when_run_home_differs(tmp_path: Path, monkeypatch) -> None:
    """The caller's home directory is redacted even when ``Path.home()`` differs.

    This simulates the Modal container, where ``Path.home()`` is ``/root``
    rather than the home directory on the user's machine. The caller's real
    home therefore has to reach ``build_dataset`` through
    ``path_redaction_prefixes``; the test guards the unified ``--location``
    code path that passes ``[project, caller-home]`` on both lanes.
    """
    project_root = tmp_path / "hackathon-advisor"
    project_root.mkdir()
    session_root = tmp_path / "sessions"
    session_root.mkdir()

    caller_home = "/home/realuser"
    secret_path = f"{caller_home}/Documents/private-note.txt"

    meta_record = {
        "type": "session_meta",
        "timestamp": "2026-06-08T00:00:00Z",
        "payload": {
            "id": "session-1",
            "cwd": str(project_root),
            "git": {"repository_url": "https://github.com/example/hackathon-advisor.git"},
        },
    }
    prompt_record = {
        "type": "event_msg",
        "timestamp": "2026-06-08T00:00:01Z",
        "payload": {
            "type": "user_message",
            "turn_id": "turn-1",
            "message": f"please open {secret_path} for the hackathon-advisor project",
        },
    }
    write_jsonl(session_root / "rollout-test.jsonl", [meta_record, prompt_record])

    def container_home() -> Path:
        # Home directory as seen inside the container, not the caller's.
        return Path("/root")

    monkeypatch.setattr(Path, "home", staticmethod(container_home))

    out_dir = tmp_path / "dataset"
    caps = TextCaps(message=200, tool_argument=200, tool_output=80, other=200)
    manifest = build_dataset(
        project_root=project_root,
        session_roots=[session_root],
        include_terms=[],
        out_dir=out_dir,
        redactor=FakePrivacyRedactor(),
        privacy_model_id="openai/privacy-filter",
        privacy_model_revision="test",
        privacy_device="test",
        min_score=0.5,
        record_batch_size=2,
        text_caps=caps,
        path_redaction_prefixes=[caller_home, str(project_root)],
    )

    published = out_dir / "codex_sessions.jsonl"
    dataset_text = published.read_text(encoding="utf-8")

    assert manifest["published_record_count"] >= 1
    # The raw home prefix must be gone, with "~" standing in for it.
    assert caller_home not in dataset_text
    assert "~/Documents/private-note.txt" in dataset_text