noticecheck / traces /scripts /validate_traces.py
kingabzpro's picture
Improve privacy-safe trace quality
e4f211a
Raw
History Blame Contribute Delete
1.95 kB
"""Validate privacy-safe trace JSONL files."""
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
# Directory two levels above the directory that contains this script.
ROOT = Path(__file__).resolve().parents[2]
# Parent of the directory that contains this script; the default sample
# file used by main() is looked up beneath it.
TRACE_DIR = Path(__file__).resolve().parents[1]
# ROOT must be on the import path *before* the `traces.runtime` import below.
# NOTE(review): presumably this lets the script run directly without the
# package being installed -- confirm.
sys.path.insert(0, str(ROOT))
from traces.runtime import validate_trace
def validate_file(path: Path) -> tuple[int, list[str]]:
    """Validate every record in one JSONL trace file.

    Each non-blank line is parsed as JSON, checked for a string ``trace_id``
    that already appeared earlier in the same file, and then passed to
    ``validate_trace``.

    Args:
        path: JSONL file to read; it is decoded as UTF-8.

    Returns:
        A ``(count, errors)`` pair. ``count`` is the number of non-blank
        lines seen (lines that fail to parse are included). ``errors`` holds
        one ``"<path>:<line>: <message>"`` string per problem found.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    count = 0
    errors: list[str] = []
    seen_ids: set[str] = set()

    # Iterate the file object instead of using ``str.splitlines()``: the
    # latter also breaks on characters such as U+2028, U+2029 and U+0085,
    # which may appear unescaped inside a JSON string and would cut a single
    # record into several bogus lines.
    with path.open(encoding="utf-8") as handle:
        for line_number, line in enumerate(handle, start=1):
            if not line.strip():
                continue
            count += 1

            try:
                record = json.loads(line)
            except json.JSONDecodeError as exc:
                errors.append(f"{path}:{line_number}: invalid JSON: {exc}")
                continue

            trace_id = record.get("trace_id") if isinstance(record, dict) else None
            # Only string IDs take part in duplicate detection. Testing the
            # type first keeps an unhashable value (list/dict) from raising
            # TypeError on the set lookup; the record is still handed to
            # validate_trace below.
            if isinstance(trace_id, str):
                if trace_id in seen_ids:
                    errors.append(
                        f"{path}:{line_number}: Duplicate trace ID: {trace_id}"
                    )
                else:
                    seen_ids.add(trace_id)

            errors.extend(
                f"{path}:{line_number}: {error}" for error in validate_trace(record)
            )

    return count, errors
def main() -> int:
    """Command-line entry point: validate trace files and report problems.

    Positional arguments are paths to JSONL files. When none are given, the
    file ``data/trace_samples.jsonl`` under ``TRACE_DIR`` is checked.

    Returns:
        ``0`` when no errors were found (a summary line is printed to
        stdout); ``1`` otherwise (all errors are printed to stderr).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("paths", nargs="*", type=Path)
    args = parser.parse_args()
    paths = args.paths or [TRACE_DIR / "data" / "trace_samples.jsonl"]

    total = 0
    all_errors: list[str] = []
    for path in paths:
        if not path.exists():
            all_errors.append(f"{path}: file does not exist")
            continue
        # A directory passes the exists() check but cannot be read as a file;
        # report it instead of crashing. FIFOs and device files such as
        # /dev/stdin are deliberately still allowed through.
        if path.is_dir():
            all_errors.append(f"{path}: is a directory, not a file")
            continue
        try:
            count, errors = validate_file(path)
        except (OSError, UnicodeDecodeError) as exc:
            # Unreadable or non-UTF-8 input is reported like any other
            # validation failure so the remaining paths are still checked.
            all_errors.append(f"{path}: could not read file: {exc}")
            continue
        total += count
        all_errors.extend(errors)

    if all_errors:
        print("\n".join(all_errors), file=sys.stderr)
        return 1
    print(f"Validated {total} trace records.")
    return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())