|
|
|
|
|
|
|
|
import json |
|
|
from pathlib import Path |
|
|
|
|
|
# Keys every sample object must carry at its top level.
REQUIRED_TOP_LEVEL = {
    "input_envelope",
    "output",
}

# Keys that must be present inside each sample's "output" object.
REQUIRED_OUTPUT_KEYS = {
    "glyphic",
    "realized",
}
|
|
|
|
|
|
|
|
def validate_jsonl(path: Path):
    """Validate a JSON Lines file and print a report to stdout.

    Each non-blank line is parsed as JSON and checked for:

    * being a JSON object,
    * containing every key in ``REQUIRED_TOP_LEVEL``,
    * having an ``output`` object with every key in ``REQUIRED_OUTPUT_KEYS``,
    * having a string ``input_envelope``.

    Every problem is printed as ``[error] line N: ...`` and counted; a single
    line can contribute more than one error. A summary line with the number
    of samples and errors is printed at the end.

    Args:
        path: Location of the JSONL file. A plain string is accepted too.

    Returns:
        None. Results are reported on stdout only.
    """
    path = Path(path)  # tolerate str paths as well as Path objects
    errors = 0
    total = 0

    with path.open("r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Blank lines are skipped and not counted as samples.
                continue

            total += 1

            try:
                obj = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"[error] line {line_num}: invalid JSON: {e}")
                errors += 1
                continue

            # Valid JSON may still be a list, number, string, ...; calling
            # .keys() on those would raise AttributeError and abort the run.
            if not isinstance(obj, dict):
                print(
                    f"[error] line {line_num}: expected a JSON object, "
                    f"got {type(obj).__name__}"
                )
                errors += 1
                continue

            missing = REQUIRED_TOP_LEVEL - obj.keys()
            if missing:
                print(f"[error] line {line_num}: missing top-level keys: {missing}")
                errors += 1
                # Without the required keys the checks below cannot run.
                continue

            out = obj["output"]
            if not isinstance(out, dict):
                # Same guard as above: only objects expose .keys().
                print(f"[error] line {line_num}: output must be an object")
                errors += 1
            else:
                missing_out = REQUIRED_OUTPUT_KEYS - out.keys()
                if missing_out:
                    print(f"[error] line {line_num}: missing output keys: {missing_out}")
                    errors += 1

            # Checked even when the output check failed, so one line can
            # report both problems.
            if not isinstance(obj["input_envelope"], str):
                print(f"[error] line {line_num}: input_envelope must be a string")
                errors += 1

    print(f"Validated {total} samples, errors: {errors}")
|
|
|
|
|
|