Spaces:
Sleeping
Sleeping
| """Dataset loading and validation helpers.""" | |
| from __future__ import annotations | |
| import json | |
| from dataclasses import dataclass | |
| from pathlib import Path | |
| from typing import Any, Iterable | |
| from config import DATA_DIR, REPO_ROOT | |
| from data.schemas import ALLOWED_LABELS, clean_defects, label_counts | |
# Default JSONL file read by the loaders in this module.
TRAINING_JSONL = DATA_DIR / "training_data.jsonl"

# Default directory searched first when resolving relative image names.
# The "FilmDamageSimulator" component appears twice in the path on purpose:
# it is repeated in the original path expression.
FDS_SCANS_DIR = DATA_DIR.joinpath(
    "raw",
    "FilmDamageSimulator",
    "FilmDamageSimulator",
    "scans",
)
@dataclass
class DatasetIssue:
    """A single problem found while validating a dataset entry.

    Instances are built with keyword arguments (``image=...``,
    ``message=...``) and serialized through their instance ``__dict__``,
    so the class must be a regular (non-slots) dataclass: without the
    decorator the bare annotations define no ``__init__`` and construction
    raises ``TypeError``.
    """

    # Image name the issue refers to, or the placeholder "(missing)" when
    # the entry carries no image name.
    image: str
    # Human-readable description of the problem.
    message: str
def load_jsonl(path: str | Path = TRAINING_JSONL) -> list[dict[str, Any]]:
    """Read a JSON Lines file and return its decoded rows.

    Args:
        path: File to read; defaults to ``TRAINING_JSONL``.

    Returns:
        One decoded value per non-blank line, in file order. An empty list
        is returned when the file does not exist.

    Raises:
        ValueError: If a non-blank line is not valid JSON. The message
            carries the file path and the 1-based line number.
    """
    source = Path(path)
    if not source.exists():
        return []

    records: list[dict[str, Any]] = []
    with source.open("r", encoding="utf-8") as handle:
        for number, raw in enumerate(handle, start=1):
            text = raw.strip()
            if text:
                try:
                    record = json.loads(text)
                except json.JSONDecodeError as exc:
                    raise ValueError(f"{source}:{number}: invalid JSONL row: {exc}") from exc
                records.append(record)
    return records
def resolve_image_path(entry: dict[str, Any], scans_dir: Path = FDS_SCANS_DIR) -> Path:
    """Turn an entry's ``"image"`` value into a filesystem path.

    Resolution order:
      1. An absolute image path is returned unchanged.
      2. ``scans_dir / image`` is returned if it exists on disk.
      3. Otherwise ``REPO_ROOT / image`` is returned, whether or not it exists.

    Args:
        entry: Dataset row; its ``"image"`` value is converted with ``str``
            (a missing key is treated as the empty string).
        scans_dir: Directory tried first for relative image names.

    Returns:
        The resolved path. Existence is only checked for the ``scans_dir``
        candidate.
    """
    name = str(entry.get("image", ""))
    direct = Path(name)
    if direct.is_absolute():
        return direct

    in_scans = scans_dir / name
    return in_scans if in_scans.exists() else REPO_ROOT / name
def validate_entries(
    entries: Iterable[dict[str, Any]],
    *,
    require_images: bool = False,
    scans_dir: Path = FDS_SCANS_DIR,
) -> list[DatasetIssue]:
    """Collect validation problems for a sequence of dataset entries.

    For each entry, in this order, an issue is recorded when:
      * the ``"image"`` value is empty ("missing image field");
      * ``require_images`` is set and the resolved image path does not exist;
      * ``clean_defects`` reports a truthy dropped value for the entry's
        ``"annotations"``;
      * a cleaned defect has a ``"label"`` outside ``ALLOWED_LABELS``.

    Args:
        entries: Dataset rows to check.
        require_images: When true, also verify that each named image exists.
        scans_dir: Directory passed to ``resolve_image_path``.

    Returns:
        All issues found, in encounter order. Entries without an image name
        are reported under the placeholder ``"(missing)"``.
    """
    found: list[DatasetIssue] = []
    for record in entries:
        image = str(record.get("image", ""))
        # Name used in annotation-related issues when no image name is present.
        shown = image or "(missing)"

        if not image:
            found.append(DatasetIssue(image="(missing)", message="missing image field"))
        elif require_images and not resolve_image_path(record, scans_dir).exists():
            found.append(DatasetIssue(image=image, message="image file does not exist"))

        kept, dropped = clean_defects(record.get("annotations", []))
        if dropped:
            found.append(
                DatasetIssue(image=shown, message=f"{dropped} invalid annotations")
            )

        found.extend(
            DatasetIssue(image=shown, message=f"unknown label {defect['label']}")
            for defect in kept
            if defect["label"] not in ALLOWED_LABELS
        )
    return found
def dataset_summary(entries: Iterable[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate counts over a collection of dataset entries.

    Args:
        entries: Dataset rows; consumed once and materialized into a list.

    Returns:
        A dict with these keys:
          * ``"images"``: number of entries;
          * ``"defects"``: number of defects kept by ``clean_defects``;
          * ``"dropped_annotations"``: sum of the dropped counts reported
            by ``clean_defects``;
          * ``"label_counts"``: result of ``label_counts`` over all kept
            defects;
          * ``"sources"``: entries per ``"source"`` value (``"unknown"``
            when absent), with keys in sorted order.
    """
    records = list(entries)
    per_source: dict[str, int] = {}
    kept_defects: list[dict[str, Any]] = []
    rejected = 0

    for record in records:
        origin = str(record.get("source", "unknown"))
        per_source[origin] = per_source.get(origin, 0) + 1

        kept, bad = clean_defects(record.get("annotations", []))
        kept_defects += kept
        rejected += bad

    by_label = label_counts(kept_defects)
    ordered_sources = dict(sorted(per_source.items()))
    return {
        "images": len(records),
        "defects": len(kept_defects),
        "dropped_annotations": rejected,
        "label_counts": by_label,
        "sources": ordered_sources,
    }
def load_training_summary(path: str | Path = TRAINING_JSONL) -> dict[str, Any]:
    """Load a JSONL dataset and return its summary plus validation issues.

    Args:
        path: JSONL file to read; defaults to ``TRAINING_JSONL``.

    Returns:
        The dict produced by ``dataset_summary`` with an added ``"issues"``
        key: a list holding the attribute dict of each issue reported by
        ``validate_entries``. Image existence is not checked.
    """
    rows = load_jsonl(path)
    report = dataset_summary(rows)
    problems = validate_entries(rows, require_images=False)
    report["issues"] = [vars(problem) for problem in problems]
    return report
# Public API of this module, as exported by ``from <module> import *``.
__all__ = [
    # Types and constants
    "DatasetIssue",
    "FDS_SCANS_DIR",
    "TRAINING_JSONL",
    # Functions
    "dataset_summary",
    "load_jsonl",
    "load_training_summary",
    "resolve_image_path",
    "validate_entries",
]