Spaces:
Sleeping
Sleeping
| """Build the 320x320 CheXVision dataset from the raw NIH source repo.""" | |
| from __future__ import annotations | |
| import hashlib | |
| import io | |
| import os | |
| import shutil | |
| import zipfile | |
| from dataclasses import dataclass, field | |
| from pathlib import Path | |
| from src.utils.hub import configure_hf_runtime | |
# Hub dataset repos: where the raw archives are read from and where the
# resized Parquet shards are published.
SOURCE_REPO = "alkzar90/NIH-Chest-X-ray-dataset"
TARGET_REPO = "arudaev/chest-xray-14-320"

# Number of source archives; also used as the shard total in Parquet names.
NUM_ZIPS = 12
# Default size handed to the image resize step.
TARGET_SIZE = (320, 320)
# Suffixes (matched against lower-cased basenames) that count as images.
IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg")

# Free space required on top of a file's own size before it is downloaded.
DOWNLOAD_HEADROOM_BYTES = 256 << 20  # 256 MiB

# Split sizes that both the manifests and a full run must reproduce exactly.
EXPECTED_SPLIT_COUNTS = dict(train=77_967, validation=8_557, test=25_596)

# Upper bounds on how many diagnostic samples are retained for the final report.
MAX_UNREADABLE_SAMPLES = 25
MAX_UNKNOWN_SAMPLES = 20
@dataclass
class PipelineConfig:
    """Runtime configuration for the resize-and-publish pipeline.

    Declared as a dataclass so it can be built with keyword arguments (as
    ``build_config`` does) while every omitted field falls back to the
    module-level default.
    """

    # Hub dataset repo the raw ZIP archives and manifests are downloaded from.
    source_repo: str = SOURCE_REPO
    # Hub dataset repo that receives the Parquet shards and dataset card.
    target_repo: str = TARGET_REPO
    # Total number of source archives; also the shard total in Parquet names.
    num_zips: int = NUM_ZIPS
    # Size passed to the image resize step.
    target_size: tuple[int, int] = TARGET_SIZE
    # Debug cap on how many archives are processed (None = no cap).
    max_zips: int | None = None
    # Debug cap on how many image members are read per archive (None = no cap).
    max_images_per_zip: int | None = None
    # When True, shards and the README are written locally instead of uploaded.
    skip_upload: bool = False
    # True when the KAGGLE_KERNEL_RUN_TYPE environment variable is set.
    is_kaggle: bool = False
    # Staging directory; it is wiped at the start of every run.
    work_dir: Path = Path(".codex_tmp/resize_320")
@dataclass
class PipelineStats:
    """Accumulated counters and diagnostics for the run.

    Must be a dataclass: the ``field(default_factory=...)`` declarations only
    become per-instance containers when the decorator processes them.
    """

    # Images kept per split.
    split_counts: dict[str, int] = field(
        default_factory=lambda: {"train": 0, "validation": 0, "test": 0}
    )
    # Images kept across all splits.
    total_images: int = 0
    # Members skipped because they could not be decoded.
    unreadable_total: int = 0
    # Members skipped because they appear in neither split manifest.
    unknown_total: int = 0
    # Capped sample of (zip name, member name, reason) for unreadable members.
    unreadable_samples: list[tuple[str, str, str]] = field(default_factory=list)
    # Capped sample of "zip:member" strings for unknown-split members.
    unknown_samples: list[str] = field(default_factory=list)
    # Combined on-disk size of every Parquet shard written.
    parquet_bytes: int = 0
    # File names of every Parquet shard written, in write order.
    written_parquets: list[str] = field(default_factory=list)
def _parse_bool_env(name: str) -> bool:
    """Report whether environment variable *name* holds a truthy flag.

    Matching ignores case and surrounding whitespace. Only ``1``, ``true``,
    ``yes`` and ``on`` are truthy; an unset variable or any other value
    yields ``False``.
    """
    raw = os.environ.get(name, "")
    normalized = raw.strip().lower()
    return normalized in ("1", "true", "yes", "on")
def _parse_int_env(name: str) -> int | None:
    """Parse a positive integer environment variable when present.

    Args:
        name: Environment variable to read.

    Returns:
        The parsed value, or ``None`` when the variable is unset or blank.

    Raises:
        RuntimeError: If the value is not an integer or is not strictly
            positive. The message names the offending variable.
    """
    value = os.environ.get(name, "").strip()
    if not value:
        return None
    try:
        parsed = int(value)
    except ValueError as exc:
        # Report malformed text the same way as an out-of-range number so the
        # caller always learns which variable is wrong.
        raise RuntimeError(
            f"{name} must be a positive integer, got {value!r}."
        ) from exc
    if parsed <= 0:
        raise RuntimeError(f"{name} must be a positive integer, got {value!r}.")
    return parsed
def _default_work_dir(is_kaggle: bool) -> Path:
    """Pick the staging directory: a fixed Kaggle path or a local relative one."""
    location = (
        "/kaggle/working/chexvision_resize_320"
        if is_kaggle
        else ".codex_tmp/resize_320"
    )
    return Path(location)
def build_config() -> PipelineConfig:
    """Assemble a ``PipelineConfig`` from the process environment.

    Reads ``KAGGLE_KERNEL_RUN_TYPE``, ``CHEXVISION_MAX_ZIPS``,
    ``CHEXVISION_MAX_IMAGES_PER_ZIP`` and ``CHEXVISION_SKIP_UPLOAD``.

    Raises:
        RuntimeError: If a debug limit is set while uploads are still enabled.
    """
    on_kaggle = bool(os.environ.get("KAGGLE_KERNEL_RUN_TYPE"))
    zip_cap = _parse_int_env("CHEXVISION_MAX_ZIPS")
    image_cap = _parse_int_env("CHEXVISION_MAX_IMAGES_PER_ZIP")
    upload_disabled = _parse_bool_env("CHEXVISION_SKIP_UPLOAD")

    # A capped run only produces part of the dataset, so it must stay local.
    debug_limited = bool(zip_cap or image_cap)
    if debug_limited and not upload_disabled:
        raise RuntimeError(
            "Debug limits require CHEXVISION_SKIP_UPLOAD=1 to avoid publishing a "
            "partial dataset."
        )

    settings = {
        "max_zips": zip_cap,
        "max_images_per_zip": image_cap,
        "skip_upload": upload_disabled,
        "is_kaggle": on_kaggle,
        "work_dir": _default_work_dir(on_kaggle),
    }
    return PipelineConfig(**settings)
def dataset_features():
    """Build the ``datasets.Features`` schema shared by every Parquet shard.

    Columns, in order: ``image`` (image feature), ``labels`` (string) and
    ``filename`` (string).
    """
    # Imported lazily so the module can be loaded without `datasets` installed.
    from datasets import Features, Image, Value

    schema = {"image": Image()}
    for column in ("labels", "filename"):
        schema[column] = Value("string")
    return Features(schema)
def stable_bucket(filename: str) -> int:
    """Map *filename* to a reproducible bucket in ``range(10)``.

    The bucket is derived from the SHA-256 of the UTF-8 encoded name, so it
    does not vary between runs or machines.
    """
    hex_digest = hashlib.sha256(filename.encode("utf-8")).hexdigest()
    # 16 hex characters are the leading 8 digest bytes read as a big-endian int.
    return int(hex_digest[:16], 16) % 10
def split_for(filename: str, train_val_files: set[str], test_files: set[str]) -> str:
    """Resolve which split *filename* belongs to.

    Returns ``"test"`` when the name is in *test_files* (checked first),
    ``"train"`` or ``"validation"`` when it is in *train_val_files* (buckets
    0-8 train, bucket 9 validation), and ``"unknown"`` otherwise.
    """
    if filename in test_files:
        return "test"
    if filename not in train_val_files:
        return "unknown"
    return "validation" if stable_bucket(filename) >= 9 else "train"
def normalize_label(raw_label: object) -> str:
    """Return the trimmed label string, defaulting to ``"No Finding"``.

    Non-string values and strings that are empty after trimming both map to
    the default.
    """
    fallback = "No Finding"
    if isinstance(raw_label, str):
        trimmed = raw_label.strip()
        if trimmed:
            return trimmed
    return fallback
def verify_split_contract(train_val_files: set[str], test_files: set[str]) -> dict[str, int]:
    """Check that the manifests reproduce ``EXPECTED_SPLIT_COUNTS``.

    Returns:
        The computed ``train`` / ``validation`` / ``test`` counts.

    Raises:
        RuntimeError: If the computed counts differ from the expected ones.
    """
    # Bucket 9 is validation; every other train/val file is train.
    validation_total = sum(1 for name in train_val_files if stable_bucket(name) >= 9)
    counts = {
        "train": len(train_val_files) - validation_total,
        "validation": validation_total,
        "test": len(test_files),
    }
    if counts == EXPECTED_SPLIT_COUNTS:
        return counts
    raise RuntimeError(
        "Source split contract changed unexpectedly. "
        f"Expected {EXPECTED_SPLIT_COUNTS}, got {counts}."
    )
def prepare_local_work_dir(config: PipelineConfig) -> None:
    """Wipe ``config.work_dir`` if it exists and recreate its ``data`` folder."""
    root = config.work_dir
    if root.exists():
        shutil.rmtree(root)
    root.joinpath("data").mkdir(parents=True, exist_ok=True)
def local_download_dir(config: PipelineConfig, name: str) -> Path:
    """Resolve the download folder called *name* under ``config.work_dir``."""
    return config.work_dir.joinpath(name)
def ensure_disk_headroom(
    download_dir: Path,
    *,
    repo_id: str,
    filename: str,
    repo_type: str,
    token: str,
) -> None:
    """Abort early when *download_dir*'s filesystem cannot hold the next file.

    Creates *download_dir*, asks the Hub for the file size through a dry-run
    download, and compares that size plus ``DOWNLOAD_HEADROOM_BYTES`` with the
    free space reported for *download_dir*. The check is best effort: when the
    dry run fails or reports no size, the function returns without raising.

    Raises:
        RuntimeError: If the free space is below the required amount.
    """
    from huggingface_hub import hf_hub_download

    download_dir.mkdir(parents=True, exist_ok=True)

    probe_request = {
        "repo_id": repo_id,
        "filename": filename,
        "repo_type": repo_type,
        "token": token,
        "dry_run": True,
    }
    try:
        probe = hf_hub_download(**probe_request)
    except Exception as exc:
        # Deliberately broad: a failed preflight must never block the download.
        reason = f"{type(exc).__name__}: {exc}"
        print(f"[resize] Could not preflight {filename} disk usage ({reason}); continuing.")
        return

    # A missing or None size collapses to 0, which skips the check.
    expected_size = int(getattr(probe, "file_size", 0) or 0)
    if expected_size <= 0:
        return

    available = shutil.disk_usage(download_dir).free
    needed = expected_size + DOWNLOAD_HEADROOM_BYTES
    if available < needed:
        raise RuntimeError(
            "Not enough free disk space to continue the resize_320 pipeline. "
            f"Need at least {format_bytes(needed)} free before downloading "
            f"{filename}, but only {format_bytes(available)} is available in "
            f"{download_dir}."
        )

    print(
        f"[resize] Disk preflight for {filename}: need ~{format_bytes(expected_size)} + "
        f"{format_bytes(DOWNLOAD_HEADROOM_BYTES)} headroom, "
        f"free={format_bytes(available)}"
    )
def download_source_file(
    config: PipelineConfig,
    *,
    token: str,
    filename: str,
    download_name: str,
) -> Path:
    """Download *filename* from the source repo into a pipeline-owned folder.

    The folder is ``config.work_dir / download_name``. A disk-space preflight
    runs before the transfer starts.

    Returns:
        Local path of the downloaded file.
    """
    from huggingface_hub import hf_hub_download

    target_dir = local_download_dir(config, download_name)
    # The same coordinates drive both the preflight and the real download.
    request = {
        "repo_id": config.source_repo,
        "filename": filename,
        "repo_type": "dataset",
        "token": token,
    }
    ensure_disk_headroom(target_dir, **request)
    local_path = hf_hub_download(local_dir=str(target_dir), **request)
    return Path(local_path)
def cleanup_local_download_dir(config: PipelineConfig, name: str) -> None:
    """Remove the download folder *name* and everything in it, ignoring errors."""
    doomed = local_download_dir(config, name)
    shutil.rmtree(doomed, ignore_errors=True)
def download_source_metadata(config: PipelineConfig, token: str):
    """Fetch source labels and split manifests from the HF Hub.

    Args:
        config: Pipeline configuration (source repo and work directory).
        token: Hub access token forwarded to every download.

    Returns:
        A ``(label_map, train_val_files, test_files)`` tuple: a dict from the
        CSV's ``Image Index`` to its normalized ``Finding Labels``, and the
        sets of non-blank lines from the two split manifests.
    """
    import pandas as pd

    metadata_dir_name = "source_metadata"

    def fetch(remote_name: str) -> Path:
        """Download one metadata file into the shared metadata folder."""
        return download_source_file(
            config,
            token=token,
            filename=remote_name,
            download_name=metadata_dir_name,
        )

    def read_manifest(remote_name: str) -> set[str]:
        """Download a manifest and return its stripped, non-blank lines."""
        with open(fetch(remote_name), encoding="utf-8") as handle:
            return {line.strip() for line in handle if line.strip()}

    labels_df = pd.read_csv(fetch("data/Data_Entry_2017_v2020.csv"))
    # Walk the two columns directly; iterrows() would build a Series per row.
    label_map = {
        image_name: normalize_label(raw_label)
        for image_name, raw_label in zip(
            labels_df["Image Index"], labels_df["Finding Labels"]
        )
    }

    train_val_files = read_manifest("data/train_val_list.txt")
    test_files = read_manifest("data/test_list.txt")
    return label_map, train_val_files, test_files
def ensure_clean_target_repo(api, config: PipelineConfig) -> None:
    """Make sure the target dataset repo exists and holds no stale artifacts.

    Stale artifacts are ``README.md``, ``load_dataset.py`` and anything under
    ``data/``. They are removed in a single commit; nothing is committed when
    none are present.
    """
    from huggingface_hub import CommitOperationDelete

    repo = {"repo_id": config.target_repo, "repo_type": "dataset"}
    api.create_repo(private=False, exist_ok=True, **repo)

    def is_stale(path: str) -> bool:
        return path in ("README.md", "load_dataset.py") or path.startswith("data/")

    stale_paths = [path for path in api.list_repo_files(**repo) if is_stale(path)]
    if not stale_paths:
        print(f"[resize] Target repo {config.target_repo} is already clean.")
        return

    print(
        f"[resize] Removing {len(stale_paths)} stale file(s) from "
        f"{config.target_repo} ..."
    )
    api.create_commit(
        operations=[CommitOperationDelete(path_in_repo=path) for path in stale_paths],
        commit_message="Reset data-only parquet artifacts before rebuild",
        **repo,
    )
def is_supported_member(info: zipfile.ZipInfo) -> bool:
    """Decide whether a ZIP entry should be treated as a source image.

    Directories, anything under a top-level ``__MACOSX/`` folder, entries with
    an empty basename and ``._`` files are rejected. Everything else is
    accepted when its lower-cased basename ends with one of
    ``IMAGE_EXTENSIONS``.
    """
    if info.is_dir():
        return False

    # Treat backslashes as separators so the basename is extracted correctly.
    normalized = info.filename.replace("\\", "/")
    leaf = Path(normalized).name
    hidden = normalized.startswith("__MACOSX/") or not leaf or leaf.startswith("._")
    if hidden:
        return False
    return leaf.lower().endswith(IMAGE_EXTENSIONS)
def resize_image_payload(data: bytes, target_size: tuple[int, int]) -> dict[str, bytes | None]:
    """Turn raw image bytes into a resized RGB PNG payload.

    Returns:
        ``{"bytes": <PNG bytes>, "path": None}``.
    """
    from PIL import Image as PILImage

    # Use the `Resampling` namespace when Pillow provides it, else the module.
    lanczos = getattr(PILImage, "Resampling", PILImage).LANCZOS

    with PILImage.open(io.BytesIO(data)) as source:
        rgb = source.convert("RGB")
    resized = rgb.resize(target_size, lanczos)

    encoded = io.BytesIO()
    resized.save(encoded, format="PNG")
    return {"bytes": encoded.getvalue(), "path": None}
def write_parquet_shards(
    buckets: dict[str, dict[str, list]],
    output_dir: Path,
    shard_index: int,
    shard_count: int,
    stats: PipelineStats,
) -> None:
    """Write one Parquet shard per non-empty split bucket.

    Files are named ``<split>-<index>-of-<count>.parquet`` with zero-padded
    five-digit numbers. Each written shard adds its size to
    ``stats.parquet_bytes`` and its name to ``stats.written_parquets``.
    """
    from datasets import Dataset

    output_dir.mkdir(parents=True, exist_ok=True)
    schema = dataset_features()
    shard_suffix = f"{shard_index:05d}-of-{shard_count:05d}.parquet"

    for split_name, columns in buckets.items():
        if not columns["filename"]:
            # No rows landed in this split for the current archive.
            continue
        shard_name = f"{split_name}-{shard_suffix}"
        shard_path = output_dir / shard_name
        Dataset.from_dict(columns, features=schema).to_parquet(str(shard_path))
        stats.parquet_bytes += shard_path.stat().st_size
        stats.written_parquets.append(shard_name)
def upload_parquet_shards(api, config: PipelineConfig, parquet_dir: Path, zip_name: str) -> None:
    """Push every file in *parquet_dir* to ``data/`` in the target dataset repo."""
    upload_args = {
        "folder_path": str(parquet_dir),
        "path_in_repo": "data",
        "repo_id": config.target_repo,
        "repo_type": "dataset",
        "commit_message": f"Add 320x320 parquet shards for {zip_name}",
    }
    api.upload_folder(**upload_args)
def process_zip(
    zip_index: int,
    config: PipelineConfig,
    token: str,
    label_map: dict[str, str],
    train_val_files: set[str],
    test_files: set[str],
    stats: PipelineStats,
    api=None,
) -> None:
    """Process one source ZIP into deterministic Parquet shard files.

    Downloads ``images_<zip_index>.zip``, resizes every supported member that
    belongs to a known split, groups the results per split, and writes one
    Parquet shard per non-empty split. Shards stay in ``work_dir/data`` when
    ``config.skip_upload`` is set; otherwise they are written to a temporary
    folder, uploaded through *api*, and removed. The downloaded archive folder
    is always deleted afterwards.

    Args:
        zip_index: 1-based archive number; the shard index is ``zip_index - 1``.
        config: Pipeline configuration.
        token: Hub access token used for the download.
        label_map: Image filename to label string; missing names fall back to
            ``"No Finding"``.
        train_val_files: Filenames listed in the train/validation manifest.
        test_files: Filenames listed in the test manifest.
        stats: Run-wide counters, updated in place.
        api: Hub client used for the upload; required unless
            ``config.skip_upload`` is true.

    Raises:
        RuntimeError: If uploads are enabled but *api* is ``None``.
    """
    from PIL import ImageFile, UnidentifiedImageError

    if not config.skip_upload and api is None:
        # Fail before downloading and resizing the archive instead of at the
        # upload step, where the same mistake surfaced as an AttributeError.
        raise RuntimeError(
            "process_zip requires an `api` client when uploads are enabled."
        )

    # Process-wide Pillow switch: let truncated-but-decodable images load.
    ImageFile.LOAD_TRUNCATED_IMAGES = True

    zip_name = f"images_{zip_index:03d}.zip"
    remote_path = f"data/images/{zip_name}"
    shard_index = zip_index - 1
    download_dir_name = "source_zip"

    print(f"[resize] Processing {zip_name} ({zip_index}/{config.num_zips}) ...")
    zip_path = download_source_file(
        config,
        token=token,
        filename=remote_path,
        download_name=download_dir_name,
    )
    print(f"[resize] Downloaded {remote_path}")

    buckets: dict[str, dict[str, list]] = {
        "train": {"image": [], "labels": [], "filename": []},
        "validation": {"image": [], "labels": [], "filename": []},
        "test": {"image": [], "labels": [], "filename": []},
    }
    zip_unreadable = 0
    zip_unknown = 0

    try:
        with zipfile.ZipFile(zip_path, "r") as archive:
            members = [info for info in archive.infolist() if is_supported_member(info)]
            if config.max_images_per_zip is not None:
                members = members[: config.max_images_per_zip]
            print(f"[resize] Found {len(members):,} candidate image entries in {zip_name}")

            for member_index, info in enumerate(members, start=1):
                # Normalize separators exactly as is_supported_member does, so a
                # backslash-separated entry still resolves to its bare filename.
                filename = Path(info.filename.replace("\\", "/")).name
                split = split_for(filename, train_val_files, test_files)
                if split == "unknown":
                    zip_unknown += 1
                    stats.unknown_total += 1
                    if len(stats.unknown_samples) < MAX_UNKNOWN_SAMPLES:
                        stats.unknown_samples.append(f"{zip_name}:{info.filename}")
                    continue

                try:
                    payload = archive.read(info)
                    if not payload:
                        raise ValueError("empty file")
                    image_payload = resize_image_payload(payload, config.target_size)
                except (UnidentifiedImageError, OSError, ValueError) as exc:
                    # Undecodable members are counted and sampled, never fatal.
                    zip_unreadable += 1
                    stats.unreadable_total += 1
                    if len(stats.unreadable_samples) < MAX_UNREADABLE_SAMPLES:
                        stats.unreadable_samples.append(
                            (zip_name, info.filename, f"{type(exc).__name__}: {exc}")
                        )
                    continue

                buckets[split]["image"].append(image_payload)
                buckets[split]["labels"].append(label_map.get(filename, "No Finding"))
                buckets[split]["filename"].append(filename)
                stats.split_counts[split] += 1
                stats.total_images += 1

                if member_index % 2000 == 0:
                    print(
                        f"[resize] scanned {member_index:,}/{len(members):,} entries "
                        f"in {zip_name}"
                    )

        print(
            "[resize] Zip summary: "
            f"train={len(buckets['train']['filename']):,}, "
            f"validation={len(buckets['validation']['filename']):,}, "
            f"test={len(buckets['test']['filename']):,}, "
            f"skipped_unreadable={zip_unreadable:,}, "
            f"skipped_unknown={zip_unknown:,}"
        )

        if config.skip_upload:
            # Local mode: shards accumulate in work_dir/data across archives.
            parquet_dir = config.work_dir / "data"
            write_parquet_shards(
                buckets=buckets,
                output_dir=parquet_dir,
                shard_index=shard_index,
                shard_count=config.num_zips,
                stats=stats,
            )
        else:
            # Upload mode: stage this archive's shards alone, push, then delete.
            temp_dir = config.work_dir / f"tmp_{zip_name}"
            if temp_dir.exists():
                shutil.rmtree(temp_dir)
            temp_dir.mkdir(parents=True, exist_ok=True)
            try:
                write_parquet_shards(
                    buckets=buckets,
                    output_dir=temp_dir,
                    shard_index=shard_index,
                    shard_count=config.num_zips,
                    stats=stats,
                )
                upload_parquet_shards(api, config, temp_dir, zip_name)
            finally:
                shutil.rmtree(temp_dir, ignore_errors=True)
    finally:
        cleanup_local_download_dir(config, download_dir_name)
        print(f"[resize] Deleted local download dir for {zip_name}")
def format_bytes(num_bytes: int) -> str:
    """Format a byte count with one decimal and a B/KB/MB/GB/TB suffix.

    Each unit step divides by 1024; values beyond the terabyte range are
    still reported in TB.
    """
    *smaller_units, largest_unit = ("B", "KB", "MB", "GB", "TB")
    amount = float(num_bytes)
    for unit in smaller_units:
        if amount < 1024.0:
            return f"{amount:.1f} {unit}"
        amount /= 1024.0
    return f"{amount:.1f} {largest_unit}"
def render_readme(stats: PipelineStats, config: PipelineConfig) -> str:
    """Render a data-only dataset card with accurate split counts and schema.

    The card is a YAML front-matter block followed by a Markdown body. The
    front matter is emitted with explicit nested indentation so it parses as
    valid YAML.

    Args:
        stats: Supplies the per-split counts and the Parquet footprint.
        config: Supplies the source/target repo ids and the debug limits.

    Returns:
        The complete README text, ending with a newline.
    """
    counts = stats.split_counts
    split_names = ("train", "validation", "test")

    limit_note = ""
    if config.max_zips or config.max_images_per_zip:
        limit_note = (
            "This card reflects a debug smoke-test run because one or more "
            "CHEXVISION_MAX_* limits were enabled."
        )

    # Indentation inside these strings is significant: it encodes the YAML tree.
    front_lines = [
        "---",
        "dataset_info:",
        "  features:",
        "    - name: image",
        "      dtype: image",
        "    - name: labels",
        "      dtype: string",
        "    - name: filename",
        "      dtype: string",
        "  splits:",
    ]
    for split in split_names:
        front_lines.append(f"    - name: {split}")
        front_lines.append(f"      num_examples: {counts[split]}")
    front_lines.extend(
        [
            "configs:",
            "  - config_name: default",
            "    data_files:",
        ]
    )
    for split in split_names:
        front_lines.append(f"      - split: {split}")
        front_lines.append(f"        path: data/{split}-*.parquet")
    front_lines.extend(
        [
            "task_categories:",
            "  - image-classification",
            "task_ids:",
            "  - multi-label-image-classification",
            "language:",
            "  - en",
            "tags:",
            "  - medical",
            "  - chest-xray",
            "  - radiology",
            "  - deep-learning",
            "  - big-data",
            "  - parquet",
            "  - data-only",
            "license: unknown",
            "size_categories:",
            "  - 100K<n<1M",
            "pretty_name: NIH Chest X-ray14 (320x320, Processed for CheXVision)",
            "---",
        ]
    )
    front_matter = "\n".join(front_lines)

    body_lines = [
        "# NIH Chest X-ray14 - 320x320 Processed for CheXVision",
        "",
        "This dataset repackages the raw NIH Chest X-ray14 source dataset from",
        f"[{config.source_repo}](https://huggingface.co/datasets/{config.source_repo})",
        "into a data-only Parquet dataset for the CheXVision project.",
        "",
        "## Dataset Summary",
        "",
        "- Source format: 12 ZIP archives of original chest X-ray images plus CSV manifests",
        "- Output format: data-only Parquet shards under `data/`",
        "- Resolution: 320x320 RGB",
        "- Columns: `image`, `labels`, `filename`",
        "- Split contract: `train`, `validation`, `test`",
        (
            "- Approximate local Parquet size produced in this run: "
            f"`{format_bytes(stats.parquet_bytes)}`"
        ),
    ]
    # The debug note, when present, sits between the summary and the splits.
    if limit_note:
        body_lines.extend(["", limit_note, ""])
    else:
        body_lines.append("")
    body_lines.extend(
        [
            "## Splits",
            "",
            "| Split | Images |",
            "|-------|-------:|",
            f"| Train | {counts['train']:,} |",
            f"| Validation | {counts['validation']:,} |",
            f"| Test | {counts['test']:,} |",
            "",
            "## Schema",
            "",
            "- `image`: 320x320 RGB image payload",
            "- `labels`: pipe-delimited pathology labels, or `No Finding`",
            "- `filename`: original NIH image filename",
            "",
            "## Processing Notes",
            "",
            "- Source split manifests come from `train_val_list.txt` and `test_list.txt`",
            "- Validation membership uses the same stable hash-bucket logic as the live",
            " `arudaev/chest-xray-14` dataset",
            "- Hidden `__MACOSX` ZIP entries and non-image members are ignored",
            "- Truncated-but-readable images are kept; truly unreadable files are skipped",
            "- This repo intentionally ships no `load_dataset.py` script so it remains a",
            " data-only dataset that works with the modern HF dataset viewer",
            "",
            "## Usage",
            "",
            "```python",
            "from datasets import load_dataset",
            "",
            f'dataset = load_dataset("{config.target_repo}")',
            "print(dataset)",
            "```",
            "",
            "## Provenance",
            "",
            "Built by the Kaggle kernel `hlexnc/chexvision-resize-320` for the",
            "[CheXVision](https://github.com/arudaev/chexvision) project.",
        ]
    )
    body = "\n".join(body_lines)
    return f"{front_matter}\n\n{body}\n"
def write_or_upload_readme(api, config: PipelineConfig, readme_text: str) -> None:
    """Store the dataset card where the run's mode dictates.

    With ``config.skip_upload`` set, the text is written to
    ``config.work_dir / "README.md"``. Otherwise it is uploaded as
    ``README.md`` to the target dataset repo through *api*.
    """
    if not config.skip_upload:
        payload = io.BytesIO(readme_text.encode("utf-8"))
        api.upload_file(
            path_or_fileobj=payload,
            path_in_repo="README.md",
            repo_id=config.target_repo,
            repo_type="dataset",
            commit_message="Update dataset card for 320x320 data-only parquet release",
        )
        return

    destination = config.work_dir / "README.md"
    destination.write_text(readme_text, encoding="utf-8")
    print(f"[resize] Wrote local README to {destination}")
def validate_final_counts(stats: PipelineStats, config: PipelineConfig) -> None:
    """Require a full run to reproduce ``EXPECTED_SPLIT_COUNTS`` exactly.

    The check is skipped (with a log line) whenever a debug limit is active.

    Raises:
        RuntimeError: If the accumulated split counts differ from the
            expected ones.
    """
    debug_run = bool(config.max_zips or config.max_images_per_zip)
    if debug_run:
        print("[resize] Debug limits enabled; skipping final full-dataset count check.")
        return
    if stats.split_counts == EXPECTED_SPLIT_COUNTS:
        return
    raise RuntimeError(
        "Final split counts do not match the expected live 224 contract. "
        f"Expected {EXPECTED_SPLIT_COUNTS}, got {stats.split_counts}."
    )
def run_resize_320_pipeline(config: PipelineConfig | None = None) -> PipelineStats:
    """Run the whole pipeline: metadata, per-archive processing, dataset card.

    Args:
        config: Explicit configuration; when omitted it is built from the
            environment via ``build_config``.

    Returns:
        The counters and diagnostics accumulated during the run.

    Raises:
        RuntimeError: If no Hub token could be resolved, or if one of the
            split-count checks fails.
    """
    from huggingface_hub import HfApi

    if not config:
        config = build_config()

    hf_token = configure_hf_runtime(required_token=True, check_dns=True)
    if hf_token is None:
        raise RuntimeError("HF token resolution failed unexpectedly.")

    print(
        "[resize] Config: "
        f"target_repo={config.target_repo}, "
        f"target_size={config.target_size}, "
        f"max_zips={config.max_zips}, "
        f"max_images_per_zip={config.max_images_per_zip}, "
        f"skip_upload={config.skip_upload}"
    )

    print("[resize] Downloading source metadata ...")
    label_map, train_val_files, test_files = download_source_metadata(config, hf_token)
    manifest_counts = verify_split_contract(train_val_files, test_files)
    print(
        "[resize] Verified split contract: "
        f"train={manifest_counts['train']:,}, "
        f"validation={manifest_counts['validation']:,}, "
        f"test={manifest_counts['test']:,}"
    )
    print(f"[resize] Loaded {len(label_map):,} label rows.")

    # The metadata is already in memory, so resetting the work dir here also
    # discards the downloaded metadata files.
    prepare_local_work_dir(config)

    if config.skip_upload:
        api = None
        print("[resize] CHEXVISION_SKIP_UPLOAD=1 - local smoke-test mode enabled.")
    else:
        api = HfApi(token=hf_token)
        ensure_clean_target_repo(api, config)

    stats = PipelineStats()
    # A debug cap can only shrink the archive count, never extend it.
    zips_to_process = min(config.max_zips or config.num_zips, config.num_zips)
    shared_args = {
        "config": config,
        "token": hf_token,
        "label_map": label_map,
        "train_val_files": train_val_files,
        "test_files": test_files,
        "stats": stats,
        "api": api,
    }
    for zip_index in range(1, zips_to_process + 1):
        process_zip(zip_index=zip_index, **shared_args)

    validate_final_counts(stats, config)
    write_or_upload_readme(api, config, render_readme(stats, config))

    print("[resize] Run complete.")
    print(
        "[resize] Final counts: "
        f"train={stats.split_counts['train']:,}, "
        f"validation={stats.split_counts['validation']:,}, "
        f"test={stats.split_counts['test']:,}"
    )
    summary_lines = (
        f"[resize] Images kept: {stats.total_images:,}",
        f"[resize] Unreadable images skipped: {stats.unreadable_total:,}",
        f"[resize] Unknown split entries skipped: {stats.unknown_total:,}",
        f"[resize] Local parquet footprint: {format_bytes(stats.parquet_bytes)}",
    )
    for line in summary_lines:
        print(line)
    if not config.skip_upload:
        print(f"[resize] Dataset repo: https://huggingface.co/datasets/{config.target_repo}")

    if stats.unreadable_samples:
        print("[resize] Sample unreadable entries:")
        for zip_name, member, reason in stats.unreadable_samples:
            print(f"[resize] {zip_name}:{member} -> {reason}")

    if stats.unknown_samples:
        print("[resize] Sample unknown split entries:")
        for sample in stats.unknown_samples:
            print(f"[resize] {sample}")

    return stats
def main() -> None:
    """Run the pipeline once with environment-derived configuration.

    No config is passed, so the pipeline builds one from environment
    variables; the returned statistics are discarded.
    """
    run_resize_320_pipeline()


# Only start the pipeline when this file is executed as a script.
if __name__ == "__main__":
    main()