convitom Claude Opus 4.7 committed on
Commit ·
0a99045
1
Parent(s): 8b5bef1
fix(scripts): carry non-image files through resize + tar shards
Browse files
The IMG_EXTS filter silently dropped reports (.txt), CheXpert labels
(.csv), and any other metadata interleaved in the source tree, so the
shipped MIMIC-CXR_resized would lose them. Add a parallel _copy_one
worker that copies non-image files verbatim, preserving the exact tree.
Manifest now records both counts. Applies to both build_resized_dataset.py
and resize_and_shard.ipynb (cell c06).
This makes the resized release a faithful mirror of MIMIC-CXR_processed
regardless of what's mixed in -- needed if the instruct JSON builder is
ever run on the training box (auto-build path reads .txt + chexpert.csv).
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
scripts/build_resized_dataset.py
CHANGED
|
@@ -123,27 +123,53 @@ def _resize_one(args) -> tuple[str, str]:
|
|
| 123 |
return f"error:{type(e).__name__}: {e}", rel
|
| 124 |
|
| 125 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
def resize_tree(src: Path, dst: Path, target: int, quality: int,
|
| 127 |
workers: int, square: bool) -> None:
|
| 128 |
print(f"[resize] scanning {src} ...")
|
| 129 |
-
|
| 130 |
for root, _, files in os.walk(src):
|
| 131 |
for fn in files:
|
|
|
|
|
|
|
|
|
|
| 132 |
if fn.lower().endswith(IMG_EXTS):
|
| 133 |
-
sp
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
|
|
|
|
|
|
| 138 |
mode = f"square {target}x{target}" if square else f"shortest-edge {target}px"
|
| 139 |
-
print(f"[resize] {len(
|
| 140 |
-
f"({mode}, q{quality}, {workers} workers)")
|
| 141 |
|
| 142 |
-
counts = {"resized": 0, "squared": 0, "copied": 0,
|
|
|
|
| 143 |
errors: list[str] = []
|
| 144 |
with ProcessPoolExecutor(max_workers=workers) as ex:
|
| 145 |
-
futs = [ex.submit(_resize_one, j) for j in
|
| 146 |
-
|
|
|
|
| 147 |
status, rel = f.result()
|
| 148 |
if status.startswith("error:"):
|
| 149 |
counts["error"] += 1
|
|
@@ -152,13 +178,15 @@ def resize_tree(src: Path, dst: Path, target: int, quality: int,
|
|
| 152 |
counts[status] += 1
|
| 153 |
|
| 154 |
dst.mkdir(parents=True, exist_ok=True)
|
|
|
|
| 155 |
out_bytes = sum(p.stat().st_size for p in dst.rglob("*") if p.is_file())
|
| 156 |
(dst / "_manifest.json").write_text(json.dumps({
|
| 157 |
"source": str(src), "target": target,
|
| 158 |
"mode": "square" if square else "shortest_edge",
|
| 159 |
"jpeg_quality": quality, "subsampling": "4:4:4",
|
| 160 |
"resampling": "BICUBIC" if square else "LANCZOS",
|
| 161 |
-
"counts": counts, "total":
|
|
|
|
| 162 |
"output_bytes": out_bytes,
|
| 163 |
"built_at": time.strftime("%Y-%m-%dT%H:%M:%S"),
|
| 164 |
}, indent=2), encoding="utf-8")
|
|
@@ -167,7 +195,7 @@ def resize_tree(src: Path, dst: Path, target: int, quality: int,
|
|
| 167 |
print(f"[resize] WARNING: {len(errors)} failures -> {dst/'_errors.txt'}")
|
| 168 |
print(f"[resize] done: {counts}")
|
| 169 |
print(f"[resize] output size: {out_bytes / 1024**3:.2f} GB "
|
| 170 |
-
f"({out_bytes / max(1, len(
|
| 171 |
|
| 172 |
|
| 173 |
# -- Phase 2: pack into tar shards -------------------------------------------
|
|
|
|
| 123 |
return f"error:{type(e).__name__}: {e}", rel
|
| 124 |
|
| 125 |
|
| 126 |
+
def _copy_one(args: tuple[str, str, str]) -> tuple[str, str]:
    """Worker: copy a non-image file verbatim, preserving the tree.

    Used for reports (.txt), CheXpert labels (.csv), metadata (.json) and
    anything else interleaved in the source tree -- so the tar shards carry
    a complete copy of MIMIC-CXR_processed, not just images.

    Args:
        args: ``(src_path, dst_path, rel)`` packed into a single tuple so a
            job can be submitted to the executor as one argument. ``rel`` is
            not used for the copy itself; it is echoed back so the caller
            can attribute the status to a file.

    Returns:
        ``(status, rel)`` where ``status`` is ``"copied_other"`` (file was
        written), ``"skipped"`` (an identical-size copy already exists) or
        ``"error:<ExceptionName>: <message>"``. Copy failures are reported
        through the status instead of being raised.
    """
    src_path, dst_path, rel = args
    try:
        src_size = Path(src_path).stat().st_size
        dst_path = Path(dst_path)
        # Resumable: skip only when the destination already has the same
        # size as the source. A bare "exists and non-empty" check would
        # permanently keep a truncated file left by an interrupted run, and
        # would re-copy legitimately empty source files on every run.
        if dst_path.exists() and dst_path.stat().st_size == src_size:
            return "skipped", rel
        dst_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src_path, dst_path)
        return "copied_other", rel
    except Exception as e:
        # Worker boundary: one unreadable file must not abort the whole
        # run, so the failure is turned into a status string.
        return f"error:{type(e).__name__}: {e}", rel
|
| 143 |
+
|
| 144 |
+
|
| 145 |
def resize_tree(src: Path, dst: Path, target: int, quality: int,
|
| 146 |
workers: int, square: bool) -> None:
|
| 147 |
print(f"[resize] scanning {src} ...")
|
| 148 |
+
img_jobs, other_jobs = [], []
|
| 149 |
for root, _, files in os.walk(src):
|
| 150 |
for fn in files:
|
| 151 |
+
sp = Path(root) / fn
|
| 152 |
+
rel = sp.relative_to(src)
|
| 153 |
+
dp = dst / rel
|
| 154 |
if fn.lower().endswith(IMG_EXTS):
|
| 155 |
+
img_jobs.append((str(sp), str(dp), str(rel), target, quality, square))
|
| 156 |
+
else:
|
| 157 |
+
# non-image: reports/csv/json/etc. copied verbatim so the
|
| 158 |
+
# shipped tree mirrors the source exactly (no data loss).
|
| 159 |
+
other_jobs.append((str(sp), str(dp), str(rel)))
|
| 160 |
+
if not img_jobs and not other_jobs:
|
| 161 |
+
sys.exit(f"ERROR: nothing found under {src}")
|
| 162 |
mode = f"square {target}x{target}" if square else f"shortest-edge {target}px"
|
| 163 |
+
print(f"[resize] {len(img_jobs):,} images + {len(other_jobs):,} non-image "
|
| 164 |
+
f"-> {dst} ({mode}, q{quality}, {workers} workers)")
|
| 165 |
|
| 166 |
+
counts = {"resized": 0, "squared": 0, "copied": 0,
|
| 167 |
+
"copied_other": 0, "skipped": 0, "error": 0}
|
| 168 |
errors: list[str] = []
|
| 169 |
with ProcessPoolExecutor(max_workers=workers) as ex:
|
| 170 |
+
futs = [ex.submit(_resize_one, j) for j in img_jobs]
|
| 171 |
+
futs += [ex.submit(_copy_one, j) for j in other_jobs]
|
| 172 |
+
for f in tqdm(as_completed(futs), total=len(futs), unit="file"):
|
| 173 |
status, rel = f.result()
|
| 174 |
if status.startswith("error:"):
|
| 175 |
counts["error"] += 1
|
|
|
|
| 178 |
counts[status] += 1
|
| 179 |
|
| 180 |
dst.mkdir(parents=True, exist_ok=True)
|
| 181 |
+
total = len(img_jobs) + len(other_jobs)
|
| 182 |
out_bytes = sum(p.stat().st_size for p in dst.rglob("*") if p.is_file())
|
| 183 |
(dst / "_manifest.json").write_text(json.dumps({
|
| 184 |
"source": str(src), "target": target,
|
| 185 |
"mode": "square" if square else "shortest_edge",
|
| 186 |
"jpeg_quality": quality, "subsampling": "4:4:4",
|
| 187 |
"resampling": "BICUBIC" if square else "LANCZOS",
|
| 188 |
+
"counts": counts, "total": total,
|
| 189 |
+
"images": len(img_jobs), "non_image": len(other_jobs),
|
| 190 |
"output_bytes": out_bytes,
|
| 191 |
"built_at": time.strftime("%Y-%m-%dT%H:%M:%S"),
|
| 192 |
}, indent=2), encoding="utf-8")
|
|
|
|
| 195 |
print(f"[resize] WARNING: {len(errors)} failures -> {dst/'_errors.txt'}")
|
| 196 |
print(f"[resize] done: {counts}")
|
| 197 |
print(f"[resize] output size: {out_bytes / 1024**3:.2f} GB "
|
| 198 |
+
f"({out_bytes / max(1, len(img_jobs)) / 1024:.0f} KB/image avg)")
|
| 199 |
|
| 200 |
|
| 201 |
# -- Phase 2: pack into tar shards -------------------------------------------
|
scripts/resize_and_shard.ipynb
CHANGED
|
@@ -46,7 +46,7 @@
|
|
| 46 |
"execution_count": null,
|
| 47 |
"metadata": {},
|
| 48 |
"outputs": [],
|
| 49 |
-
"source": "import os, json, shutil, tarfile, time\nfrom pathlib import Path\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom PIL import Image\nfrom tqdm.auto import tqdm\n\nImage.MAX_IMAGE_PIXELS = None # don't abort on large medical images\nIMG_EXTS = (\".jpg\", \".jpeg\", \".png\")\n\n\ndef _resize_one(src_path, dst_path, target, quality, square):\n \"\"\"Returns one of: resized | squared | copied | skipped | error:<msg>.\"\"\"\n try:\n dst_path = Path(dst_path)\n if dst_path.exists() and dst_path.stat().st_size > 0:\n return \"skipped\" # resumable\n dst_path.parent.mkdir(parents=True, exist_ok=True)\n with Image.open(src_path) as im:\n w, h = im.size\n shorter = min(w, h)\n # Non-square: if shorter side already <= target, downscaling would\n # push it below 518 -> copy verbatim (lossless, never worsens a\n # low-res source). Square mode must always emit exactly target^2.\n if not square and shorter <= target:\n shutil.copy2(src_path, dst_path)\n return \"copied\"\n if im.mode not in (\"L\", \"RGB\"):\n im = im.convert(\"RGB\")\n # shorter axis EXACTLY = target; longer scales proportionally\n if w <= h:\n new_size = (target, round(h * target / w))\n else:\n new_size = (round(w * target / h), target)\n # square mode reproduces the processor exactly -> bicubic\n im = im.resize(new_size, Image.BICUBIC if square else Image.LANCZOS)\n if square:\n W, H = im.size\n left, top = (W - target) // 2, (H - target) // 2\n im = im.crop((left, top, left + target, top + target))\n im.save(dst_path, \"JPEG\", quality=quality, optimize=True, subsampling=0)\n return \"squared\" if square else \"resized\"\n except Exception as e:\n return f\"error:{type(e).__name__}: {e}\"\n\n\ndef resize_tree(src: Path, dst: Path, target, quality, workers, square):\n print(f\"[resize] scanning {src} ...\")\n
|
| 50 |
},
|
| 51 |
{
|
| 52 |
"cell_type": "markdown",
|
|
@@ -137,4 +137,4 @@
|
|
| 137 |
},
|
| 138 |
"nbformat": 4,
|
| 139 |
"nbformat_minor": 5
|
| 140 |
-
}
|
|
|
|
| 46 |
"execution_count": null,
|
| 47 |
"metadata": {},
|
| 48 |
"outputs": [],
|
| 49 |
+
"source": "import os, json, shutil, tarfile, time\nfrom pathlib import Path\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom PIL import Image\nfrom tqdm.auto import tqdm\n\nImage.MAX_IMAGE_PIXELS = None # don't abort on large medical images\nIMG_EXTS = (\".jpg\", \".jpeg\", \".png\")\n\n\ndef _resize_one(src_path, dst_path, target, quality, square):\n \"\"\"Returns one of: resized | squared | copied | skipped | error:<msg>.\"\"\"\n try:\n dst_path = Path(dst_path)\n if dst_path.exists() and dst_path.stat().st_size > 0:\n return \"skipped\" # resumable\n dst_path.parent.mkdir(parents=True, exist_ok=True)\n with Image.open(src_path) as im:\n w, h = im.size\n shorter = min(w, h)\n # Non-square: if shorter side already <= target, downscaling would\n # push it below 518 -> copy verbatim (lossless, never worsens a\n # low-res source). Square mode must always emit exactly target^2.\n if not square and shorter <= target:\n shutil.copy2(src_path, dst_path)\n return \"copied\"\n if im.mode not in (\"L\", \"RGB\"):\n im = im.convert(\"RGB\")\n # shorter axis EXACTLY = target; longer scales proportionally\n if w <= h:\n new_size = (target, round(h * target / w))\n else:\n new_size = (round(w * target / h), target)\n # square mode reproduces the processor exactly -> bicubic\n im = im.resize(new_size, Image.BICUBIC if square else Image.LANCZOS)\n if square:\n W, H = im.size\n left, top = (W - target) // 2, (H - target) // 2\n im = im.crop((left, top, left + target, top + target))\n im.save(dst_path, \"JPEG\", quality=quality, optimize=True, subsampling=0)\n return \"squared\" if square else \"resized\"\n except Exception as e:\n return f\"error:{type(e).__name__}: {e}\"\n\n\ndef _copy_one(src_path, dst_path):\n \"\"\"Copy non-image files (reports .txt, chexpert .csv, metadata .json, ...)\n verbatim so the shipped tree mirrors MIMIC-CXR_processed exactly.\"\"\"\n try:\n dst_path = Path(dst_path)\n if dst_path.exists() and dst_path.stat().st_size > 0:\n 
return \"skipped\"\n dst_path.parent.mkdir(parents=True, exist_ok=True)\n shutil.copy2(src_path, dst_path)\n return \"copied_other\"\n except Exception as e:\n return f\"error:{type(e).__name__}: {e}\"\n\n\ndef resize_tree(src: Path, dst: Path, target, quality, workers, square):\n print(f\"[resize] scanning {src} ...\")\n img_jobs, other_jobs = [], []\n for root, _, files in os.walk(src):\n for fn in files:\n sp = Path(root) / fn\n rel = sp.relative_to(src)\n dp = dst / rel\n if fn.lower().endswith(IMG_EXTS):\n img_jobs.append((str(sp), str(dp)))\n else:\n other_jobs.append((str(sp), str(dp)))\n if not img_jobs and not other_jobs:\n raise SystemExit(f\"ERROR: nothing found under {src}\")\n mode = f\"square {target}x{target}\" if square else f\"shortest-edge {target}px\"\n print(f\"[resize] {len(img_jobs):,} images + {len(other_jobs):,} non-image \"\n f\"-> {dst} ({mode}, q{quality}, {workers} threads)\")\n\n counts = {\"resized\": 0, \"squared\": 0, \"copied\": 0,\n \"copied_other\": 0, \"skipped\": 0, \"error\": 0}\n errors = []\n with ThreadPoolExecutor(max_workers=workers) as ex:\n futs = {}\n for s, d in img_jobs:\n futs[ex.submit(_resize_one, s, d, target, quality, square)] = d\n for s, d in other_jobs:\n futs[ex.submit(_copy_one, s, d)] = d\n for f in tqdm(as_completed(futs), total=len(futs), unit=\"file\"):\n st = f.result()\n if st.startswith(\"error:\"):\n counts[\"error\"] += 1\n errors.append(f\"{futs[f]}\\t{st}\")\n else:\n counts[st] += 1\n\n dst.mkdir(parents=True, exist_ok=True)\n total = len(img_jobs) + len(other_jobs)\n out_bytes = sum(p.stat().st_size for p in dst.rglob(\"*\") if p.is_file())\n (dst / \"_manifest.json\").write_text(json.dumps({\n \"source\": str(src), \"target\": target,\n \"mode\": \"square\" if square else \"shortest_edge\",\n \"jpeg_quality\": quality, \"subsampling\": \"4:4:4\",\n \"resampling\": \"BICUBIC\" if square else \"LANCZOS\",\n \"counts\": counts, \"total\": total,\n \"images\": len(img_jobs), \"non_image\": 
len(other_jobs),\n \"output_bytes\": out_bytes,\n \"built_at\": time.strftime(\"%Y-%m-%dT%H:%M:%S\"),\n }, indent=2), encoding=\"utf-8\")\n if errors:\n (dst / \"_errors.txt\").write_text(\"\\n\".join(errors), encoding=\"utf-8\")\n print(f\"[resize] WARNING: {len(errors)} failures -> {dst/'_errors.txt'}\")\n print(f\"[resize] done: {counts}\")\n print(f\"[resize] output size: {out_bytes/1024**3:.2f} GB \"\n f\"({out_bytes/max(1,len(img_jobs))/1024:.0f} KB/image avg)\")\n\n\ndef pack_shards(dst: Path, shards_dir: Path, shard_gb, prefix=\"cxr\"):\n shard_bytes = int(shard_gb * 1024**3)\n shards_dir.mkdir(parents=True, exist_ok=True)\n files = sorted(p for p in dst.rglob(\"*\")\n if p.is_file() and p.name not in (\"_manifest.json\", \"_errors.txt\"))\n if not files:\n raise SystemExit(f\"ERROR: nothing to pack under {dst}\")\n print(f\"[pack] {len(files):,} files -> tar shards (~{shard_gb} GB each)\")\n written, idx, cur = [], 0, 0\n\n def _open(i):\n path = shards_dir / f\"{prefix}-{i:04d}.tar\"\n written.append(path)\n return tarfile.open(path, \"w\")\n\n tar = _open(0)\n for fp in tqdm(files, unit=\"file\"):\n if cur >= shard_bytes:\n tar.close(); idx += 1; tar = _open(idx); cur = 0\n tar.add(fp, arcname=str(fp.relative_to(dst))) # rel path -> tree rebuilt on extract\n cur += fp.stat().st_size\n tar.close()\n man = dst / \"_manifest.json\"\n if man.exists():\n shutil.copy2(man, shards_dir / \"_manifest.json\")\n (shards_dir / \"SHARDS.txt\").write_text(\"\\n\".join(p.name for p in written), encoding=\"utf-8\")\n print(f\"[pack] wrote {len(written)} shards -> {shards_dir}\")\n return written\n\nprint(\"functions ready\")"
|
| 50 |
},
|
| 51 |
{
|
| 52 |
"cell_type": "markdown",
|
|
|
|
| 137 |
},
|
| 138 |
"nbformat": 4,
|
| 139 |
"nbformat_minor": 5
|
| 140 |
+
}
|