convitom Claude Opus 4.7 committed on
Commit
0a99045
·
1 Parent(s): 8b5bef1

fix(scripts): carry non-image files through resize + tar shards

Browse files

The IMG_EXTS filter silently dropped reports (.txt), CheXpert labels
(.csv), and any other metadata interleaved in the source tree, so the
shipped MIMIC-CXR_resized would lose them. Add a parallel _copy_one
worker that copies non-image files verbatim, preserving the exact tree.
The manifest now records both the image and non-image file counts. Applies to
both build_resized_dataset.py and resize_and_shard.ipynb (cell c06).

This makes the resized release a faithful mirror of MIMIC-CXR_processed
regardless of what's mixed in -- needed if the instruct JSON builder is
ever run on the training box (auto-build path reads .txt + chexpert.csv).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

scripts/build_resized_dataset.py CHANGED
@@ -123,27 +123,53 @@ def _resize_one(args) -> tuple[str, str]:
123
  return f"error:{type(e).__name__}: {e}", rel
124
 
125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  def resize_tree(src: Path, dst: Path, target: int, quality: int,
127
  workers: int, square: bool) -> None:
128
  print(f"[resize] scanning {src} ...")
129
- jobs = []
130
  for root, _, files in os.walk(src):
131
  for fn in files:
 
 
 
132
  if fn.lower().endswith(IMG_EXTS):
133
- sp = Path(root) / fn
134
- rel = sp.relative_to(src)
135
- jobs.append((str(sp), str(dst / rel), str(rel), target, quality, square))
136
- if not jobs:
137
- sys.exit(f"ERROR: no images ({IMG_EXTS}) found under {src}")
 
 
138
  mode = f"square {target}x{target}" if square else f"shortest-edge {target}px"
139
- print(f"[resize] {len(jobs):,} images -> {dst} "
140
- f"({mode}, q{quality}, {workers} workers)")
141
 
142
- counts = {"resized": 0, "squared": 0, "copied": 0, "skipped": 0, "error": 0}
 
143
  errors: list[str] = []
144
  with ProcessPoolExecutor(max_workers=workers) as ex:
145
- futs = [ex.submit(_resize_one, j) for j in jobs]
146
- for f in tqdm(as_completed(futs), total=len(futs), unit="img"):
 
147
  status, rel = f.result()
148
  if status.startswith("error:"):
149
  counts["error"] += 1
@@ -152,13 +178,15 @@ def resize_tree(src: Path, dst: Path, target: int, quality: int,
152
  counts[status] += 1
153
 
154
  dst.mkdir(parents=True, exist_ok=True)
 
155
  out_bytes = sum(p.stat().st_size for p in dst.rglob("*") if p.is_file())
156
  (dst / "_manifest.json").write_text(json.dumps({
157
  "source": str(src), "target": target,
158
  "mode": "square" if square else "shortest_edge",
159
  "jpeg_quality": quality, "subsampling": "4:4:4",
160
  "resampling": "BICUBIC" if square else "LANCZOS",
161
- "counts": counts, "total": len(jobs),
 
162
  "output_bytes": out_bytes,
163
  "built_at": time.strftime("%Y-%m-%dT%H:%M:%S"),
164
  }, indent=2), encoding="utf-8")
@@ -167,7 +195,7 @@ def resize_tree(src: Path, dst: Path, target: int, quality: int,
167
  print(f"[resize] WARNING: {len(errors)} failures -> {dst/'_errors.txt'}")
168
  print(f"[resize] done: {counts}")
169
  print(f"[resize] output size: {out_bytes / 1024**3:.2f} GB "
170
- f"({out_bytes / max(1, len(jobs)) / 1024:.0f} KB/image avg)")
171
 
172
 
173
  # -- Phase 2: pack into tar shards -------------------------------------------
 
123
  return f"error:{type(e).__name__}: {e}", rel
124
 
125
 
126
+ def _copy_one(args) -> tuple[str, str]:
127
+ """Worker: copy a non-image file verbatim, preserving the tree.
128
+
129
+ Used for reports (.txt), CheXpert labels (.csv), metadata (.json) and
130
+ anything else interleaved in the source tree -- so the tar shards carry
131
+ a complete copy of MIMIC-CXR_processed, not just images.
132
+ """
133
+ src_path, dst_path, rel = args
134
+ try:
135
+ dst_path = Path(dst_path)
136
+ if dst_path.exists() and dst_path.stat().st_size > 0:
137
+ return "skipped", rel
138
+ dst_path.parent.mkdir(parents=True, exist_ok=True)
139
+ shutil.copy2(src_path, dst_path)
140
+ return "copied_other", rel
141
+ except Exception as e:
142
+ return f"error:{type(e).__name__}: {e}", rel
143
+
144
+
145
  def resize_tree(src: Path, dst: Path, target: int, quality: int,
146
  workers: int, square: bool) -> None:
147
  print(f"[resize] scanning {src} ...")
148
+ img_jobs, other_jobs = [], []
149
  for root, _, files in os.walk(src):
150
  for fn in files:
151
+ sp = Path(root) / fn
152
+ rel = sp.relative_to(src)
153
+ dp = dst / rel
154
  if fn.lower().endswith(IMG_EXTS):
155
+ img_jobs.append((str(sp), str(dp), str(rel), target, quality, square))
156
+ else:
157
+ # non-image: reports/csv/json/etc. copied verbatim so the
158
+ # shipped tree mirrors the source exactly (no data loss).
159
+ other_jobs.append((str(sp), str(dp), str(rel)))
160
+ if not img_jobs and not other_jobs:
161
+ sys.exit(f"ERROR: nothing found under {src}")
162
  mode = f"square {target}x{target}" if square else f"shortest-edge {target}px"
163
+ print(f"[resize] {len(img_jobs):,} images + {len(other_jobs):,} non-image "
164
+ f"-> {dst} ({mode}, q{quality}, {workers} workers)")
165
 
166
+ counts = {"resized": 0, "squared": 0, "copied": 0,
167
+ "copied_other": 0, "skipped": 0, "error": 0}
168
  errors: list[str] = []
169
  with ProcessPoolExecutor(max_workers=workers) as ex:
170
+ futs = [ex.submit(_resize_one, j) for j in img_jobs]
171
+ futs += [ex.submit(_copy_one, j) for j in other_jobs]
172
+ for f in tqdm(as_completed(futs), total=len(futs), unit="file"):
173
  status, rel = f.result()
174
  if status.startswith("error:"):
175
  counts["error"] += 1
 
178
  counts[status] += 1
179
 
180
  dst.mkdir(parents=True, exist_ok=True)
181
+ total = len(img_jobs) + len(other_jobs)
182
  out_bytes = sum(p.stat().st_size for p in dst.rglob("*") if p.is_file())
183
  (dst / "_manifest.json").write_text(json.dumps({
184
  "source": str(src), "target": target,
185
  "mode": "square" if square else "shortest_edge",
186
  "jpeg_quality": quality, "subsampling": "4:4:4",
187
  "resampling": "BICUBIC" if square else "LANCZOS",
188
+ "counts": counts, "total": total,
189
+ "images": len(img_jobs), "non_image": len(other_jobs),
190
  "output_bytes": out_bytes,
191
  "built_at": time.strftime("%Y-%m-%dT%H:%M:%S"),
192
  }, indent=2), encoding="utf-8")
 
195
  print(f"[resize] WARNING: {len(errors)} failures -> {dst/'_errors.txt'}")
196
  print(f"[resize] done: {counts}")
197
  print(f"[resize] output size: {out_bytes / 1024**3:.2f} GB "
198
+ f"({out_bytes / max(1, len(img_jobs)) / 1024:.0f} KB/image avg)")
199
 
200
 
201
  # -- Phase 2: pack into tar shards -------------------------------------------
scripts/resize_and_shard.ipynb CHANGED
@@ -46,7 +46,7 @@
46
  "execution_count": null,
47
  "metadata": {},
48
  "outputs": [],
49
- "source": "import os, json, shutil, tarfile, time\nfrom pathlib import Path\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom PIL import Image\nfrom tqdm.auto import tqdm\n\nImage.MAX_IMAGE_PIXELS = None # don't abort on large medical images\nIMG_EXTS = (\".jpg\", \".jpeg\", \".png\")\n\n\ndef _resize_one(src_path, dst_path, target, quality, square):\n \"\"\"Returns one of: resized | squared | copied | skipped | error:<msg>.\"\"\"\n try:\n dst_path = Path(dst_path)\n if dst_path.exists() and dst_path.stat().st_size > 0:\n return \"skipped\" # resumable\n dst_path.parent.mkdir(parents=True, exist_ok=True)\n with Image.open(src_path) as im:\n w, h = im.size\n shorter = min(w, h)\n # Non-square: if shorter side already <= target, downscaling would\n # push it below 518 -> copy verbatim (lossless, never worsens a\n # low-res source). Square mode must always emit exactly target^2.\n if not square and shorter <= target:\n shutil.copy2(src_path, dst_path)\n return \"copied\"\n if im.mode not in (\"L\", \"RGB\"):\n im = im.convert(\"RGB\")\n # shorter axis EXACTLY = target; longer scales proportionally\n if w <= h:\n new_size = (target, round(h * target / w))\n else:\n new_size = (round(w * target / h), target)\n # square mode reproduces the processor exactly -> bicubic\n im = im.resize(new_size, Image.BICUBIC if square else Image.LANCZOS)\n if square:\n W, H = im.size\n left, top = (W - target) // 2, (H - target) // 2\n im = im.crop((left, top, left + target, top + target))\n im.save(dst_path, \"JPEG\", quality=quality, optimize=True, subsampling=0)\n return \"squared\" if square else \"resized\"\n except Exception as e:\n return f\"error:{type(e).__name__}: {e}\"\n\n\ndef resize_tree(src: Path, dst: Path, target, quality, workers, square):\n print(f\"[resize] scanning {src} ...\")\n jobs = []\n for root, _, files in os.walk(src):\n for fn in files:\n if fn.lower().endswith(IMG_EXTS):\n sp = Path(root) / fn\n rel = sp.relative_to(src)\n 
jobs.append((str(sp), str(dst / rel)))\n if not jobs:\n raise SystemExit(f\"ERROR: no images under {src}\")\n mode = f\"square {target}x{target}\" if square else f\"shortest-edge {target}px\"\n print(f\"[resize] {len(jobs):,} images -> {dst} ({mode}, q{quality}, {workers} threads)\")\n\n counts = {\"resized\": 0, \"squared\": 0, \"copied\": 0, \"skipped\": 0, \"error\": 0}\n errors = []\n with ThreadPoolExecutor(max_workers=workers) as ex:\n futs = {ex.submit(_resize_one, s, d, target, quality, square): d\n for s, d in jobs}\n for f in tqdm(as_completed(futs), total=len(futs), unit=\"img\"):\n st = f.result()\n if st.startswith(\"error:\"):\n counts[\"error\"] += 1\n errors.append(f\"{futs[f]}\\t{st}\")\n else:\n counts[st] += 1\n\n dst.mkdir(parents=True, exist_ok=True)\n out_bytes = sum(p.stat().st_size for p in dst.rglob(\"*\") if p.is_file())\n (dst / \"_manifest.json\").write_text(json.dumps({\n \"source\": str(src), \"target\": target,\n \"mode\": \"square\" if square else \"shortest_edge\",\n \"jpeg_quality\": quality, \"subsampling\": \"4:4:4\",\n \"resampling\": \"BICUBIC\" if square else \"LANCZOS\",\n \"counts\": counts, \"total\": len(jobs), \"output_bytes\": out_bytes,\n \"built_at\": time.strftime(\"%Y-%m-%dT%H:%M:%S\"),\n }, indent=2), encoding=\"utf-8\")\n if errors:\n (dst / \"_errors.txt\").write_text(\"\\n\".join(errors), encoding=\"utf-8\")\n print(f\"[resize] WARNING: {len(errors)} failures -> {dst/'_errors.txt'}\")\n print(f\"[resize] done: {counts}\")\n print(f\"[resize] output size: {out_bytes/1024**3:.2f} GB \"\n f\"({out_bytes/max(1,len(jobs))/1024:.0f} KB/image avg)\")\n\n\ndef pack_shards(dst: Path, shards_dir: Path, shard_gb, prefix=\"cxr\"):\n shard_bytes = int(shard_gb * 1024**3)\n shards_dir.mkdir(parents=True, exist_ok=True)\n files = sorted(p for p in dst.rglob(\"*\")\n if p.is_file() and p.name not in (\"_manifest.json\", \"_errors.txt\"))\n if not files:\n raise SystemExit(f\"ERROR: nothing to pack under {dst}\")\n 
print(f\"[pack] {len(files):,} files -> tar shards (~{shard_gb} GB each)\")\n written, idx, cur = [], 0, 0\n\n def _open(i):\n path = shards_dir / f\"{prefix}-{i:04d}.tar\"\n written.append(path)\n return tarfile.open(path, \"w\")\n\n tar = _open(0)\n for fp in tqdm(files, unit=\"file\"):\n if cur >= shard_bytes:\n tar.close(); idx += 1; tar = _open(idx); cur = 0\n tar.add(fp, arcname=str(fp.relative_to(dst))) # rel path -> tree rebuilt on extract\n cur += fp.stat().st_size\n tar.close()\n man = dst / \"_manifest.json\"\n if man.exists():\n shutil.copy2(man, shards_dir / \"_manifest.json\")\n (shards_dir / \"SHARDS.txt\").write_text(\"\\n\".join(p.name for p in written), encoding=\"utf-8\")\n print(f\"[pack] wrote {len(written)} shards -> {shards_dir}\")\n return written\n\nprint(\"functions ready\")"
50
  },
51
  {
52
  "cell_type": "markdown",
@@ -137,4 +137,4 @@
137
  },
138
  "nbformat": 4,
139
  "nbformat_minor": 5
140
- }
 
46
  "execution_count": null,
47
  "metadata": {},
48
  "outputs": [],
49
+ "source": "import os, json, shutil, tarfile, time\nfrom pathlib import Path\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom PIL import Image\nfrom tqdm.auto import tqdm\n\nImage.MAX_IMAGE_PIXELS = None # don't abort on large medical images\nIMG_EXTS = (\".jpg\", \".jpeg\", \".png\")\n\n\ndef _resize_one(src_path, dst_path, target, quality, square):\n \"\"\"Returns one of: resized | squared | copied | skipped | error:<msg>.\"\"\"\n try:\n dst_path = Path(dst_path)\n if dst_path.exists() and dst_path.stat().st_size > 0:\n return \"skipped\" # resumable\n dst_path.parent.mkdir(parents=True, exist_ok=True)\n with Image.open(src_path) as im:\n w, h = im.size\n shorter = min(w, h)\n # Non-square: if shorter side already <= target, downscaling would\n # push it below 518 -> copy verbatim (lossless, never worsens a\n # low-res source). Square mode must always emit exactly target^2.\n if not square and shorter <= target:\n shutil.copy2(src_path, dst_path)\n return \"copied\"\n if im.mode not in (\"L\", \"RGB\"):\n im = im.convert(\"RGB\")\n # shorter axis EXACTLY = target; longer scales proportionally\n if w <= h:\n new_size = (target, round(h * target / w))\n else:\n new_size = (round(w * target / h), target)\n # square mode reproduces the processor exactly -> bicubic\n im = im.resize(new_size, Image.BICUBIC if square else Image.LANCZOS)\n if square:\n W, H = im.size\n left, top = (W - target) // 2, (H - target) // 2\n im = im.crop((left, top, left + target, top + target))\n im.save(dst_path, \"JPEG\", quality=quality, optimize=True, subsampling=0)\n return \"squared\" if square else \"resized\"\n except Exception as e:\n return f\"error:{type(e).__name__}: {e}\"\n\n\ndef _copy_one(src_path, dst_path):\n \"\"\"Copy non-image files (reports .txt, chexpert .csv, metadata .json, ...)\n verbatim so the shipped tree mirrors MIMIC-CXR_processed exactly.\"\"\"\n try:\n dst_path = Path(dst_path)\n if dst_path.exists() and dst_path.stat().st_size > 0:\n 
return \"skipped\"\n dst_path.parent.mkdir(parents=True, exist_ok=True)\n shutil.copy2(src_path, dst_path)\n return \"copied_other\"\n except Exception as e:\n return f\"error:{type(e).__name__}: {e}\"\n\n\ndef resize_tree(src: Path, dst: Path, target, quality, workers, square):\n print(f\"[resize] scanning {src} ...\")\n img_jobs, other_jobs = [], []\n for root, _, files in os.walk(src):\n for fn in files:\n sp = Path(root) / fn\n rel = sp.relative_to(src)\n dp = dst / rel\n if fn.lower().endswith(IMG_EXTS):\n img_jobs.append((str(sp), str(dp)))\n else:\n other_jobs.append((str(sp), str(dp)))\n if not img_jobs and not other_jobs:\n raise SystemExit(f\"ERROR: nothing found under {src}\")\n mode = f\"square {target}x{target}\" if square else f\"shortest-edge {target}px\"\n print(f\"[resize] {len(img_jobs):,} images + {len(other_jobs):,} non-image \"\n f\"-> {dst} ({mode}, q{quality}, {workers} threads)\")\n\n counts = {\"resized\": 0, \"squared\": 0, \"copied\": 0,\n \"copied_other\": 0, \"skipped\": 0, \"error\": 0}\n errors = []\n with ThreadPoolExecutor(max_workers=workers) as ex:\n futs = {}\n for s, d in img_jobs:\n futs[ex.submit(_resize_one, s, d, target, quality, square)] = d\n for s, d in other_jobs:\n futs[ex.submit(_copy_one, s, d)] = d\n for f in tqdm(as_completed(futs), total=len(futs), unit=\"file\"):\n st = f.result()\n if st.startswith(\"error:\"):\n counts[\"error\"] += 1\n errors.append(f\"{futs[f]}\\t{st}\")\n else:\n counts[st] += 1\n\n dst.mkdir(parents=True, exist_ok=True)\n total = len(img_jobs) + len(other_jobs)\n out_bytes = sum(p.stat().st_size for p in dst.rglob(\"*\") if p.is_file())\n (dst / \"_manifest.json\").write_text(json.dumps({\n \"source\": str(src), \"target\": target,\n \"mode\": \"square\" if square else \"shortest_edge\",\n \"jpeg_quality\": quality, \"subsampling\": \"4:4:4\",\n \"resampling\": \"BICUBIC\" if square else \"LANCZOS\",\n \"counts\": counts, \"total\": total,\n \"images\": len(img_jobs), \"non_image\": 
len(other_jobs),\n \"output_bytes\": out_bytes,\n \"built_at\": time.strftime(\"%Y-%m-%dT%H:%M:%S\"),\n }, indent=2), encoding=\"utf-8\")\n if errors:\n (dst / \"_errors.txt\").write_text(\"\\n\".join(errors), encoding=\"utf-8\")\n print(f\"[resize] WARNING: {len(errors)} failures -> {dst/'_errors.txt'}\")\n print(f\"[resize] done: {counts}\")\n print(f\"[resize] output size: {out_bytes/1024**3:.2f} GB \"\n f\"({out_bytes/max(1,len(img_jobs))/1024:.0f} KB/image avg)\")\n\n\ndef pack_shards(dst: Path, shards_dir: Path, shard_gb, prefix=\"cxr\"):\n shard_bytes = int(shard_gb * 1024**3)\n shards_dir.mkdir(parents=True, exist_ok=True)\n files = sorted(p for p in dst.rglob(\"*\")\n if p.is_file() and p.name not in (\"_manifest.json\", \"_errors.txt\"))\n if not files:\n raise SystemExit(f\"ERROR: nothing to pack under {dst}\")\n print(f\"[pack] {len(files):,} files -> tar shards (~{shard_gb} GB each)\")\n written, idx, cur = [], 0, 0\n\n def _open(i):\n path = shards_dir / f\"{prefix}-{i:04d}.tar\"\n written.append(path)\n return tarfile.open(path, \"w\")\n\n tar = _open(0)\n for fp in tqdm(files, unit=\"file\"):\n if cur >= shard_bytes:\n tar.close(); idx += 1; tar = _open(idx); cur = 0\n tar.add(fp, arcname=str(fp.relative_to(dst))) # rel path -> tree rebuilt on extract\n cur += fp.stat().st_size\n tar.close()\n man = dst / \"_manifest.json\"\n if man.exists():\n shutil.copy2(man, shards_dir / \"_manifest.json\")\n (shards_dir / \"SHARDS.txt\").write_text(\"\\n\".join(p.name for p in written), encoding=\"utf-8\")\n print(f\"[pack] wrote {len(written)} shards -> {shards_dir}\")\n return written\n\nprint(\"functions ready\")"
50
  },
51
  {
52
  "cell_type": "markdown",
 
137
  },
138
  "nbformat": 4,
139
  "nbformat_minor": 5
140
+ }