MRiabov commited on
Commit
953508f
·
1 Parent(s): e8ba7db

(devops) automatic pull and preprocess of datasets

Browse files
.windsurf/rules/executing-python-files.md ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ ---
2
+ trigger: always_on
3
+ ---
4
+
5
+ When executing Python files, use `python3` instead of `python`, because that adheres to the project's venv. Additionally, if you haven't activated the venv yet, you must activate it first, or the execution will fail with a module-not-found exception.
gdrive_pull.py DELETED
@@ -1,75 +0,0 @@
1
- import os
2
- import argparse
3
- from pydrive2.auth import GoogleAuth
4
- from pydrive2.drive import GoogleDrive
5
- from tqdm import tqdm
6
- from pathlib import Path
7
-
8
-
9
- def authenticate(service_account_json):
10
- """Authenticate PyDrive2 with a service account."""
11
- gauth = GoogleAuth()
12
- # Configure PyDrive2 to use service account credentials directly
13
- gauth.settings["client_config_backend"] = "service"
14
- gauth.settings["service_config"] = {
15
- "client_json_file_path": service_account_json,
16
- # Provide the key to satisfy PyDrive2 even if not impersonating
17
- "client_user_email": "drive-bot@web-design-396514.iam.gserviceaccount.com",
18
- }
19
- gauth.ServiceAuth()
20
- drive = GoogleDrive(gauth)
21
- return drive
22
-
23
-
24
- def list_files_with_paths(drive, folder_id, prefix=""):
25
- """Recursively collect all files with their relative paths from a folder."""
26
- items = []
27
- query = f"'{folder_id}' in parents and trashed=false"
28
- for file in drive.ListFile({"q": query, "maxResults": 1000}).GetList():
29
- if file["mimeType"] == "application/vnd.google-apps.folder":
30
- sub_prefix = (
31
- os.path.join(prefix, file["title"]) if prefix else file["title"]
32
- )
33
- items += list_files_with_paths(drive, file["id"], sub_prefix)
34
- else:
35
- rel_path = os.path.join(prefix, file["title"]) if prefix else file["title"]
36
- items.append((file, rel_path))
37
- return items
38
-
39
-
40
- def download_folder(folder_id, dest, service_account_json):
41
- drive = authenticate(service_account_json)
42
- os.makedirs(dest, exist_ok=True)
43
-
44
- print(f"Listing files in folder {folder_id}...")
45
- files_with_paths = list_files_with_paths(drive, folder_id)
46
- print(f"Found {len(files_with_paths)} files. Downloading...")
47
-
48
- for file, rel_path in tqdm(files_with_paths, desc="Downloading", unit="file"):
49
- out_path = os.path.join(dest, rel_path)
50
- os.makedirs(os.path.dirname(out_path), exist_ok=True)
51
- file.GetContentFile(out_path)
52
-
53
-
54
- def main():
55
- parser = argparse.ArgumentParser(
56
- description="Download a full Google Drive folder using a service account"
57
- )
58
- parser.add_argument("folder_id", help="Google Drive folder ID")
59
- parser.add_argument("output_dir", help="Directory to save files")
60
- parser.add_argument(
61
- "--service-account",
62
- default="service_account.json",
63
- help="Path to your Google service account JSON key file",
64
- )
65
- args = parser.parse_args()
66
-
67
- download_folder(args.folder_id, args.output_dir, args.service_account)
68
-
69
-
70
- if __name__ == "__main__":
71
- # also, mkdir -p dataset/
72
- path = Path("./dataset")
73
- path.mkdir(exists_ok=True)
74
-
75
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -8,3 +8,4 @@ Pillow>=9.5.0
8
  PyYAML>=6.0.1
9
  tqdm>=4.65.0
10
  gdown>=5.1.0
 
 
8
  PyYAML>=6.0.1
9
  tqdm>=4.65.0
10
  gdown>=5.1.0
11
+ pydrive2
scripts/pull_and_preprocess_wireseghr_dataset.py ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import threading
4
+ import random
5
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
+ from pydrive2.auth import GoogleAuth
7
+ from pydrive2.drive import GoogleDrive
8
+ from tqdm import tqdm
9
+ from pathlib import Path
10
+
11
# One Drive client per worker thread; PyDrive2 clients are cached here because
# they are not shared safely across threads.
thread_local = threading.local()


def _get_thread_drive(service_account_json: str) -> GoogleDrive:
    """Return the calling thread's cached GoogleDrive client, creating it lazily."""
    drive = getattr(thread_local, "drive", None)
    if drive is None:
        drive = authenticate(service_account_json)
        thread_local.drive = drive
    return drive
20
+
21
+
22
def authenticate(service_account_json):
    """Build a GoogleDrive client authenticated with a service-account key.

    service_account_json: path to the Google service-account JSON key file.
    Returns a ready-to-use GoogleDrive instance.
    """
    gauth = GoogleAuth()
    # Switch PyDrive2 from the interactive OAuth flow to the service backend.
    gauth.settings["client_config_backend"] = "service"
    gauth.settings["service_config"] = {
        "client_json_file_path": service_account_json,
        # Provide the key to satisfy PyDrive2 even if not impersonating
        "client_user_email": "drive-bot@web-design-396514.iam.gserviceaccount.com",
    }
    gauth.ServiceAuth()
    return GoogleDrive(gauth)
35
+
36
+
37
def list_files_with_paths(drive, folder_id, prefix=""):
    """Recursively collect all files with their relative paths from a folder.

    drive: an authenticated GoogleDrive client.
    folder_id: Drive folder ID to walk.
    prefix: relative path accumulated from parent folders.
    Returns a list of dicts with keys id, rel_path, size, md5, mimeType.
    """
    collected = []
    params = {
        "q": f"'{folder_id}' in parents and trashed=false",
        "maxResults": 1000,
        # Request only needed fields (Drive API v2 uses 'items')
        "fields": "items(id,title,mimeType,fileSize,md5Checksum),nextPageToken",
    }
    for entry in drive.ListFile(params).GetList():
        rel = os.path.join(prefix, entry["title"]) if prefix else entry["title"]
        if entry["mimeType"] == "application/vnd.google-apps.folder":
            # Descend into the subfolder, carrying the accumulated path.
            collected.extend(list_files_with_paths(drive, entry["id"], rel))
        else:
            # Google-native docs have no fileSize; record 0 for them.
            size = int(entry.get("fileSize", 0)) if "fileSize" in entry else 0
            collected.append(
                {
                    "id": entry["id"],
                    "rel_path": rel,
                    "size": size,
                    "md5": entry.get("md5Checksum", ""),
                    "mimeType": entry["mimeType"],
                }
            )
    return collected
66
+
67
+
68
def download_folder(folder_id, dest, service_account_json, workers: int):
    """Mirror a Drive folder tree into `dest` with parallel downloads.

    Files whose on-disk size already matches the Drive-reported size are
    skipped, so re-running resumes an interrupted pull.

    folder_id: Drive folder ID to mirror.
    dest: local destination directory (created if missing).
    service_account_json: path to the service-account key file.
    workers: number of parallel download threads.
    Raises: any exception raised by a worker download (see BUGFIX below).
    """
    drive = authenticate(service_account_json)
    os.makedirs(dest, exist_ok=True)

    print(f"Listing files in folder {folder_id}...")
    files_with_paths = list_files_with_paths(drive, folder_id)
    total = len(files_with_paths)
    print(f"Found {total} files. Planning downloads...")

    # Prepare tasks and skip already downloaded files by size
    tasks = []
    skipped = 0
    for meta in files_with_paths:
        out_path = os.path.join(dest, meta["rel_path"])
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        if (
            meta["size"] > 0
            and os.path.exists(out_path)
            and os.path.getsize(out_path) == meta["size"]
        ):
            skipped += 1
            continue
        tasks.append((meta["id"], out_path))

    print(f"Skipping {skipped} existing files; {len(tasks)} to download.")

    def _download_one(file_id: str, out_path: str):
        # Each worker thread authenticates once and reuses its own client.
        d = _get_thread_drive(service_account_json)
        f = d.CreateFile({"id": file_id})
        f.GetContentFile(out_path)

    if len(tasks) == 0:
        print("All files are up to date.")
        return

    with ThreadPoolExecutor(max_workers=workers) as ex:
        futures = [ex.submit(_download_one, fid, path) for fid, path in tasks]
        for fut in tqdm(
            as_completed(futures), total=len(futures), desc="Downloading", unit="file"
        ):
            # BUGFIX: .result() re-raises exceptions from the worker; the
            # previous `pass` silently discarded failed downloads, leaving
            # missing/truncated files with a successful exit status.
            fut.result()
103
+
104
+
105
def pull(args=None):
    """CLI entry point: download the WireSegHR Drive folder.

    args: optional argv list; None means argparse reads sys.argv.
    """
    parser = argparse.ArgumentParser(
        description="Download a full Google Drive folder using a service account"
    )
    # (flags, keyword arguments) for each CLI option, added in one pass below.
    flag_specs = [
        (
            "--folder-id",
            dict(
                dest="folder_id",
                default="1fgy3wn_yuHEeMNbfiHNVl1-jEdYOfu6p",
                help="Google Drive folder ID",
            ),
        ),
        (
            "--output-dir",
            dict(dest="output_dir", default="dataset/", help="Directory to save files"),
        ),
        (
            "--service-account",
            dict(
                default="secrets/drive-json.json",
                help="Path to your Google service account JSON key file",
            ),
        ),
        (
            "--workers",
            dict(type=int, default=8, help="Number of parallel download workers"),
        ),
    ]
    for flag, kwargs in flag_specs:
        parser.add_argument(flag, **kwargs)

    opts = parser.parse_args(args=args)
    download_folder(opts.folder_id, opts.output_dir, opts.service_account, opts.workers)
137
+
138
+
139
+ def _index_numeric_pairs(images_dir: Path, masks_dir: Path):
140
+ assert images_dir.exists() and images_dir.is_dir(), f"Missing images_dir: {images_dir}"
141
+ assert masks_dir.exists() and masks_dir.is_dir(), f"Missing masks_dir: {masks_dir}"
142
+ img_files = sorted([p for p in images_dir.glob("*.jpg") if p.is_file()])
143
+ img_files += sorted([p for p in images_dir.glob("*.jpeg") if p.is_file()])
144
+ assert len(img_files) > 0, f"No .jpg/.jpeg images in {images_dir}"
145
+ ids = []
146
+ for p in img_files:
147
+ stem = p.stem
148
+ assert stem.isdigit(), f"Non-numeric filename encountered: {p.name}"
149
+ ids.append(int(stem))
150
+ ids = sorted(ids)
151
+ pairs = []
152
+ for i in ids:
153
+ ip_jpg = images_dir / f"{i}.jpg"
154
+ ip_jpeg = images_dir / f"{i}.jpeg"
155
+ ip = ip_jpg if ip_jpg.exists() else ip_jpeg
156
+ assert ip.exists(), f"Missing image for {i}: {ip_jpg} or {ip_jpeg}"
157
+ mp = masks_dir / f"{i}.png"
158
+ assert mp.exists(), f"Missing mask for {i}: {mp}"
159
+ pairs.append((ip, mp))
160
+ assert len(pairs) > 0, "No numeric pairs found"
161
+ return pairs
162
+
163
+
164
+ def split_test_train_val(args=None):
165
+ parser = argparse.ArgumentParser(
166
+ description="Split dataset into train/val/test = 85/5/10 with numeric pairs"
167
+ )
168
+ parser.add_argument("--images-dir", required=True, help="Path to images directory")
169
+ parser.add_argument("--masks-dir", required=True, help="Path to masks directory")
170
+ parser.add_argument(
171
+ "--out-dir",
172
+ required=True,
173
+ help="Output root dir where train/ val/ test/ will be created",
174
+ )
175
+ parser.add_argument("--seed", type=int, default=42, help="Random seed")
176
+ parser.add_argument(
177
+ "--link-method",
178
+ choices=["symlink", "copy"],
179
+ default="symlink",
180
+ help="How to place files into splits",
181
+ )
182
+ parsed = parser.parse_args(args=args)
183
+
184
+ images_dir = Path(parsed.images_dir)
185
+ masks_dir = Path(parsed.masks_dir)
186
+ out_root = Path(parsed.out_dir)
187
+ pairs = _index_numeric_pairs(images_dir, masks_dir)
188
+
189
+ n = len(pairs)
190
+ n_train = int(0.85 * n)
191
+ n_val = int(0.05 * n)
192
+ rng = random.Random(parsed.seed)
193
+ idxs = list(range(n))
194
+ rng.shuffle(idxs)
195
+ train_idx = idxs[:n_train]
196
+ val_idx = idxs[n_train : n_train + n_val]
197
+ test_idx = idxs[n_train + n_val :]
198
+
199
+ def _ensure_dirs(root: Path):
200
+ (root / "images").mkdir(parents=True, exist_ok=True)
201
+ (root / "gts").mkdir(parents=True, exist_ok=True)
202
+
203
+ def _place(src: Path, dst: Path):
204
+ if parsed.link_method == "symlink":
205
+ try:
206
+ if dst.exists() or dst.is_symlink():
207
+ dst.unlink()
208
+ os.symlink(src, dst)
209
+ except FileExistsError:
210
+ pass
211
+ else: # copy
212
+ if dst.exists():
213
+ dst.unlink()
214
+ # use hardlink if possible to be fast and space efficient
215
+ try:
216
+ os.link(src, dst)
217
+ except OSError:
218
+ import shutil
219
+
220
+ shutil.copy2(src, dst)
221
+
222
+ for split_name, split_ids in (
223
+ ("train", train_idx),
224
+ ("val", val_idx),
225
+ ("test", test_idx),
226
+ ):
227
+ root = out_root / split_name
228
+ _ensure_dirs(root)
229
+ for k in split_ids:
230
+ img_p, mask_p = pairs[k]
231
+ (root / "images" / img_p.name).parent.mkdir(parents=True, exist_ok=True)
232
+ (root / "gts" / mask_p.name).parent.mkdir(parents=True, exist_ok=True)
233
+ _place(img_p, root / "images" / img_p.name)
234
+ _place(mask_p, root / "gts" / mask_p.name)
235
+ print(
236
+ f"Split written to {out_root} | train={len(train_idx)} val={len(val_idx)} test={len(test_idx)}"
237
+ )
238
+
239
+
240
if __name__ == "__main__":
    # Ensure dataset/ exists before any subcommand runs (mkdir -p dataset/).
    Path("./dataset").mkdir(exist_ok=True)

    # Top-level CLI with one subparser per utility.
    top = argparse.ArgumentParser(description="WireSegHR data utilities")
    subs = top.add_subparsers(dest="cmd", required=True)

    sp_pull = subs.add_parser("pull", help="Download dataset from Google Drive")
    sp_pull.add_argument(
        "--folder-id", dest="folder_id", default="1fgy3wn_yuHEeMNbfiHNVl1-jEdYOfu6p"
    )
    sp_pull.add_argument("--output-dir", dest="output_dir", default="dataset/")
    sp_pull.add_argument("--service-account", default="secrets/drive-json.json")
    sp_pull.add_argument("--workers", type=int, default=8)

    sp_split = subs.add_parser(
        "split_test_train_val", help="Create 85/5/10 train/val/test split"
    )
    sp_split.add_argument("--images-dir", required=True)
    sp_split.add_argument("--masks-dir", required=True)
    sp_split.add_argument("--out-dir", required=True)
    sp_split.add_argument("--seed", type=int, default=42)
    sp_split.add_argument(
        "--link-method", choices=["symlink", "copy"], default="symlink"
    )

    ns = top.parse_args()
    # Re-serialize the parsed options and forward them to the subcommand's
    # own parser, so each entry point stays independently callable.
    if ns.cmd == "pull":
        forwarded = [
            "--folder-id", ns.folder_id,
            "--output-dir", ns.output_dir,
            "--service-account", ns.service_account,
            "--workers", str(ns.workers),
        ]
        pull(forwarded)
    elif ns.cmd == "split_test_train_val":
        forwarded = [
            "--images-dir", ns.images_dir,
            "--masks-dir", ns.masks_dir,
            "--out-dir", ns.out_dir,
            "--seed", str(ns.seed),
            "--link-method", ns.link_method,
        ]
        split_test_train_val(forwarded)
scripts/pull_ttpla.sh CHANGED
File without changes
scripts/setup_script.sh ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
set -euo pipefail
# This script downloads WireSegHR and TTPLA, converts TTPLA to masks, combines both,
# and creates an 85/5/10 train/val/test split under dataset/.

# 0) Setup env (includes gdown used by scripts/pull_ttpla.sh)
pip install uv
uv venv || true
source .venv/bin/activate
pip install uv
uv pip install -r requirements.txt
uv pip install gdown

# 1) Pull WireSegHR dataset from Google Drive (default folder-id provided in script)
#    This writes under dataset/wireseghr_raw/ (adjust if you want another dir)
python3 scripts/pull_and_preprocess_wireseghr_dataset.py pull \
  --output-dir dataset/wireseghr_raw

# 2) Pull TTPLA dataset zip and unzip under dataset/ttpla_dataset/
#    Pass OUT_DIR explicitly to avoid nested dataset/dataset/ttpla_dataset
bash scripts/pull_ttpla.sh "" "" ttpla_dataset

# 3) Convert TTPLA JSON annotations to binary masks with numeric-only filenames
#    Set these two to your actual TTPLA paths (after unzip).
TTPLA_JSON_ROOT="dataset/ttpla_dataset" # directory containing LabelMe-style JSONs (recursively)
mkdir -p dataset/ttpla_flat/gts
python3 scripts/ttpla_to_masks.py \
  --input "$TTPLA_JSON_ROOT" \
  --output dataset/ttpla_flat/gts \
  --label cable

# 4) Flatten TTPLA images to numeric-only stems to match the masks
#    Set TTPLA_IMG_ROOT to the folder under which all TTPLA images can be found (recursively).
# BUGFIX: export the variable — a plain shell assignment is NOT visible to the
# child python3 process, so os.environ.get() below always fell back to its default.
export TTPLA_IMG_ROOT="dataset/ttpla_dataset" # directory where the images referenced by JSONs reside (recursively)
mkdir -p dataset/ttpla_flat/images
python3 - <<'PY'
from pathlib import Path
import json, os, shutil

ttpla_json_root = Path("dataset/ttpla_dataset")
img_root = Path(os.environ.get("TTPLA_IMG_ROOT", "dataset/ttpla_dataset"))
out_img = Path("dataset/ttpla_flat/images")
out_img.mkdir(parents=True, exist_ok=True)

jsons = sorted(ttpla_json_root.rglob("*.json"))
assert len(jsons) > 0, f"No JSONs under {ttpla_json_root}"
for jp in jsons:
    data = json.loads(jp.read_text())
    image_path = Path(data["imagePath"])  # e.g. "1_00186.jpg"
    stem_raw = image_path.stem
    num = "".join([c for c in stem_raw if c.isdigit()])
    assert num.isdigit() and len(num) > 0, f"Non-numeric from {stem_raw}"
    # locate the actual image file somewhere under img_root by filename
    cands = list(img_root.rglob(image_path.name))
    assert len(cands) == 1, f"Ambiguous or missing image for {image_path.name}: {cands}"
    src = cands[0]
    ext = src.suffix.lower()  # keep original .jpg/.jpeg
    dst = out_img / f"{num}{ext}"
    if dst.exists() or dst.is_symlink():
        dst.unlink()
    # Prefer hardlink for speed and space efficiency; fallback to copy
    try:
        os.link(src, dst)
    except OSError:
        shutil.copy2(src, dst)
print(f"TTPLA flat images written to: {out_img}")
PY

# 5) Point to WireSegHR raw images/masks (adjust these to match what was downloaded in step 1)
#    After the Drive pull, inspect to find these two folders:
#    They must contain numeric-only image stems (.jpg/.jpeg) and PNG masks.
#    Example placeholders below — update them to your actual locations:
export WSHR_IMAGES="dataset/wireseghr_raw/images"
export WSHR_MASKS="dataset/wireseghr_raw/gts"

# 6) Build a combined pool (WireSegHR + TTPLA) and reindex to a single contiguous numeric ID space
mkdir -p dataset/combined_pool_fix/images dataset/combined_pool_fix/gts
python3 - <<'PY'
import os
from pathlib import Path

def index_pairs(images_dir: Path, masks_dir: Path):
    imgs = list(images_dir.glob("*.jpg")) + list(images_dir.glob("*.jpeg"))
    pairs = {}
    for ip in imgs:
        assert ip.stem.isdigit(), f"Non-numeric image name: {ip.name}"
        mp = masks_dir / f"{ip.stem}.png"
        assert mp.exists(), f"Missing mask for {ip.stem}: {mp}"
        pairs[int(ip.stem)] = (ip, mp)
    return [pairs[k] for k in sorted(pairs.keys())]

w_images = Path(os.environ["WSHR_IMAGES"])
w_masks = Path(os.environ["WSHR_MASKS"])
t_images = Path("dataset/ttpla_flat/images")
t_masks = Path("dataset/ttpla_flat/gts")

w_pairs = index_pairs(w_images, w_masks)
t_pairs = index_pairs(t_images, t_masks)
print("w_pairs:", len(w_pairs), "t_pairs:", len(t_pairs))

all_pairs = w_pairs + t_pairs  # deterministic order: WireSegHR first, then TTPLA
out_img = Path("dataset/combined_pool_fix/images")
out_msk = Path("dataset/combined_pool_fix/gts")
out_img.mkdir(parents=True, exist_ok=True)
out_msk.mkdir(parents=True, exist_ok=True)

# Reindex to 1..N, preserving each image's original extension
i = 1
for ip, mp in all_pairs:
    ext = ip.suffix.lower()  # .jpg or .jpeg
    dst_i = out_img / f"{i}{ext}"
    dst_m = out_msk / f"{i}.png"
    if dst_i.exists() or dst_i.is_symlink(): dst_i.unlink()
    if dst_m.exists() or dst_m.is_symlink(): dst_m.unlink()
    # Prefer hardlinks; fallback to copy if cross-device or unsupported
    try:
        os.link(ip, dst_i)
    except OSError:
        import shutil; shutil.copy2(ip, dst_i)
    try:
        os.link(mp, dst_m)
    except OSError:
        import shutil; shutil.copy2(mp, dst_m)
    i += 1

print(f"Combined pool: {i-1} pairs -> {out_img} and {out_msk}")
PY

# 7) Split the combined pool into train/val/test = 85/5/10
python3 scripts/pull_and_preprocess_wireseghr_dataset.py split_test_train_val \
  --images-dir dataset/combined_pool_fix/images \
  --masks-dir dataset/combined_pool_fix/gts \
  --out-dir dataset \
  --seed 42 \
  --link-method copy

# Done. Your config at configs/default.yaml already points to dataset/train|val|test.
scripts/ttpla_to_masks.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Runner shim for the TTPLA-to-mask converter.

Puts the repo's src/ directory on sys.path so the `wireseghr` package is
importable when this script is executed directly, then hands off to the
package's CLI entry point.
"""
import sys
from pathlib import Path

# Ensure local package under src/ is importable when running this script directly
PROJECT_ROOT = Path(__file__).resolve().parents[1]
SRC_PATH = PROJECT_ROOT / "src"
_src_str = str(SRC_PATH)
if _src_str not in sys.path:
    sys.path.insert(0, _src_str)

from wireseghr.data.ttpla_to_masks import main

if __name__ == "__main__":
    main()
src/wireseghr/data/dataset.py CHANGED
@@ -46,8 +46,8 @@ class WireSegDataset:
46
 
47
  def _index_pairs(self) -> List[Tuple[Path, Path]]:
48
  # Convention: numeric filenames; images are .jpg/.jpeg; masks (gts) are .png
49
- img_files = sorted([p for p in self.images_dir.glob("*.jpg") if p.is_file()])
50
- img_files += sorted([p for p in self.images_dir.glob("*.jpeg") if p.is_file()])
51
  assert len(img_files) > 0, f"No .jpg/.jpeg images in {self.images_dir}"
52
  pairs: List[Tuple[Path, Path]] = []
53
  ids: List[int] = []
 
46
 
47
  def _index_pairs(self) -> List[Tuple[Path, Path]]:
48
  # Convention: numeric filenames; images are .jpg/.jpeg; masks (gts) are .png
49
+ img_files = sorted([p for p in self.images_dir.glob("*.jpg") if p.exists()])
50
+ img_files += sorted([p for p in self.images_dir.glob("*.jpeg") if p.exists()])
51
  assert len(img_files) > 0, f"No .jpg/.jpeg images in {self.images_dir}"
52
  pairs: List[Tuple[Path, Path]] = []
53
  ids: List[int] = []
src/wireseghr/data/ttpla_to_masks.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ from pathlib import Path
6
+ from typing import Iterable, List
7
+
8
+ from PIL import Image, ImageDraw
9
+ import numpy as np
10
+
11
+
12
def _rasterize_cable_mask(shapes: List[dict], height: int, width: int, label: str) -> np.ndarray:
    """Rasterize polygons carrying `label` into a (H, W) uint8 mask of {0, 255}.

    Expects LabelMe-style shape dicts with keys:
    - label: str
    - shape_type: "polygon"
    - points: [[x, y], ...]
    """
    assert height > 0 and width > 0
    # PIL image size is (W, H), unlike the numpy (H, W) convention.
    canvas = Image.new("L", (width, height), 0)
    painter = ImageDraw.Draw(canvas)

    for shape in shapes:
        if shape.get("label") != label:
            continue
        assert shape.get("shape_type") == "polygon", "Only polygon shapes are supported"
        coords = np.asarray(shape.get("points"), dtype=np.float32)
        assert coords.ndim == 2 and coords.shape[1] == 2, "Invalid points array"
        # Round to nearest pixel and clip into the image bounds.
        coords = np.rint(coords)
        coords[:, 0] = np.clip(coords[:, 0], 0, width - 1)
        coords[:, 1] = np.clip(coords[:, 1], 0, height - 1)
        # ImageDraw.polygon takes a list of (x, y) integer tuples.
        painter.polygon([(int(x), int(y)) for x, y in coords], outline=255, fill=255)

    return np.asarray(canvas, dtype=np.uint8)
41
+
42
+
43
def _convert_one(json_path: Path, out_dir: Path, label: str) -> Path | None:
    """Convert one LabelMe JSON into a binary PNG mask named by its numeric stem.

    Returns the written mask path.
    """
    with open(json_path, "r") as f:
        data = json.load(f)

    shapes = data["shapes"]
    height = int(data["imageHeight"])  # required by given JSON
    width = int(data["imageWidth"])  # required by given JSON
    image_path = Path(data["imagePath"])  # e.g. "1_00186.jpg"
    stem_raw = image_path.stem
    # WireSegDataset expects numeric filename stems: keep only the digits.
    digits = "".join(ch for ch in stem_raw if ch.isdigit())
    assert digits.isdigit() and len(digits) > 0, f"Non-numeric stem derived from {stem_raw}"

    mask = _rasterize_cable_mask(shapes, height, width, label)

    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / f"{digits}.png"
    # Write with Pillow
    Image.fromarray(mask, mode="L").save(str(out_path))
    return out_path
63
+
64
+
65
def convert_ttpla_jsons_to_masks(input_path: str | Path, output_dir: str | Path, label: str = "cable", recursive: bool = True) -> List[Path]:
    """Convert TTPLA LabelMe JSON annotations into binary masks matching WireSegHR conventions.

    input_path: a directory containing JSONs, or a single .json file.
    output_dir: directory where .png masks will be written.
    label: which label to rasterize (default "cable").
    recursive: when input_path is a directory, whether to search subdirectories.

    Returns the list of written mask paths.
    """
    src = Path(input_path)
    dst = Path(output_dir)

    # Single-file mode: convert exactly one annotation.
    if src.is_file():
        assert src.suffix.lower() == ".json", f"Expected a .json file, got: {src}"
        result = _convert_one(src, dst, label)
        return [result] if result else []

    assert src.is_dir(), f"Input path must be a directory or a .json file: {src}"

    json_iter: Iterable[Path] = src.rglob("*.json") if recursive else src.glob("*.json")

    written: List[Path] = []
    # Sort for a deterministic conversion order.
    for json_file in sorted(json_iter):
        mask_path = _convert_one(json_file, dst, label)
        if mask_path is not None:
            written.append(mask_path)
    return written
97
+
98
+
99
def main(argv: List[str] | None = None) -> None:
    """CLI wrapper around convert_ttpla_jsons_to_masks."""
    parser = argparse.ArgumentParser(description="Convert TTPLA LabelMe JSONs to WireSegHR-style binary masks")
    parser.add_argument("--input", required=True, help="Path to a directory of JSONs or a single JSON file")
    parser.add_argument("--output", required=True, help="Output directory for PNG masks")
    parser.add_argument("--label", default="cable", help="Label to rasterize (default: cable)")
    parser.add_argument("--no-recursive", action="store_true", help="Do not search subdirectories")
    ns = parser.parse_args(argv)

    search_recursively = not ns.no_recursive
    convert_ttpla_jsons_to_masks(
        ns.input,
        ns.output,
        label=ns.label,
        recursive=search_recursively,
    )


if __name__ == "__main__":
    main()
tests/test_ttpla_to_masks.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import json
3
+ import numpy as np
4
+ from PIL import Image
5
+
6
+ from wireseghr.data.ttpla_to_masks import convert_ttpla_jsons_to_masks
7
+
8
+
9
def _read_dims(json_path: Path):
    """Return (imageHeight, imageWidth, original image filename stem) from a LabelMe JSON."""
    with open(json_path, "r") as f:
        data = json.load(f)
    return int(data["imageHeight"]), int(data["imageWidth"]), Path(data["imagePath"]).stem


# BUGFIX: locate the example JSON relative to this test file (tests/ is one
# level below the repo root) instead of the hard-coded absolute path
# /workspace/wire-seg-hr-impl/1_00186.json, so the suite runs from any checkout.
_EXAMPLE_JSON = Path(__file__).resolve().parents[1] / "1_00186.json"


def test_convert_single_json_cable_only(tmp_path: Path):
    # Use the provided example JSON at repo root
    src_json = _EXAMPLE_JSON
    assert src_json.exists()

    H, W, stem = _read_dims(src_json)

    out_dir = tmp_path / "masks"
    written = convert_ttpla_jsons_to_masks(src_json, out_dir, label="cable")

    assert len(written) == 1
    out_path = written[0]
    # Converter writes numeric-only stems
    expected_stem = "".join([c for c in stem if c.isdigit()])
    assert out_path.name == f"{expected_stem}.png"
    assert out_path.exists()

    mask = np.array(Image.open(out_path).convert("L"))
    assert mask is not None
    assert mask.shape == (H, W)
    assert mask.dtype == np.uint8

    # Binary with values in {0,255}
    uniq = np.unique(mask)
    assert all(int(v) in (0, 255) for v in uniq)
    assert (mask > 0).any(), "Expected some positive pixels for cable"


def test_convert_different_labels(tmp_path: Path):
    src_json = _EXAMPLE_JSON
    assert src_json.exists()

    out_dir_cable = tmp_path / "masks_cable"
    out_dir_tower = tmp_path / "masks_tower"

    written_cable = convert_ttpla_jsons_to_masks(src_json, out_dir_cable, label="cable")
    written_tower = convert_ttpla_jsons_to_masks(src_json, out_dir_tower, label="tower_wooden")

    mc = np.array(Image.open(written_cable[0]).convert("L"))
    mt = np.array(Image.open(written_tower[0]).convert("L"))

    # Both masks should have some positives and should not be identical
    assert (mc > 0).any()
    assert (mt > 0).any()
    assert not np.array_equal(mc, mt)
train.py CHANGED
@@ -15,12 +15,12 @@ import random
15
  import torch.backends.cudnn as cudnn
16
  import cv2
17
 
18
- from wireseghr.model import WireSegHR
19
- from wireseghr.model.minmax import MinMaxLuminance
20
- from wireseghr.data.dataset import WireSegDataset
21
- from wireseghr.model.label_downsample import downsample_label_maxpool
22
- from wireseghr.data.sampler import BalancedPatchSampler
23
- from wireseghr.metrics import compute_metrics
24
 
25
 
26
  def main():
 
15
  import torch.backends.cudnn as cudnn
16
  import cv2
17
 
18
+ from src.wireseghr.model import WireSegHR
19
+ from src.wireseghr.model.minmax import MinMaxLuminance
20
+ from src.wireseghr.data.dataset import WireSegDataset
21
+ from src.wireseghr.model.label_downsample import downsample_label_maxpool
22
+ from src.wireseghr.data.sampler import BalancedPatchSampler
23
+ from src.wireseghr.metrics import compute_metrics
24
 
25
 
26
  def main():