Upload folder using huggingface_hub
Browse files- src/__init__.py +0 -0
- src/__pycache__/__init__.cpython-311.pyc +0 -0
- src/__pycache__/best_predictions.cpython-311.pyc +0 -0
- src/__pycache__/evaluate.cpython-311.pyc +0 -0
- src/__pycache__/train.cpython-311.pyc +0 -0
- src/best_predictions.py +132 -0
- src/data/__init__.py +0 -0
- src/data/__pycache__/__init__.cpython-311.pyc +0 -0
- src/data/__pycache__/dataset.cpython-311.pyc +0 -0
- src/data/__pycache__/download.cpython-311.pyc +0 -0
- src/data/__pycache__/preprocess.cpython-311.pyc +0 -0
- src/data/dataset.py +92 -0
- src/data/download.py +26 -0
- src/data/preprocess.py +258 -0
- src/evaluate.py +179 -0
- src/model/__init__.py +0 -0
- src/model/__pycache__/__init__.cpython-311.pyc +0 -0
- src/model/__pycache__/clipseg_wrapper.cpython-311.pyc +0 -0
- src/model/__pycache__/losses.cpython-311.pyc +0 -0
- src/model/clipseg_wrapper.py +25 -0
- src/model/losses.py +38 -0
- src/predict.py +57 -0
- src/train.py +194 -0
src/__init__.py
ADDED
|
File without changes
|
src/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (158 Bytes). View file
|
|
|
src/__pycache__/best_predictions.cpython-311.pyc
ADDED
|
Binary file (8.72 kB). View file
|
|
|
src/__pycache__/evaluate.cpython-311.pyc
ADDED
|
Binary file (12.4 kB). View file
|
|
|
src/__pycache__/train.cpython-311.pyc
ADDED
|
Binary file (11.4 kB). View file
|
|
|
src/best_predictions.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Find best and worst predictions by per-sample IoU and generate showcase figures."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
import matplotlib
|
| 7 |
+
matplotlib.use("Agg")
|
| 8 |
+
import matplotlib.pyplot as plt
|
| 9 |
+
import numpy as np
|
| 10 |
+
from PIL import Image
|
| 11 |
+
from tqdm import tqdm
|
| 12 |
+
|
| 13 |
+
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def iou(pred: np.ndarray, gt: np.ndarray) -> float:
    """Return the intersection-over-union of two binary masks.

    Both inputs are treated as boolean foreground masks. Returns 0.0 when the
    union is empty (both masks all-background) to avoid division by zero.
    """
    union = np.logical_or(pred, gt).sum()
    if union == 0:
        return 0.0
    overlap = np.logical_and(pred, gt).sum()
    return float(overlap / union)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def score_all():
    """Score every test prediction against ground truth. Returns dict of per-class scored lists."""
    # Test split produced by src/data/preprocess.py (list of sample records).
    with open(PROJECT_ROOT / "data" / "splits" / "test.json") as f:
        test_samples = json.load(f)

    masks_dir = PROJECT_ROOT / "outputs" / "masks"
    scores = {"taping": [], "cracks": []}

    for sample in tqdm(test_samples, desc="Scoring predictions"):
        img_stem = Path(sample["image_path"]).stem
        ds = sample["dataset"]

        # Prediction masks are written by evaluate.py as
        # "<image_stem>__<prompt_slug>.png"; one image may have several
        # predictions (one per prompt variant).
        candidates = list(masks_dir.glob(f"{img_stem}__*.png"))
        if not candidates:
            # No prediction file for this sample — silently excluded from scoring.
            continue

        gt = np.array(Image.open(sample["mask_path"]).convert("L"))
        gt_bin = (gt > 127).astype(np.uint8)  # binarize the 0/255 ground-truth mask

        # Keep only the best-scoring prediction (prompt) per image.
        best_iou = -1
        best_pred_path = None
        best_prompt = None
        for pred_path in candidates:
            # Predictions may be stored at a different resolution than the GT
            # mask; NEAREST resize keeps the mask binary.
            pred = np.array(Image.open(pred_path).convert("L").resize(
                (gt.shape[1], gt.shape[0]), Image.NEAREST))
            pred_bin = (pred > 127).astype(np.uint8)
            score = iou(pred_bin, gt_bin)
            if score > best_iou:
                best_iou = score
                best_pred_path = pred_path
                # Recover the human-readable prompt from the filename slug.
                # NOTE(review): assumes neither the image stem nor the prompt
                # slug contains "__" — confirm against evaluate.py's naming.
                best_prompt = pred_path.stem.split("__")[1].replace("_", " ")

        scores[ds].append({
            "image_path": sample["image_path"],
            "mask_path": sample["mask_path"],
            "pred_path": str(best_pred_path),
            "prompt": best_prompt,
            "iou": best_iou,
            "dataset": ds,
        })

    return scores
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def pick_ranked(scores, n_per_class=3, best=True):
    """Select the top-N (best=True) or bottom-N (best=False) samples per class by IoU.

    *scores* is the dict produced by score_all(). Prints a short summary per
    class and returns the selected records as a flat list.
    """
    label = "best" if best else "worst"
    picked = []
    for ds in ["cracks", "taping"]:
        entries = scores[ds]
        if not best:
            # For failure cases, drop zero-IoU samples (no usable prediction)
            # so we showcase genuine mistakes rather than missing outputs.
            entries = [e for e in entries if e["iou"] > 0]
        chosen = sorted(entries, key=lambda e: e["iou"], reverse=best)[:n_per_class]
        picked += chosen

        print(f"\n{ds} {label} {n_per_class}:")
        for r in chosen:
            print(f" IoU={r['iou']:.4f} {Path(r['image_path']).name} \"{r['prompt']}\"")

    return picked
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def generate_grid(examples, output_path, title=""):
    """Generate original | ground truth | prediction comparison grid.

    Each element of *examples* is a dict with image_path, mask_path, pred_path,
    prompt, iou and dataset keys (as produced by score_all()). The figure is
    written to *output_path*; nothing is returned.
    """
    n = len(examples)
    fig, axes = plt.subplots(n, 3, figsize=(14, 4.0 * n))
    if n == 1:
        # subplots() returns a 1-D array for a single row; wrap it so the
        # axes[i][j] indexing below works uniformly.
        axes = [axes]

    if title:
        fig.suptitle(title, fontsize=16, fontweight="bold", y=0.998)

    for i, ex in enumerate(examples):
        img = Image.open(ex["image_path"]).convert("RGB")
        gt = Image.open(ex["mask_path"]).convert("L")
        # Prediction may be stored at a different resolution; NEAREST keeps it binary.
        pred = Image.open(ex["pred_path"]).convert("L").resize(
            (gt.size[0], gt.size[1]), Image.NEAREST)

        label = ex["dataset"].capitalize()

        axes[i][0].imshow(img)
        axes[i][0].set_title(f"Input — {label}", fontsize=11, fontweight="bold")
        axes[i][0].axis("off")

        axes[i][1].imshow(gt, cmap="gray", vmin=0, vmax=255)
        axes[i][1].set_title("Ground Truth", fontsize=11)
        axes[i][1].axis("off")

        axes[i][2].imshow(pred, cmap="gray", vmin=0, vmax=255)
        axes[i][2].set_title(f"Predicted — \"{ex['prompt']}\" (IoU {ex['iou']:.2f})", fontsize=11)
        axes[i][2].axis("off")

    plt.tight_layout()
    plt.savefig(output_path, dpi=150, bbox_inches="tight", facecolor="white")
    plt.close()
    print(f"Saved → {output_path}")
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
if __name__ == "__main__":
    figures_dir = PROJECT_ROOT / "reports" / "figures"
    # Fix: create the output directory up front — matplotlib's savefig (called
    # inside generate_grid) does not create missing parent directories and
    # would raise FileNotFoundError on a fresh checkout.
    figures_dir.mkdir(parents=True, exist_ok=True)
    scores = score_all()

    # Best predictions (3 per class)
    best = pick_ranked(scores, n_per_class=3, best=True)
    generate_grid(best, figures_dir / "best_predictions.png",
                  title="Best Test-Set Predictions (by IoU)")

    # Worst predictions (3 per class) — only samples where model actually predicted something
    worst = pick_ranked(scores, n_per_class=3, best=False)
    generate_grid(worst, figures_dir / "failure_cases.png",
                  title="Failure Cases — Worst Test-Set Predictions (by IoU)")
|
src/data/__init__.py
ADDED
|
File without changes
|
src/data/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (163 Bytes). View file
|
|
|
src/data/__pycache__/dataset.cpython-311.pyc
ADDED
|
Binary file (6.37 kB). View file
|
|
|
src/data/__pycache__/download.cpython-311.pyc
ADDED
|
Binary file (3.26 kB). View file
|
|
|
src/data/__pycache__/preprocess.cpython-311.pyc
ADDED
|
Binary file (15.8 kB). View file
|
|
|
src/data/dataset.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""PyTorch Dataset for CLIPSeg fine-tuning."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import random
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
import torch
|
| 9 |
+
from PIL import Image
|
| 10 |
+
from torch.utils.data import Dataset
|
| 11 |
+
from transformers import CLIPSegProcessor
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class DrywallSegDataset(Dataset):
    """Dataset that yields (image, mask, prompt) tuples for CLIPSeg.

    Each record in the split JSON must provide image_path, mask_path, prompts
    (list of synonym strings), dataset name, and original width/height.
    """

    def __init__(self, split_json: str, processor: CLIPSegProcessor, image_size: int = 352):
        with open(split_json) as f:
            self.records = json.load(f)
        self.processor = processor
        # CLIPSeg's native input/output resolution (352 for the base model).
        self.image_size = image_size

    def __len__(self) -> int:
        return len(self.records)

    def __getitem__(self, idx):
        rec = self.records[idx]

        # Load image
        image = Image.open(rec["image_path"]).convert("RGB")

        # Load mask and resize to CLIPSeg resolution
        mask = Image.open(rec["mask_path"]).convert("L")
        mask = mask.resize((self.image_size, self.image_size), Image.NEAREST)
        # Masks are written as 0/255 by preprocessing, so dividing by 255 yields
        # {0.0, 1.0} (NEAREST resize introduces no intermediate values).
        mask_tensor = torch.from_numpy(np.array(mask)).float() / 255.0  # {0.0, 1.0}

        # Random prompt synonym — acts as text-side augmentation across epochs.
        prompt = random.choice(rec["prompts"])

        # Process through CLIPSeg processor
        inputs = self.processor(
            text=[prompt],
            images=[image],
            return_tensors="pt",
            padding=True,
        )

        # squeeze(0) drops the batch dim added by the processor; per-item text
        # lengths may still differ, so collate_fn pads them at batch time.
        return {
            "pixel_values": inputs["pixel_values"].squeeze(0),
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": mask_tensor,
            "dataset": rec["dataset"],
            "image_path": rec["image_path"],
            "mask_path": rec["mask_path"],
            "prompt": prompt,
            "orig_width": rec["width"],
            "orig_height": rec["height"],
        }
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def collate_fn(batch):
    """Collate dataset items into a batch, zero-padding text tensors.

    input_ids and attention_mask are right-padded with zeros to the longest
    sequence in the batch; image tensors and labels are stacked directly, and
    per-sample metadata is carried through as plain lists.
    """
    target_len = max(item["input_ids"].shape[0] for item in batch)

    def _pad_to_target(tensor):
        # Right-pad a 1-D tensor with zeros up to target_len (no-op if already there).
        shortfall = target_len - tensor.shape[0]
        if shortfall <= 0:
            return tensor
        return torch.cat([tensor, torch.zeros(shortfall, dtype=tensor.dtype)])

    collated = {
        "pixel_values": torch.stack([item["pixel_values"] for item in batch]),
        "input_ids": torch.stack([_pad_to_target(item["input_ids"]) for item in batch]),
        "attention_mask": torch.stack([_pad_to_target(item["attention_mask"]) for item in batch]),
        "labels": torch.stack([item["labels"] for item in batch]),
    }
    # Non-tensor metadata stays as ordinary Python lists.
    for key in ("dataset", "image_path", "mask_path", "prompt", "orig_width", "orig_height"):
        collated[key] = [item[key] for item in batch]
    return collated
|
src/data/download.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Dataset download instructions.
|
| 2 |
+
|
| 3 |
+
Both datasets must be downloaded manually from Roboflow Universe in COCO format.
|
| 4 |
+
The Roboflow API cannot be used because the cracks dataset (cracks-3ii36) has
|
| 5 |
+
no generated versions — the owner never created an exportable version.
|
| 6 |
+
|
| 7 |
+
Download locations:
|
| 8 |
+
- Taping: https://universe.roboflow.com/objectdetect-pu6rn/drywall-join-detect
|
| 9 |
+
→ Export as COCO, place under data/raw/taping/
|
| 10 |
+
- Cracks: https://universe.roboflow.com/fyp-ny1jt/cracks-3ii36
|
| 11 |
+
→ Export as COCO, place under data/raw/cracks/
|
| 12 |
+
|
| 13 |
+
Expected structure after download:
|
| 14 |
+
data/raw/
|
| 15 |
+
├── taping/
|
| 16 |
+
│ ├── train/
|
| 17 |
+
│ │ ├── _annotations.coco.json
|
| 18 |
+
│ │ └── *.jpg
|
| 19 |
+
│ └── valid/
|
| 20 |
+
│ ├── _annotations.coco.json
|
| 21 |
+
│ └── *.jpg
|
| 22 |
+
└── cracks/
|
| 23 |
+
└── train/
|
| 24 |
+
├── _annotations.coco.json
|
| 25 |
+
└── *.jpg
|
| 26 |
+
"""
|
src/data/preprocess.py
ADDED
|
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Inspect annotations, generate masks, create train/val/test splits."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import random
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
from PIL import Image
|
| 9 |
+
from pycocotools.coco import COCO
|
| 10 |
+
from pycocotools import mask as mask_utils
|
| 11 |
+
|
| 12 |
+
# Repository data directories, resolved relative to this file
# (src/data/preprocess.py → project root is two levels up).
_DATA_DIR = Path(__file__).resolve().parents[2] / "data"
RAW_DIR = _DATA_DIR / "raw"
PROCESSED_DIR = _DATA_DIR / "processed"
SPLITS_DIR = _DATA_DIR / "splits"
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def inspect_dataset(coco_json_path: str) -> dict:
    """Summarise the annotation types present in a COCO JSON file.

    Counts annotations carrying a usable segmentation (polygon with at least
    3 points, or an RLE dict) versus bbox-only annotations, and reports which
    kind dominates, plus image count and category names.
    """
    with open(coco_json_path) as f:
        payload = json.load(f)

    annotations = payload.get("annotations", [])
    seg_count = 0
    bbox_only_count = 0
    for ann in annotations:
        seg = ann.get("segmentation")
        # Polygon form: non-empty list whose first polygon has >= 6 coords.
        is_polygon = bool(seg) and isinstance(seg, list) and len(seg) > 0 and len(seg[0]) >= 6
        # RLE form: a dict (typically {"counts": ..., "size": ...}).
        is_rle = bool(seg) and isinstance(seg, dict)
        if is_polygon or is_rle:
            seg_count += 1
        else:
            bbox_only_count += 1

    return {
        "total_annotations": len(annotations),
        "total_images": len(payload.get("images", [])),
        "has_segmentation": seg_count,
        "has_bbox_only": bbox_only_count,
        "annotation_type": "segmentation" if seg_count > bbox_only_count else "bbox_only",
        "categories": [c["name"] for c in payload.get("categories", [])],
    }
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def render_masks_from_coco(coco_json_path: str, images_dir: str, output_dir: str) -> list[dict]:
    """Render binary masks from COCO polygon/RLE annotations.

    Returns list of {image_path, mask_path, image_id, width, height}.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    coco = COCO(coco_json_path)
    records = []

    for img_id in sorted(coco.getImgIds()):
        img_info = coco.loadImgs(img_id)[0]
        h, w = img_info["height"], img_info["width"]

        ann_ids = coco.getAnnIds(imgIds=img_id)
        anns = coco.loadAnns(ann_ids)

        # Images without annotations produce no mask (and no record).
        if not anns:
            continue

        # Merge all annotations into one binary mask
        combined = np.zeros((h, w), dtype=np.uint8)
        for ann in anns:
            seg = ann.get("segmentation")
            # Skip annotations with empty or invalid segmentation
            if not seg:
                continue
            # Nested-list polygon with fewer than 3 points (6 coords) is degenerate.
            if isinstance(seg, list) and (len(seg) == 0 or (len(seg) > 0 and isinstance(seg[0], list) and len(seg[0]) < 6)):
                continue
            # Flat coordinate list shorter than 6 values is likewise degenerate.
            if isinstance(seg, list) and len(seg) > 0 and not isinstance(seg[0], list) and len(seg) < 6:
                continue
            try:
                # annToRLE handles both polygon and RLE encodings uniformly.
                rle = coco.annToRLE(ann)
                m = mask_utils.decode(rle)
                combined = np.maximum(combined, m)
            except (IndexError, ValueError):
                # Fall back to bbox if segmentation decode fails
                if "bbox" in ann:
                    x, y, bw, bh = [int(v) for v in ann["bbox"]]
                    # NOTE(review): negative bbox coordinates would wrap with
                    # numpy slicing — assumed non-negative in these exports; verify.
                    combined[y:y+bh, x:x+bw] = 1

        # Save the merged mask as a 0/255 grayscale PNG with a stable name.
        mask_img = Image.fromarray(combined * 255, mode="L")
        mask_name = Path(img_info["file_name"]).stem + "_mask.png"
        mask_path = output_dir / mask_name
        mask_img.save(mask_path)

        image_path = Path(images_dir) / img_info["file_name"]
        records.append({
            "image_path": str(image_path),
            "mask_path": str(mask_path),
            "image_id": img_id,
            "width": w,
            "height": h,
        })

    return records
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def render_masks_from_bboxes(coco_json_path: str, images_dir: str, output_dir: str) -> list[dict]:
    """Create filled-rectangle masks from bounding boxes (fallback when no segmentation)."""
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    with open(coco_json_path) as f:
        data = json.load(f)

    # Index images by id and group annotations per image for one-pass rendering.
    img_lookup = {img["id"]: img for img in data["images"]}
    anns_by_img: dict[int, list] = {}
    for ann in data["annotations"]:
        anns_by_img.setdefault(ann["image_id"], []).append(ann)

    records = []
    for img_id, img_info in sorted(img_lookup.items()):
        anns = anns_by_img.get(img_id, [])
        # Images without annotations produce no mask (and no record).
        if not anns:
            continue

        h, w = img_info["height"], img_info["width"]
        combined = np.zeros((h, w), dtype=np.uint8)

        # Foreground is the union of all bbox rectangles.
        for ann in anns:
            x, y, bw, bh = [int(v) for v in ann["bbox"]]
            combined[y:y+bh, x:x+bw] = 1

        # Save as a 0/255 grayscale PNG mirroring the segmentation renderer's naming.
        mask_img = Image.fromarray(combined * 255, mode="L")
        mask_name = Path(img_info["file_name"]).stem + "_mask.png"
        mask_path = output_dir / mask_name
        mask_img.save(mask_path)

        image_path = Path(images_dir) / img_info["file_name"]
        records.append({
            "image_path": str(image_path),
            "mask_path": str(mask_path),
            "image_id": img_id,
            "width": w,
            "height": h,
        })

    return records
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def find_coco_json(dataset_dir: Path) -> tuple[str, str] | None:
|
| 148 |
+
"""Find the COCO JSON and images directory in a Roboflow download."""
|
| 149 |
+
for split in ["train", "valid", "test"]:
|
| 150 |
+
json_path = dataset_dir / split / "_annotations.coco.json"
|
| 151 |
+
if json_path.exists():
|
| 152 |
+
return str(json_path), str(dataset_dir / split)
|
| 153 |
+
# Single-folder layout
|
| 154 |
+
for json_path in dataset_dir.rglob("_annotations.coco.json"):
|
| 155 |
+
return str(json_path), str(json_path.parent)
|
| 156 |
+
return None
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
def process_dataset(name: str, dataset_dir: Path, prompt_synonyms: list[str]) -> list[dict]:
    """Process a single dataset: inspect, render masks, return records with prompts."""
    records = []
    mask_dir = PROCESSED_DIR / name / "masks"

    # Process each split folder (train/valid/test from Roboflow)
    for split_dir in sorted(dataset_dir.iterdir()):
        if not split_dir.is_dir():
            continue
        json_path = split_dir / "_annotations.coco.json"
        if not json_path.exists():
            continue

        print(f"\n Processing {name}/{split_dir.name}...")
        info = inspect_dataset(str(json_path))
        print(f" Images: {info['total_images']}, Annotations: {info['total_annotations']}")
        print(f" Type: {info['annotation_type']}, Categories: {info['categories']}")

        split_mask_dir = mask_dir / split_dir.name
        # Prefer true segmentation masks; fall back to filled bbox rectangles
        # when the export carries no polygons/RLE.
        if info["annotation_type"] == "segmentation":
            split_records = render_masks_from_coco(
                str(json_path), str(split_dir), str(split_mask_dir)
            )
        else:
            print(f" WARNING: bbox-only annotations, using filled rectangles")
            split_records = render_masks_from_bboxes(
                str(json_path), str(split_dir), str(split_mask_dir)
            )

        for r in split_records:
            r["dataset"] = name
            # NOTE(review): every record shares the same prompts list object —
            # fine for JSON dumping, but mutating it later would affect all records.
            r["prompts"] = prompt_synonyms
        records.extend(split_records)

    return records
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
def create_splits(records: list[dict], ratios: tuple = (0.70, 0.15, 0.15), seed: int = 42):
    """Split records into train/val/test, stratified by dataset.

    Writes each split as JSON into SPLITS_DIR and returns the split dict.
    """
    random.seed(seed)  # deterministic shuffles for reproducible splits

    # Group records per source dataset so each gets the same split ratios.
    by_dataset: dict[str, list] = {}
    for r in records:
        by_dataset.setdefault(r["dataset"], []).append(r)

    train, val, test = [], [], []
    for name, recs in by_dataset.items():
        random.shuffle(recs)
        n = len(recs)
        n_train = int(n * ratios[0])
        n_val = int(n * ratios[1])
        train.extend(recs[:n_train])
        val.extend(recs[n_train:n_train + n_val])
        # Remainder goes to test, so integer rounding never drops a sample.
        test.extend(recs[n_train + n_val:])

    # Shuffle again so datasets are interleaved rather than concatenated.
    random.shuffle(train)
    random.shuffle(val)
    random.shuffle(test)

    SPLITS_DIR.mkdir(parents=True, exist_ok=True)
    for split_name, split_data in [("train", train), ("val", val), ("test", test)]:
        path = SPLITS_DIR / f"{split_name}.json"
        with open(path, "w") as f:
            json.dump(split_data, f, indent=2)
        print(f" {split_name}: {len(split_data)} samples -> {path}")

    return {"train": train, "val": val, "test": test}
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
def run(config: dict):
    """Run full preprocessing pipeline.

    Expects config["data"]["prompt_synonyms"] (mapping of dataset name to a
    list of prompt strings), config["data"]["split_ratios"], and config["seed"].
    Returns the {"train": ..., "val": ..., "test": ...} split dict.
    """
    synonyms = config["data"]["prompt_synonyms"]
    ratios = tuple(config["data"]["split_ratios"])

    all_records = []
    for name in ["taping", "cracks"]:
        dataset_dir = RAW_DIR / name
        if not dataset_dir.exists():
            # Missing downloads are tolerated — see src/data/download.py for
            # the manual download instructions.
            print(f"WARNING: {dataset_dir} not found, skipping {name}")
            continue
        print(f"\n{'='*60}")
        print(f"Processing dataset: {name}")
        print(f"{'='*60}")
        records = process_dataset(name, dataset_dir, synonyms[name])
        all_records.extend(records)
        print(f" Total records for {name}: {len(records)}")

    print(f"\n{'='*60}")
    print(f"Creating splits (total: {len(all_records)} records)")
    print(f"{'='*60}")
    splits = create_splits(all_records, ratios=ratios, seed=config["seed"])
    return splits
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
if __name__ == "__main__":
    # Standalone entry point: load the training config and run the full pipeline.
    import yaml
    config_path = Path(__file__).resolve().parents[2] / "configs" / "train_config.yaml"
    with open(config_path) as f:
        config = yaml.safe_load(f)
    run(config)
|
src/evaluate.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Evaluate trained CLIPSeg model and generate prediction masks + visuals."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import time
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import matplotlib
|
| 8 |
+
matplotlib.use("Agg")
|
| 9 |
+
import matplotlib.pyplot as plt
|
| 10 |
+
import numpy as np
|
| 11 |
+
import torch
|
| 12 |
+
import yaml
|
| 13 |
+
from PIL import Image
|
| 14 |
+
from torch.utils.data import DataLoader
|
| 15 |
+
from tqdm import tqdm
|
| 16 |
+
|
| 17 |
+
from src.data.dataset import DrywallSegDataset, collate_fn
|
| 18 |
+
from src.model.clipseg_wrapper import load_model_and_processor
|
| 19 |
+
from src.train import compute_metrics, get_device
|
| 20 |
+
|
| 21 |
+
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def evaluate(config_path: str | None = None):
    """Evaluate the best checkpoint on the test split.

    Writes per-image prediction masks to outputs/masks/, aggregated metrics to
    outputs/logs/test_results.json, and a visual comparison figure to
    reports/figures/. Returns the results dict.
    """
    config_path = config_path or str(PROJECT_ROOT / "configs" / "train_config.yaml")
    with open(config_path) as f:
        config = yaml.safe_load(f)

    device = get_device()
    threshold = config["evaluation"]["threshold"]  # sigmoid cutoff for binarizing logits

    # Load model with best checkpoint
    model, processor = load_model_and_processor(config["model"]["name"], config["model"]["freeze_backbone"])
    ckpt_path = PROJECT_ROOT / "outputs" / "checkpoints" / "best_model.pt"
    model.load_state_dict(torch.load(ckpt_path, map_location="cpu", weights_only=True))
    model = model.to(device)
    model.eval()

    # Model size (parameter storage only, in MB)
    model_size_mb = sum(p.numel() * p.element_size() for p in model.parameters()) / (1024 * 1024)

    # Test data
    splits_dir = PROJECT_ROOT / "data" / "splits"
    test_ds = DrywallSegDataset(str(splits_dir / "test.json"), processor, config["data"]["image_size"])
    test_loader = DataLoader(test_ds, batch_size=config["training"]["batch_size"], shuffle=False,
                             collate_fn=collate_fn, num_workers=0)

    # Run evaluation
    masks_dir = PROJECT_ROOT / "outputs" / "masks"
    masks_dir.mkdir(parents=True, exist_ok=True)

    all_metrics = {"taping": {"miou": [], "dice": []}, "cracks": {"miou": [], "dice": []}}
    inference_times = []
    visual_examples = []  # Collect for visualization
    total_samples = 0

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            pixel_values = batch["pixel_values"].to(device)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # NOTE(review): wall-clock timing around a forward pass under-counts
            # on CUDA (kernels launch asynchronously) — confirm this is
            # acceptable for the reported inference numbers.
            t0 = time.time()
            outputs = model(pixel_values=pixel_values, input_ids=input_ids, attention_mask=attention_mask)
            inference_times.append((time.time() - t0) / pixel_values.size(0))

            logits = outputs.logits
            metrics = compute_metrics(logits, labels, threshold)
            preds = (torch.sigmoid(logits) > threshold).cpu().numpy().astype(np.uint8)

            for i in range(pixel_values.size(0)):
                ds_name = batch["dataset"][i]
                # NOTE(review): compute_metrics() runs once per batch, and the
                # same batch-level miou/dice is appended for every sample in the
                # batch regardless of its class — per-class averages are thus
                # approximations whenever a batch mixes classes; verify intent.
                all_metrics[ds_name]["miou"].append(metrics["miou"])
                all_metrics[ds_name]["dice"].append(metrics["dice"])

                # Save prediction mask at original resolution
                orig_w, orig_h = batch["orig_width"][i], batch["orig_height"][i]
                pred_mask = Image.fromarray(preds[i] * 255, mode="L")
                pred_mask = pred_mask.resize((orig_w, orig_h), Image.NEAREST)

                # Filename encodes image stem + prompt so best_predictions.py
                # can recover the prompt later.
                prompt_slug = batch["prompt"][i].replace(" ", "_")
                img_stem = Path(batch["image_path"][i]).stem
                mask_filename = f"{img_stem}__{prompt_slug}.png"
                pred_mask.save(masks_dir / mask_filename)

                total_samples += 1

                # Collect visual examples
                if len(visual_examples) < config["evaluation"]["num_visual_examples"]:
                    visual_examples.append({
                        "image_path": batch["image_path"][i],
                        "mask_path": batch["mask_path"][i],
                        "pred_mask": preds[i],
                        "prompt": batch["prompt"][i],
                        "dataset": ds_name,
                    })

    # Aggregate metrics
    results = {"per_class": {}, "overall": {}}
    all_miou, all_dice = [], []
    for ds_name in ["taping", "cracks"]:
        m = all_metrics[ds_name]
        if m["miou"]:
            results["per_class"][ds_name] = {
                "miou": round(float(np.mean(m["miou"])), 4),
                "dice": round(float(np.mean(m["dice"])), 4),
                "samples": len(m["miou"]),
            }
            all_miou.extend(m["miou"])
            all_dice.extend(m["dice"])

    results["overall"] = {
        "miou": round(float(np.mean(all_miou)), 4) if all_miou else 0,
        "dice": round(float(np.mean(all_dice)), 4) if all_dice else 0,
        "total_samples": total_samples,
    }
    results["runtime"] = {
        "avg_inference_ms": round(float(np.mean(inference_times)) * 1000, 1),
        "model_size_mb": round(model_size_mb, 1),
    }

    # Save results
    log_dir = PROJECT_ROOT / "outputs" / "logs"
    log_dir.mkdir(parents=True, exist_ok=True)
    with open(log_dir / "test_results.json", "w") as f:
        json.dump(results, f, indent=2)

    print(f"\n{'='*60}")
    print(f"Test Results")
    print(f"{'='*60}")
    for ds_name, m in results["per_class"].items():
        print(f" {ds_name:>10s}: mIoU={m['miou']:.4f} Dice={m['dice']:.4f} (n={m['samples']})")
    print(f" {'overall':>10s}: mIoU={results['overall']['miou']:.4f} Dice={results['overall']['dice']:.4f}")
    print(f" Avg inference: {results['runtime']['avg_inference_ms']:.1f} ms/image")
    print(f" Model size: {results['runtime']['model_size_mb']:.1f} MB")

    # Generate visual comparison figures
    _generate_visuals(visual_examples, PROJECT_ROOT / "reports" / "figures")

    return results
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def _generate_visuals(examples: list[dict], output_dir: Path):
    """Render a side-by-side original | GT | prediction panel per example."""
    output_dir.mkdir(parents=True, exist_ok=True)

    if not examples:
        return

    n_rows = len(examples)
    fig, axes = plt.subplots(n_rows, 3, figsize=(12, 4 * n_rows))
    if n_rows == 1:
        axes = [axes]  # keep row-style indexing when subplots returns a flat row

    for row, ex in zip(axes, examples):
        original = Image.open(ex["image_path"]).convert("RGB")
        ground_truth = Image.open(ex["mask_path"]).convert("L")
        # pred_mask is assumed binary {0, 1}; scale to full 8-bit range — TODO confirm
        prediction = Image.fromarray(ex["pred_mask"] * 255, mode="L")

        panels = (
            (original, f"Original ({ex['dataset']})", {}),
            (ground_truth, "Ground Truth", {"cmap": "gray", "vmin": 0, "vmax": 255}),
            (prediction, f"Prediction: \"{ex['prompt']}\"", {"cmap": "gray", "vmin": 0, "vmax": 255}),
        )
        for ax, (panel, title, imshow_kwargs) in zip(row, panels):
            ax.imshow(panel, **imshow_kwargs)
            ax.set_title(title)
            ax.axis("off")

    plt.tight_layout()
    plt.savefig(output_dir / "visual_comparison.png", dpi=150, bbox_inches="tight")
    plt.close()
    print(f"Saved visual comparison to {output_dir / 'visual_comparison.png'}")
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
if __name__ == "__main__":
    # Script entry point: run the full test-set evaluation defined above.
    evaluate()
|
src/model/__init__.py
ADDED
|
File without changes
|
src/model/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (164 Bytes). View file
|
|
|
src/model/__pycache__/clipseg_wrapper.cpython-311.pyc
ADDED
|
Binary file (1.89 kB). View file
|
|
|
src/model/__pycache__/losses.cpython-311.pyc
ADDED
|
Binary file (3.34 kB). View file
|
|
|
src/model/clipseg_wrapper.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CLIPSeg model loading and freezing utilities."""
|
| 2 |
+
|
| 3 |
+
from transformers import CLIPSegForImageSegmentation, CLIPSegProcessor
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def load_model_and_processor(model_name: str = "CIDAS/clipseg-rd64-refined", freeze_backbone: bool = True):
    """Load CLIPSeg model and processor, optionally freezing the backbone.

    When freeze_backbone is True, only parameters whose name contains
    "decoder" keep gradients; everything else is frozen. Parameter counts
    are printed either way.
    """
    model = CLIPSegForImageSegmentation.from_pretrained(model_name)
    processor = CLIPSegProcessor.from_pretrained(model_name)

    if freeze_backbone:
        n_trainable = 0
        n_frozen = 0
        for param_name, param in model.named_parameters():
            is_decoder = "decoder" in param_name
            param.requires_grad = is_decoder
            if is_decoder:
                n_trainable += param.numel()
            else:
                n_frozen += param.numel()
        print(f"Parameters — trainable (decoder): {n_trainable:,} | frozen (backbone): {n_frozen:,}")
    else:
        total = sum(p.numel() for p in model.parameters())
        print(f"Parameters — all trainable: {total:,}")

    return model, processor
|
src/model/losses.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Custom loss functions for segmentation."""
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class DiceLoss(nn.Module):
    """Soft Dice loss operating on logits.

    Returns ``1 - mean(dice)`` where the Dice coefficient is computed
    per-sample on sigmoid probabilities, then averaged over the batch.

    Args:
        smooth: Additive smoothing term; keeps the ratio finite when both
            the prediction and the target are empty.
    """

    def __init__(self, smooth: float = 1.0):
        super().__init__()
        self.smooth = smooth

    def forward(self, logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        """Compute the soft Dice loss.

        Args:
            logits: Raw model outputs, shape (B, ...); sigmoid is applied here.
            targets: Masks of the same shape with values in [0, 1].

        Returns:
            Scalar tensor: 1 minus the batch-mean Dice coefficient.
        """
        probs = torch.sigmoid(logits)
        # flatten() instead of view(): view() raises on non-contiguous
        # tensors (e.g. slices or permuted inputs); flatten() handles both.
        probs_flat = probs.flatten(start_dim=1)
        targets_flat = targets.flatten(start_dim=1)

        intersection = (probs_flat * targets_flat).sum(dim=1)
        union = probs_flat.sum(dim=1) + targets_flat.sum(dim=1)

        dice = (2.0 * intersection + self.smooth) / (union + self.smooth)
        return 1.0 - dice.mean()
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class BCEDiceLoss(nn.Module):
    """Weighted sum of binary cross-entropy (on logits) and soft Dice loss."""

    def __init__(self, bce_weight: float = 0.5, dice_weight: float = 0.5):
        super().__init__()
        self.bce_weight = bce_weight
        self.dice_weight = dice_weight
        self.bce = nn.BCEWithLogitsLoss()
        self.dice = DiceLoss()

    def forward(self, logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        """Return bce_weight * BCE + dice_weight * Dice for the batch."""
        bce_term = self.bce(logits, targets)
        dice_term = self.dice(logits, targets)
        return self.bce_weight * bce_term + self.dice_weight * dice_term
|
src/predict.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Standalone single-image inference for CLIPSeg."""
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
import numpy as np
|
| 7 |
+
import torch
|
| 8 |
+
import yaml
|
| 9 |
+
from PIL import Image
|
| 10 |
+
|
| 11 |
+
from src.model.clipseg_wrapper import load_model_and_processor
|
| 12 |
+
from src.train import get_device
|
| 13 |
+
|
| 14 |
+
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def predict(image_path: str, prompt: str, config_path: str | None = None, output_path: str | None = None):
    """Run single-image CLIPSeg inference and save the thresholded mask as a PNG.

    Loads the training config and best checkpoint, segments the image for the
    given text prompt, resizes the binary mask back to the input resolution,
    writes it to *output_path* (auto-derived when omitted) and returns it.
    """
    if not config_path:
        config_path = str(PROJECT_ROOT / "configs" / "train_config.yaml")
    with open(config_path) as fh:
        cfg = yaml.safe_load(fh)

    # Restore the fine-tuned weights onto the best available device.
    device = get_device()
    model, processor = load_model_and_processor(cfg["model"]["name"], cfg["model"]["freeze_backbone"])
    checkpoint = PROJECT_ROOT / "outputs" / "checkpoints" / "best_model.pt"
    model.load_state_dict(torch.load(checkpoint, map_location="cpu", weights_only=True))
    model = model.to(device).eval()

    source = Image.open(image_path).convert("RGB")
    width, height = source.size

    batch = processor(text=[prompt], images=[source], return_tensors="pt", padding=True)
    batch = {key: tensor.to(device) for key, tensor in batch.items()}

    with torch.no_grad():
        logits = model(**batch).logits

    # Threshold sigmoid probabilities into {0, 1}, then restore the original
    # resolution with nearest-neighbour so the mask stays binary.
    binary = (torch.sigmoid(logits[0]) > cfg["evaluation"]["threshold"]).cpu().numpy().astype(np.uint8)
    mask = Image.fromarray(binary * 255, mode="L").resize((width, height), Image.NEAREST)

    if output_path is None:
        stem = Path(image_path).stem
        slug = prompt.replace(" ", "_")
        output_path = str(PROJECT_ROOT / "outputs" / "masks" / f"{stem}__{slug}.png")

    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    mask.save(output_path)
    print(f"Saved mask to {output_path}")
    return mask
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
if __name__ == "__main__":
    # CLI entry point: python -m src.predict <image> "<prompt>" [--output PATH]
    parser = argparse.ArgumentParser()
    parser.add_argument("image", help="Path to input image")
    parser.add_argument("prompt", help="Text prompt, e.g. 'segment crack'")
    parser.add_argument("--output", help="Output mask path")
    args = parser.parse_args()
    predict(args.image, args.prompt, output_path=args.output)
|
src/train.py
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Training loop for CLIPSeg fine-tuning."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import time
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
import torch
|
| 9 |
+
import yaml
|
| 10 |
+
from torch.optim import AdamW
|
| 11 |
+
from torch.optim.lr_scheduler import CosineAnnealingLR
|
| 12 |
+
from torch.utils.data import DataLoader
|
| 13 |
+
from tqdm import tqdm
|
| 14 |
+
|
| 15 |
+
from src.data.dataset import DrywallSegDataset, collate_fn
|
| 16 |
+
from src.model.clipseg_wrapper import load_model_and_processor
|
| 17 |
+
from src.model.losses import BCEDiceLoss
|
| 18 |
+
|
| 19 |
+
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def compute_metrics(logits: torch.Tensor, targets: torch.Tensor, threshold: float = 0.5):
    """Compute batch-mean IoU and Dice from raw logits and binary targets.

    Both inputs are expected to be (B, H, W); logits are thresholded after
    sigmoid, targets are binarized at 0.5.
    """
    eps = 1e-6
    spatial_dims = (1, 2)

    pred_mask = (torch.sigmoid(logits) > threshold).float()
    true_mask = (targets > 0.5).float()

    overlap = (pred_mask * true_mask).sum(dim=spatial_dims)
    pred_area = pred_mask.sum(dim=spatial_dims)
    true_area = true_mask.sum(dim=spatial_dims)

    # eps guards the all-empty case in both numerator and denominator.
    iou = (overlap + eps) / (pred_area + true_area - overlap + eps)
    dice = (2 * overlap + eps) / (pred_area + true_area + eps)

    return {"miou": iou.mean().item(), "dice": dice.mean().item()}
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def get_device():
    """Pick the best available backend, preferring MPS, then CUDA, then CPU."""
    for name, available in (
        ("mps", torch.backends.mps.is_available),
        ("cuda", torch.cuda.is_available),
    ):
        if available():
            return torch.device(name)
    return torch.device("cpu")
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def train(config_path: str | None = None):
    """Fine-tune CLIPSeg on the drywall segmentation splits.

    Loads a YAML config, trains with BCE+Dice loss, validates each epoch,
    checkpoints the best val-mIoU model, and early-stops on patience.
    Writes training_history.json and training_summary.json under
    outputs/logs. Returns (model, history).
    """
    config_path = config_path or str(PROJECT_ROOT / "configs" / "train_config.yaml")
    with open(config_path) as f:
        config = yaml.safe_load(f)

    # Seed torch and numpy for reproducibility (recorded in the summary).
    seed = config["seed"]
    torch.manual_seed(seed)
    np.random.seed(seed)

    device = get_device()
    print(f"Device: {device}")

    # Model — with freeze_backbone=True only decoder params keep gradients
    # (see load_model_and_processor).
    model, processor = load_model_and_processor(
        config["model"]["name"],
        config["model"]["freeze_backbone"],
    )
    model = model.to(device)

    # Data — JSON split manifests; DrywallSegDataset handles loading/resizing.
    splits_dir = PROJECT_ROOT / "data" / "splits"
    train_ds = DrywallSegDataset(str(splits_dir / "train.json"), processor, config["data"]["image_size"])
    val_ds = DrywallSegDataset(str(splits_dir / "val.json"), processor, config["data"]["image_size"])

    tc = config["training"]
    train_loader = DataLoader(train_ds, batch_size=tc["batch_size"], shuffle=True,
                              collate_fn=collate_fn, num_workers=tc["num_workers"])
    val_loader = DataLoader(val_ds, batch_size=tc["batch_size"], shuffle=False,
                            collate_fn=collate_fn, num_workers=tc["num_workers"])

    # Loss, optimizer, scheduler — optimizer only sees trainable params,
    # so frozen backbone weights carry no optimizer state.
    criterion = BCEDiceLoss(tc["bce_weight"], tc["dice_weight"])
    optimizer = AdamW(
        [p for p in model.parameters() if p.requires_grad],
        lr=tc["lr"],
        weight_decay=tc["weight_decay"],
    )
    # Cosine anneal over the full configured epoch budget (stepped per epoch).
    scheduler = CosineAnnealingLR(optimizer, T_max=tc["epochs"])

    # Training state
    best_miou = 0.0
    patience_counter = 0
    history = {"train_loss": [], "val_loss": [], "val_miou": [], "val_dice": []}
    ckpt_dir = PROJECT_ROOT / "outputs" / "checkpoints"
    ckpt_dir.mkdir(parents=True, exist_ok=True)
    log_dir = PROJECT_ROOT / "outputs" / "logs"
    log_dir.mkdir(parents=True, exist_ok=True)

    start_time = time.time()

    for epoch in range(1, tc["epochs"] + 1):
        # ---- Train ----
        model.train()
        train_losses = []
        for batch in tqdm(train_loader, desc=f"Epoch {epoch}/{tc['epochs']} [train]", leave=False):
            pixel_values = batch["pixel_values"].to(device)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                pixel_values=pixel_values,
                input_ids=input_ids,
                attention_mask=attention_mask,
            )
            logits = outputs.logits
            loss = criterion(logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        scheduler.step()
        avg_train_loss = np.mean(train_losses)

        # ---- Validate ----
        model.eval()
        val_losses, val_mious, val_dices = [], [], []
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Epoch {epoch}/{tc['epochs']} [val]", leave=False):
                pixel_values = batch["pixel_values"].to(device)
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(
                    pixel_values=pixel_values,
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                )
                logits = outputs.logits
                loss = criterion(logits, labels)
                metrics = compute_metrics(logits, labels)

                val_losses.append(loss.item())
                val_mious.append(metrics["miou"])
                val_dices.append(metrics["dice"])

        avg_val_loss = np.mean(val_losses)
        avg_val_miou = np.mean(val_mious)
        avg_val_dice = np.mean(val_dices)

        history["train_loss"].append(float(avg_train_loss))
        history["val_loss"].append(float(avg_val_loss))
        history["val_miou"].append(float(avg_val_miou))
        history["val_dice"].append(float(avg_val_dice))

        print(f"Epoch {epoch:3d} | train_loss={avg_train_loss:.4f} | val_loss={avg_val_loss:.4f} | "
              f"val_mIoU={avg_val_miou:.4f} | val_Dice={avg_val_dice:.4f}")

        # Checkpoint on improvement; otherwise count toward early stopping.
        if avg_val_miou > best_miou:
            best_miou = avg_val_miou
            patience_counter = 0
            torch.save(model.state_dict(), ckpt_dir / "best_model.pt")
            print(f" -> New best mIoU: {best_miou:.4f}, saved checkpoint")
        else:
            patience_counter += 1
            if patience_counter >= tc["patience"]:
                print(f" Early stopping at epoch {epoch} (patience={tc['patience']})")
                break

    total_time = time.time() - start_time

    # Save history & summary. NOTE: `epoch` is the loop variable after the
    # loop ends, so total_epochs reflects early stopping correctly.
    with open(log_dir / "training_history.json", "w") as f:
        json.dump(history, f, indent=2)

    summary = {
        "total_epochs": epoch,
        "best_val_miou": float(best_miou),
        "total_time_seconds": round(total_time, 1),
        "total_time_minutes": round(total_time / 60, 1),
        "device": str(device),
        "train_samples": len(train_ds),
        "val_samples": len(val_ds),
        "seed": seed,
    }
    with open(log_dir / "training_summary.json", "w") as f:
        json.dump(summary, f, indent=2)

    print(f"\nTraining complete in {summary['total_time_minutes']} min")
    print(f"Best val mIoU: {best_miou:.4f}")
    return model, history
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
if __name__ == "__main__":
    # Script entry point: run training with the default config.
    train()
|