youngPhilosopher committed on Apr 10

Commit

451837b

verified ·

1 Parent(s): b891e61

Upload folder using huggingface_hub

Browse files

Files changed (15) hide show

.gitattributes +8 -0
reports/diagrams/Training Pipeline.png +0 -0
reports/diagrams/architecture.png +3 -0
reports/diagrams/architecture.py +54 -0
reports/diagrams/pipeline.d2 +69 -0
reports/diagrams/pipeline.png +3 -0
reports/diagrams/repo_structure.d2 +131 -0
reports/diagrams/repo_structure.png +3 -0
reports/diagrams/training.puml +64 -0
reports/figures/best_predictions.png +3 -0
reports/figures/failure_cases.png +3 -0
reports/figures/readme_showcase.png +3 -0
reports/figures/visual_comparison.png +3 -0
reports/report.pdf +3 -0
reports/report.typ +251 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+reports/diagrams/architecture.png filter=lfs diff=lfs merge=lfs -text
+reports/diagrams/pipeline.png filter=lfs diff=lfs merge=lfs -text
+reports/diagrams/repo_structure.png filter=lfs diff=lfs merge=lfs -text
+reports/figures/best_predictions.png filter=lfs diff=lfs merge=lfs -text
+reports/figures/failure_cases.png filter=lfs diff=lfs merge=lfs -text
+reports/figures/readme_showcase.png filter=lfs diff=lfs merge=lfs -text
+reports/figures/visual_comparison.png filter=lfs diff=lfs merge=lfs -text
+reports/report.pdf filter=lfs diff=lfs merge=lfs -text

reports/diagrams/Training Pipeline.png ADDED Viewed

reports/diagrams/architecture.png ADDED Viewed

Git LFS Details

SHA256: 4867045edbcc01f6c41f2d48ac81f8858dab680c3ba67603d2633764aa7f0ea1
Pointer size: 131 Bytes
Size of remote file: 189 kB

reports/diagrams/architecture.py ADDED Viewed

	@@ -0,0 +1,54 @@

+"""Generate CLIPSeg architecture diagram using the diagrams library."""
+from pathlib import Path
+from diagrams import Diagram, Cluster, Edge
+from diagrams.custom import Custom
+from diagrams.generic.blank import Blank
+from diagrams.programming.framework import React  # placeholder icons
+from diagrams.generic.compute import Rack
+from diagrams.generic.storage import Storage
+from diagrams.generic.database import SQL
+OUTPUT_DIR = Path(__file__).resolve().parent
+# Use generic shapes since diagrams lib is infrastructure-focused
+# We'll create a clean conceptual diagram
+with Diagram(
+    "CLIPSeg Architecture for Drywall QA",
+    filename=str(OUTPUT_DIR / "architecture"),
+    show=False,
+    direction="LR",
+    graph_attr={"fontsize": "16", "bgcolor": "white", "pad": "0.5"},
+):
+    with Cluster("Input"):
+        image_input = Rack("Image\n(RGB)")
+        text_input = Storage('Text Prompt\n"segment crack"')
+    with Cluster("CLIP Backbone (Frozen — 149.6M params)", graph_attr={"style": "dashed", "color": "#1565C0"}):
+        with Cluster("Vision Encoder"):
+            vis_enc = Rack("ViT-B/16\nPatch Embeddings\n+ Transformer")
+        with Cluster("Text Encoder"):
+            txt_enc = Storage("Token Embeddings\n+ Transformer")
+    with Cluster("Decoder (Trainable — 1.13M params)", graph_attr={"style": "filled", "color": "#C8E6C9"}):
+        decoder = SQL("3× Transformer\nDecoder Blocks\n(U-Net skip)")
+    with Cluster("Output"):
+        logits = Rack("Logits\n352×352")
+        mask = Storage("Binary Mask\n{0, 255}")
+    image_input >> Edge(label="pixel_values") >> vis_enc
+    text_input >> Edge(label="input_ids") >> txt_enc
+    vis_enc >> Edge(label="visual features") >> decoder
+    txt_enc >> Edge(label="text conditioning") >> decoder
+    decoder >> Edge(label="sigmoid + threshold") >> logits
+    logits >> mask
+if __name__ == "__main__":
+    print(f"Architecture diagram saved to {OUTPUT_DIR / 'architecture.png'}")

reports/diagrams/pipeline.d2 ADDED Viewed

	@@ -0,0 +1,69 @@

+# Data-preparation flow: dataset download -> annotation inspection ->
+# annotation-specific mask rendering -> unified manifest -> train/val/test split.
+title: |md
+  # Data Pipeline
+|
+# Source datasets, grouped inside one cloud-shaped container.
+roboflow: Roboflow Universe {
+  shape: cloud
+  d1: Dataset 1\nDrywall-Join-Detect\n(1,186 images)
+  d2: Dataset 2\nCracks\n(5,369 images)
+}
+download: Download\n(COCO format) {
+  shape: step
+}
+# Decision node: the outgoing edge labels below select the branch by annotation type.
+inspect: Inspect\nAnnotations {
+  shape: diamond
+}
+# Branch for bbox-only annotations (taping).
+bbox_mask: BBox → Filled\nRectangle Masks {
+  shape: step
+  style.fill: "#FFE0B2"
+}
+# Branch for segmentation annotations (cracks).
+poly_mask: Polygon →\nBinary Masks {
+  shape: step
+  style.fill: "#C8E6C9"
+}
+# Both branches merge into a single manifest node.
+unified: |md
+  Unified Manifest
+  (image, mask, prompts)
+| {
+  shape: document
+}
+split: Stratified Split\n70/15/15 {
+  shape: step
+}
+# Output splits; all three share the same cylinder shape and fill colour.
+train_set: Train\n4,588 {
+  shape: cylinder
+  style.fill: "#BBDEFB"
+}
+val_set: Val\n982 {
+  shape: cylinder
+  style.fill: "#BBDEFB"
+}
+test_set: Test\n985 {
+  shape: cylinder
+  style.fill: "#BBDEFB"
+}
+# Edges, listed in pipeline order.
+roboflow.d1 -> download
+roboflow.d2 -> download
+download -> inspect
+inspect -> bbox_mask: bbox only\n(taping)
+inspect -> poly_mask: segmentation\n(cracks)
+bbox_mask -> unified
+poly_mask -> unified
+unified -> split
+split -> train_set
+split -> val_set
+split -> test_set

reports/diagrams/pipeline.png ADDED Viewed

Git LFS Details

SHA256: bb77ea5a3c6980bf0f389ec2f4d49331ac6126f98307021ad2531aa002b1b1c9
Pointer size: 131 Bytes
Size of remote file: 311 kB

reports/diagrams/repo_structure.d2 ADDED Viewed

	@@ -0,0 +1,131 @@

+# Repository layout diagram: one top-level container per directory, connected
+# by the flow edges at the bottom of the file.
+direction: right
+# configs/ - a single YAML page node.
+configs: "configs/" {
+  shape: rectangle
+  style.fill: "#FFF3E0"
+  style.stroke: "#FB8C00"
+  style.border-radius: 6
+  style.font-size: 16
+  style.bold: true
+  yaml: "train_config.yaml\nAll hyperparameters" {
+    shape: page
+    style.fill: "#FFE0B2"
+  }
+}
+# src/ - three nested groups: data handling, model code, pipeline scripts.
+src: "src/" {
+  shape: rectangle
+  style.fill: "#E3F2FD"
+  style.stroke: "#1E88E5"
+  style.border-radius: 6
+  style.font-size: 16
+  style.bold: true
+  data: "data/" {
+    style.fill: "#BBDEFB"
+    style.border-radius: 4
+    p1: "preprocess.py\nMask rendering & splits" { shape: page; style.fill: "#E3F2FD" }
+    p2: "dataset.py\nPyTorch Dataset" { shape: page; style.fill: "#E3F2FD" }
+  }
+  model: "model/" {
+    style.fill: "#BBDEFB"
+    style.border-radius: 4
+    m1: "clipseg_wrapper.py\nModel loading & freezing" { shape: page; style.fill: "#E3F2FD" }
+    m2: "losses.py\nBCEDiceLoss" { shape: page; style.fill: "#E3F2FD" }
+  }
+  scripts: "Pipeline Scripts" {
+    style.fill: "#BBDEFB"
+    style.border-radius: 4
+    s1: "train.py" { shape: page; style.fill: "#E3F2FD" }
+    s2: "evaluate.py" { shape: page; style.fill: "#E3F2FD" }
+    s3: "predict.py" { shape: page; style.fill: "#E3F2FD" }
+  }
+}
+# reports/ - diagram sources, figures, and the Typst report.
+reports: "reports/" {
+  shape: rectangle
+  style.fill: "#E8F5E9"
+  style.stroke: "#43A047"
+  style.border-radius: 6
+  style.font-size: 16
+  style.bold: true
+  diag: "diagrams/" {
+    style.fill: "#C8E6C9"
+    style.border-radius: 4
+    d1: "architecture.py" { shape: page; style.fill: "#E8F5E9" }
+    d2_file: "pipeline.d2" { shape: page; style.fill: "#E8F5E9" }
+    d3: "training.puml" { shape: page; style.fill: "#E8F5E9" }
+  }
+  # figures/ is drawn as an empty group (no child nodes).
+  fig: "figures/" {
+    style.fill: "#C8E6C9"
+    style.border-radius: 4
+  }
+  pdf: "report.typ → PDF" {
+    shape: page
+    style.fill: "#A5D6A7"
+    style.font-size: 15
+    style.bold: true
+  }
+}
+# outputs/ - checkpoints, prediction masks, and logs.
+outputs: "outputs/" {
+  shape: rectangle
+  style.fill: "#F3E5F5"
+  style.stroke: "#8E24AA"
+  style.border-radius: 6
+  style.font-size: 16
+  style.bold: true
+  ckpt: "checkpoints/\nbest_model.pt (575 MB)" {
+    shape: cylinder
+    style.fill: "#CE93D8"
+  }
+  masks: "masks/\n985 prediction PNGs" {
+    shape: stored_data
+    style.fill: "#CE93D8"
+  }
+  logs: "logs/\ntraining & test JSON" {
+    style.fill: "#E1BEE7"
+    style.border-radius: 4
+  }
+}
+# Top-level data/ container; a separate key from the nested `src.data` group above.
+data: "data/" {
+  shape: rectangle
+  style.fill: "#FFF9C4"
+  style.stroke: "#FDD835"
+  style.border-radius: 6
+  style.font-size: 16
+  style.bold: true
+  raw: "raw/\nCOCO datasets" {
+    style.fill: "#FFF59D"
+    style.border-radius: 4
+  }
+  processed: "processed/\nBinary masks" {
+    style.fill: "#FFF59D"
+    style.border-radius: 4
+  }
+  splits: "splits/\ntrain.json · val.json · test.json" {
+    style.fill: "#FFF59D"
+    style.border-radius: 4
+  }
+}
+# Flow connections
+# Each edge stroke reuses the stroke colour of one of its endpoint containers,
+# except the dashed grey "configure" edge.
+configs -> src: "configure" { style.stroke-dash: 5; style.stroke: "#9E9E9E" }
+src -> outputs: "train & evaluate" { style.stroke: "#1E88E5" }
+src -> reports: "generate" { style.stroke: "#43A047" }
+data -> src: "feed" { style.stroke: "#FDD835" }

reports/diagrams/repo_structure.png ADDED Viewed

Git LFS Details

SHA256: 97279f1b9376b113362d22eb7ca2c8603c67443da429350079040b460ba37a04
Pointer size: 131 Bytes
Size of remote file: 588 kB

reports/diagrams/training.puml ADDED Viewed

	@@ -0,0 +1,64 @@

+@startuml Training Pipeline
+' Activity diagram of the fine-tuning loop: setup, per-step training with
+' end-of-epoch validation and early stopping, then final test evaluation.
+!theme plain
+skinparam backgroundColor #FFFFFF
+skinparam activityBackgroundColor #E3F2FD
+skinparam activityBorderColor #1565C0
+title CLIPSeg Fine-Tuning Pipeline
+start
+' --- Setup ---
+:Load pretrained CLIPSeg
+(CIDAS/clipseg-rd64-refined);
+:Freeze CLIP backbone
+(149.6M params frozen);
+:Initialize decoder training
+(1.13M params trainable);
+:Configure optimizer
+AdamW (lr=1e-4, wd=1e-4)
+CosineAnnealingLR scheduler;
+' --- Training loop: one pass through the body is one optimizer step ---
+repeat
+  :Forward pass
+  image + text prompt → logits (352×352);
+  :Compute BCEDiceLoss
+  0.5×BCE + 0.5×Dice;
+  :Backward pass + optimizer step;
+  ' Validation and the early-stopping check run only at epoch boundaries.
+  if (End of epoch?) then (yes)
+    :Validate on val set
+    Compute mIoU & Dice;
+    if (val mIoU improved?) then (yes)
+      :Save best checkpoint;
+      :Reset patience counter;
+    else (no)
+      :Increment patience counter;
+    endif
+    ' `break` leaves the repeat loop before the epoch limit is reached.
+    if (patience >= 7?) then (yes)
+      :Early stopping;
+      break
+    endif
+  else (no)
+  endif
+repeat while (epochs < 30)
+' --- Final evaluation with the best checkpoint ---
+:Load best checkpoint;
+:Evaluate on test set;
+:Generate prediction masks
+(threshold=0.5, resize to original);
+:Save metrics & visual comparisons;
+stop
+@enduml

reports/figures/best_predictions.png ADDED Viewed

Git LFS Details

SHA256: 830a35e88e68175e2d9799e3628783c1e8771ea23f98dac69335041c7d5a3a3e
Pointer size: 132 Bytes
Size of remote file: 1.61 MB

reports/figures/failure_cases.png ADDED Viewed

Git LFS Details

SHA256: 721c8bef97164440065e3b1417f0ac62c2ed1cc116878bf377ad97ed6376c4a8
Pointer size: 132 Bytes
Size of remote file: 1.9 MB

reports/figures/readme_showcase.png ADDED Viewed

Git LFS Details

SHA256: 2fb9b5f5b350964e2391f03066987262d6455bcd6a6fadea210a9b24395f2ada
Pointer size: 132 Bytes
Size of remote file: 1.45 MB

reports/figures/visual_comparison.png ADDED Viewed

Git LFS Details

SHA256: 5381e3793a0299aa1ed357d2d12381128f6e1adabfbf20b75bf1b64f519c6107
Pointer size: 132 Bytes
Size of remote file: 1.24 MB

reports/report.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:048c3df1a28bd8b633df91f1fd161389d9dca37582be2cc3a8b6aa938fddec00
+size 4900758

reports/report.typ ADDED Viewed

	@@ -0,0 +1,251 @@

+// Document-wide settings: metadata, page geometry, body font, heading numbering, justification.
+#set document(title: "Prompted Segmentation for Drywall QA", author: "Karthik M Dani")
+#set page(paper: "a4", margin: (x: 1cm, y: 1cm), numbering: "1")
+#set text(font: "New Computer Modern", size: 11pt)
+// "1.1" gives one counting symbol per level ("2", "2.1", ...). In the previous
+// pattern "1.0" the "0" is not a counting symbol, so it was emitted as literal
+// text and nested headings were numbered incorrectly.
+#set heading(numbering: "1.1")
+#set par(justify: true)
+// Title page
+#align(center)[
+  #v(3cm)
+  #text(size: 24pt, weight: "bold")[Prompted Segmentation for Drywall QA]
+  #v(1cm)
+  #text(size: 14pt)[Text-Conditioned Binary Mask Prediction]
+  #v(0.5cm)
+  #text(size: 12pt, fill: gray)[CLIPSeg Fine-Tuning on Construction Datasets]
+  #v(2cm)
+  #text(size: 12pt)[Karthik M Dani]
+  #v(0.3cm)
+  #text(size: 11pt, fill: gray)[April 2026]
+  #v(3cm)
+]
+#pagebreak()
+// Table of contents
+#outline(indent: 1.5em)
+#pagebreak()
+= Goal Summary
+Given an image and a natural-language prompt, produce a binary segmentation mask for:
+- *"segment crack"* — identifying wall cracks
+- *"segment taping area"* — identifying drywall joint/tape regions
+The model must generalize across varied scenes and respond to text prompts at inference time, enabling flexible QA workflows.
+= Approach
+== Why CLIPSeg?
+We evaluated four text-conditioned segmentation architectures:
+#table(
+  columns: (1fr, 1fr, 1fr, 1fr, 1fr),
+  align: (left, center, center, center, center),
+  table.header(
+    [*Model*], [*Text Input*], [*Small Data*], [*Consumer GPU*], [*HF Support*],
+  ),
+  [CLIPSeg], [Direct], [Excellent], [Yes], [Native],
+  [Grounded SAM], [Via detector], [Moderate], [Decoder only], [Native],
+  [SEEM], [Multi-modal], [Difficult], [No], [GitHub],
+  [X-Decoder], [Yes], [Not ideal], [No], [Limited],
+)
+*CLIPSeg* was selected because:
++ Direct text-to-mask conditioning (no bounding box intermediate)
++ Only 1.13M trainable decoder parameters on frozen 149.6M CLIP backbone
++ Proven fine-tuning on small datasets (under 1,000 images)
++ Native HuggingFace `transformers` support
+== Architecture
+#figure(
+  image("diagrams/architecture.png", width: 90%),
+  caption: [CLIPSeg architecture: frozen CLIP backbone with trainable decoder],
+)
+The model takes an RGB image and a text prompt. The CLIP vision encoder (ViT-B/16) and text encoder produce embeddings. A lightweight 3-block transformer decoder with U-Net skip connections generates logits at 352×352, which are thresholded to produce binary masks.
+= Data
+== Sources
+#table(
+  columns: (1fr, 2fr, 1fr, 1fr),
+  align: (left, left, center, center),
+  table.header(
+    [*Dataset*], [*Source*], [*Images*], [*Annotation*],
+  ),
+  [Taping], [Roboflow: drywall-join-detect], [1,186], [Bounding boxes],
+  [Cracks], [Roboflow: cracks-3ii36], [5,369], [Segmentation polygons],
+)
+== Data Pipeline
+#figure(
+  image("diagrams/pipeline.png", width: 65%),
+  caption: [Data preparation pipeline from download to train/val/test splits],
+)
+- *Taping dataset*: Bounding box annotations converted to filled-rectangle binary masks
+- *Cracks dataset*: COCO polygon annotations rendered to pixel-accurate binary masks via `pycocotools`
+- Prompt augmentation: 5 synonyms per class, randomly sampled during training
+== Split Counts
+#table(
+  columns: (1fr, 1fr, 1fr, 1fr),
+  align: (left, center, center, center),
+  table.header(
+    [*Split*], [*Train*], [*Validation*], [*Test*],
+  ),
+  [Count], [4,588], [982], [985],
+  [Ratio], [70%], [15%], [15%],
+)
+Stratified by dataset class (taping vs. cracks), seed = 42.
+= Training
+== Pipeline
+#figure(
+  image("diagrams/Training Pipeline.png", width: 40%),
+  caption: [Training loop with early stopping],
+)
+== Hyperparameters
+#table(
+  columns: (1fr, 1fr),
+  align: (left, left),
+  table.header(
+    [*Parameter*], [*Value*],
+  ),
+  [Model], [`CIDAS/clipseg-rd64-refined`],
+  [Trainable params], [1,127,009 (decoder only)],
+  [Frozen params], [149,620,737 (CLIP backbone)],
+  [Optimizer], [AdamW (lr=1e-4, weight_decay=1e-4)],
+  [Scheduler], [CosineAnnealingLR],
+  [Loss], [0.5 × BCE + 0.5 × Dice],
+  [Batch size], [8],
+  [Max epochs], [30],
+  [Early stopping], [patience = 7 on val mIoU],
+  [Seed], [42],
+)
+== Training Results
+Training ran for 18 epochs before early stopping triggered (patience=7). Best validation mIoU achieved at epoch 11.
+#table(
+  columns: (1fr, 1fr, 1fr, 1fr, 1fr),
+  align: (center, center, center, center, center),
+  table.header(
+    [*Epoch*], [*Train Loss*], [*Val Loss*], [*Val mIoU*], [*Val Dice*],
+  ),
+  [1], [0.5512], [0.5339], [0.1186], [0.1895],
+  [4], [0.5196], [0.5213], [0.1539], [0.2312],
+  [8], [0.5113], [0.5135], [0.1543], [0.2300],
+  [*11*], [*0.5085*], [*0.5117*], [*0.1605*], [*0.2370*],
+  [14], [0.5056], [0.5077], [0.1501], [0.2237],
+  [18], [0.5033], [0.5068], [0.1531], [0.2273],
+)
+The model showed steady improvement in the first 11 epochs, with diminishing returns and eventual plateau thereafter. The loss landscape appears relatively flat in this region, which is expected given the frozen backbone and small decoder.
+= Evaluation Results
+== Metrics
+#table(
+  columns: (1fr, 1fr, 1fr, 1fr),
+  align: (left, center, center, center),
+  table.header(
+    [*Class*], [*mIoU*], [*Dice*], [*Test Samples*],
+  ),
+  [Taping], [0.1917], [0.2780], [179],
+  [Cracks], [0.1639], [0.2434], [806],
+  [*Overall*], [*0.1689*], [*0.2497*], [*985*],
+)
+Taping detection outperforms crack detection, likely because filled-rectangle masks provide a stronger supervision signal (larger contiguous regions) compared to thin crack annotations. The class imbalance in test samples (179 taping vs 806 cracks) reflects the original dataset sizes.
+== Visual Examples
+The best individual predictions reach IoU 0.78 for both cracks and taping, demonstrating that the model has learned meaningful text-conditioned segmentation despite the low aggregate scores. The gap between best-sample and mean performance is driven primarily by thin-crack samples where minor spatial offsets cause disproportionate IoU drops.
+#figure(
+  image("figures/best_predictions.png", width: 70%),
+  caption: [Best test-set predictions ranked by IoU: Original | Ground Truth | Model Prediction],
+)
+= Failure Cases & Potential Solutions
+== Worst Predictions
+The following examples show the model's worst test-set predictions (IoU near zero). These failures reveal systematic patterns that inform targeted improvements.
+#figure(
+  image("figures/failure_cases.png", width: 65%),
+  caption: [Failure cases — worst test-set predictions ranked by IoU (ascending)],
+)
+== Root Causes
+- *Taping annotations are coarse*: The source dataset provides bounding boxes, not pixel-level masks. Filled rectangles include substantial background, teaching the model to predict overly large regions.
+- *Cracks are thin structures*: Even small positional errors in crack predictions cause significant IoU drops. A 1-pixel-wide crack shifted by 2 pixels yields near-zero IoU despite visual similarity.
+- *Resolution bottleneck*: CLIPSeg operates at 352×352 fixed resolution. Fine crack details are lost during downscaling, particularly for high-resolution input images.
+- *Decoder capacity*: With only 1.13M trainable parameters, the decoder has limited capacity to learn domain-specific features for construction imagery.
+- *Domain gap*: The pretrained CLIP backbone was trained on internet images, not construction-specific content. The frozen backbone cannot adapt its feature extraction to this domain.
+== Potential Solutions
+#table(
+  columns: (1fr, 2fr),
+  align: (left, left),
+  table.header(
+    [*Limitation*], [*Proposed Solution*],
+  ),
+  [Coarse taping masks], [Use SAM or SAM2 to generate pixel-accurate masks from bounding boxes instead of filled rectangles],
+  [Frozen backbone domain gap], [Unfreeze the last 2--3 ViT blocks with a 10× lower learning rate for domain adaptation],
+  [352×352 resolution ceiling], [Switch to a higher-resolution architecture (e.g. SAM2 with text-prompt conditioning)],
+  [Small decoder], [Add more decoder blocks or increase hidden dimension while monitoring overfitting],
+  [Thin-crack IoU sensitivity], [Use boundary-aware metrics (e.g. boundary IoU) or distance-tolerant evaluation],
+)
+= Runtime & Footprint
+#table(
+  columns: (1fr, 1fr),
+  align: (left, left),
+  table.header(
+    [*Metric*], [*Value*],
+  ),
+  [Training time], [97.2 minutes (18 epochs)],
+  [Training device], [Apple M4 (MPS)],
+  [Training speed], [\~2.1 iterations/second],
+  [Avg inference time], [58.7 ms/image],
+  [Model size (full)], [575.1 MB],
+  [Trainable parameters], [1.13M (decoder)],
+  [Total parameters], [150.7M],
+)
+= Audit Log
+#table(
+  columns: (1fr, 2fr),
+  align: (left, left),
+  table.header(
+    [*Step*], [*Details*],
+  ),
+  [Environment], [Python 3.11, PyTorch 2.11, transformers 5.5.3, uv],
+  [Datasets], [Roboflow Universe: drywall-join-detect (v1), cracks-3ii36 (raw)],
+  [Annotation handling], [Taping: bbox→rectangle masks; Cracks: polygon→binary masks],
+  [Model], [CLIPSeg (CIDAS/clipseg-rd64-refined), decoder-only fine-tuning],
+  [Loss], [BCEDiceLoss (0.5/0.5)],
+  [Device], [Apple M4, MPS backend],
+  [Diagrams], [diagrams (Python), d2, PlantUML],
+  [Report], [Typst],
+  [Seeds], [42 (data splits, torch, numpy)],
+)