youngPhilosopher committed on
Commit
451837b
·
verified ·
1 Parent(s): b891e61

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ reports/diagrams/architecture.png filter=lfs diff=lfs merge=lfs -text
37
+ reports/diagrams/pipeline.png filter=lfs diff=lfs merge=lfs -text
38
+ reports/diagrams/repo_structure.png filter=lfs diff=lfs merge=lfs -text
39
+ reports/figures/best_predictions.png filter=lfs diff=lfs merge=lfs -text
40
+ reports/figures/failure_cases.png filter=lfs diff=lfs merge=lfs -text
41
+ reports/figures/readme_showcase.png filter=lfs diff=lfs merge=lfs -text
42
+ reports/figures/visual_comparison.png filter=lfs diff=lfs merge=lfs -text
43
+ reports/report.pdf filter=lfs diff=lfs merge=lfs -text
reports/diagrams/Training Pipeline.png ADDED
reports/diagrams/architecture.png ADDED

Git LFS Details

  • SHA256: 4867045edbcc01f6c41f2d48ac81f8858dab680c3ba67603d2633764aa7f0ea1
  • Pointer size: 131 Bytes
  • Size of remote file: 189 kB
reports/diagrams/architecture.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Generate CLIPSeg architecture diagram using the diagrams library."""
2
+
3
+ from pathlib import Path
4
+ from diagrams import Diagram, Cluster, Edge
5
+ from diagrams.custom import Custom
6
+ from diagrams.generic.blank import Blank
7
+ from diagrams.programming.framework import React # placeholder icons
8
+ from diagrams.generic.compute import Rack
9
+ from diagrams.generic.storage import Storage
10
+ from diagrams.generic.database import SQL
11
+
12
+ OUTPUT_DIR = Path(__file__).resolve().parent
13
+
14
+ # Use generic shapes since diagrams lib is infrastructure-focused
15
+ # We'll create a clean conceptual diagram
16
+
17
+ with Diagram(
18
+ "CLIPSeg Architecture for Drywall QA",
19
+ filename=str(OUTPUT_DIR / "architecture"),
20
+ show=False,
21
+ direction="LR",
22
+ graph_attr={"fontsize": "16", "bgcolor": "white", "pad": "0.5"},
23
+ ):
24
+
25
+ with Cluster("Input"):
26
+ image_input = Rack("Image\n(RGB)")
27
+ text_input = Storage('Text Prompt\n"segment crack"')
28
+
29
+ with Cluster("CLIP Backbone (Frozen — 149.6M params)", graph_attr={"style": "dashed", "color": "#1565C0"}):
30
+ with Cluster("Vision Encoder"):
31
+ vis_enc = Rack("ViT-B/16\nPatch Embeddings\n+ Transformer")
32
+
33
+ with Cluster("Text Encoder"):
34
+ txt_enc = Storage("Token Embeddings\n+ Transformer")
35
+
36
+ with Cluster("Decoder (Trainable — 1.13M params)", graph_attr={"style": "filled", "color": "#C8E6C9"}):
37
+ decoder = SQL("3× Transformer\nDecoder Blocks\n(U-Net skip)")
38
+
39
+ with Cluster("Output"):
40
+ logits = Rack("Logits\n352×352")
41
+ mask = Storage("Binary Mask\n{0, 255}")
42
+
43
+ image_input >> Edge(label="pixel_values") >> vis_enc
44
+ text_input >> Edge(label="input_ids") >> txt_enc
45
+
46
+ vis_enc >> Edge(label="visual features") >> decoder
47
+ txt_enc >> Edge(label="text conditioning") >> decoder
48
+
49
+ decoder >> Edge(label="sigmoid + threshold") >> logits
50
+ logits >> mask
51
+
52
+
53
+ if __name__ == "__main__":
54
+ print(f"Architecture diagram saved to {OUTPUT_DIR / 'architecture.png'}")
reports/diagrams/pipeline.d2 ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ title: |md
2
+ # Data Pipeline
3
+ |
4
+
5
+ roboflow: Roboflow Universe {
6
+ shape: cloud
7
+ d1: Dataset 1\nDrywall-Join-Detect\n(1,186 images)
8
+ d2: Dataset 2\nCracks\n(5,369 images)
9
+ }
10
+
11
+ download: Download\n(COCO format) {
12
+ shape: step
13
+ }
14
+
15
+ inspect: Inspect\nAnnotations {
16
+ shape: diamond
17
+ }
18
+
19
+ bbox_mask: BBox → Filled\nRectangle Masks {
20
+ shape: step
21
+ style.fill: "#FFE0B2"
22
+ }
23
+
24
+ poly_mask: Polygon →\nBinary Masks {
25
+ shape: step
26
+ style.fill: "#C8E6C9"
27
+ }
28
+
29
+ unified: |md
30
+ Unified Manifest
31
+ (image, mask, prompts)
32
+ | {
33
+ shape: document
34
+ }
35
+
36
+ split: Stratified Split\n70/15/15 {
37
+ shape: step
38
+ }
39
+
40
+ train_set: Train\n4,588 {
41
+ shape: cylinder
42
+ style.fill: "#BBDEFB"
43
+ }
44
+
45
+ val_set: Val\n982 {
46
+ shape: cylinder
47
+ style.fill: "#BBDEFB"
48
+ }
49
+
50
+ test_set: Test\n985 {
51
+ shape: cylinder
52
+ style.fill: "#BBDEFB"
53
+ }
54
+
55
+ roboflow.d1 -> download
56
+ roboflow.d2 -> download
57
+ download -> inspect
58
+
59
+ inspect -> bbox_mask: bbox only\n(taping)
60
+ inspect -> poly_mask: segmentation\n(cracks)
61
+
62
+ bbox_mask -> unified
63
+ poly_mask -> unified
64
+
65
+ unified -> split
66
+
67
+ split -> train_set
68
+ split -> val_set
69
+ split -> test_set
reports/diagrams/pipeline.png ADDED

Git LFS Details

  • SHA256: bb77ea5a3c6980bf0f389ec2f4d49331ac6126f98307021ad2531aa002b1b1c9
  • Pointer size: 131 Bytes
  • Size of remote file: 311 kB
reports/diagrams/repo_structure.d2 ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ direction: right
2
+
3
+ configs: "configs/" {
4
+ shape: rectangle
5
+ style.fill: "#FFF3E0"
6
+ style.stroke: "#FB8C00"
7
+ style.border-radius: 6
8
+ style.font-size: 16
9
+ style.bold: true
10
+
11
+ yaml: "train_config.yaml\nAll hyperparameters" {
12
+ shape: page
13
+ style.fill: "#FFE0B2"
14
+ }
15
+ }
16
+
17
+ src: "src/" {
18
+ shape: rectangle
19
+ style.fill: "#E3F2FD"
20
+ style.stroke: "#1E88E5"
21
+ style.border-radius: 6
22
+ style.font-size: 16
23
+ style.bold: true
24
+
25
+ data: "data/" {
26
+ style.fill: "#BBDEFB"
27
+ style.border-radius: 4
28
+
29
+ p1: "preprocess.py\nMask rendering & splits" { shape: page; style.fill: "#E3F2FD" }
30
+ p2: "dataset.py\nPyTorch Dataset" { shape: page; style.fill: "#E3F2FD" }
31
+ }
32
+
33
+ model: "model/" {
34
+ style.fill: "#BBDEFB"
35
+ style.border-radius: 4
36
+
37
+ m1: "clipseg_wrapper.py\nModel loading & freezing" { shape: page; style.fill: "#E3F2FD" }
38
+ m2: "losses.py\nBCEDiceLoss" { shape: page; style.fill: "#E3F2FD" }
39
+ }
40
+
41
+ scripts: "Pipeline Scripts" {
42
+ style.fill: "#BBDEFB"
43
+ style.border-radius: 4
44
+
45
+ s1: "train.py" { shape: page; style.fill: "#E3F2FD" }
46
+ s2: "evaluate.py" { shape: page; style.fill: "#E3F2FD" }
47
+ s3: "predict.py" { shape: page; style.fill: "#E3F2FD" }
48
+ }
49
+ }
50
+
51
+ reports: "reports/" {
52
+ shape: rectangle
53
+ style.fill: "#E8F5E9"
54
+ style.stroke: "#43A047"
55
+ style.border-radius: 6
56
+ style.font-size: 16
57
+ style.bold: true
58
+
59
+ diag: "diagrams/" {
60
+ style.fill: "#C8E6C9"
61
+ style.border-radius: 4
62
+
63
+ d1: "architecture.py" { shape: page; style.fill: "#E8F5E9" }
64
+ d2_file: "pipeline.d2" { shape: page; style.fill: "#E8F5E9" }
65
+ d3: "training.puml" { shape: page; style.fill: "#E8F5E9" }
66
+ }
67
+
68
+ fig: "figures/" {
69
+ style.fill: "#C8E6C9"
70
+ style.border-radius: 4
71
+ }
72
+
73
+ pdf: "report.typ → PDF" {
74
+ shape: page
75
+ style.fill: "#A5D6A7"
76
+ style.font-size: 15
77
+ style.bold: true
78
+ }
79
+ }
80
+
81
+ outputs: "outputs/" {
82
+ shape: rectangle
83
+ style.fill: "#F3E5F5"
84
+ style.stroke: "#8E24AA"
85
+ style.border-radius: 6
86
+ style.font-size: 16
87
+ style.bold: true
88
+
89
+ ckpt: "checkpoints/\nbest_model.pt (575 MB)" {
90
+ shape: cylinder
91
+ style.fill: "#CE93D8"
92
+ }
93
+
94
+ masks: "masks/\n985 prediction PNGs" {
95
+ shape: stored_data
96
+ style.fill: "#CE93D8"
97
+ }
98
+
99
+ logs: "logs/\ntraining & test JSON" {
100
+ style.fill: "#E1BEE7"
101
+ style.border-radius: 4
102
+ }
103
+ }
104
+
105
+ data: "data/" {
106
+ shape: rectangle
107
+ style.fill: "#FFF9C4"
108
+ style.stroke: "#FDD835"
109
+ style.border-radius: 6
110
+ style.font-size: 16
111
+ style.bold: true
112
+
113
+ raw: "raw/\nCOCO datasets" {
114
+ style.fill: "#FFF59D"
115
+ style.border-radius: 4
116
+ }
117
+ processed: "processed/\nBinary masks" {
118
+ style.fill: "#FFF59D"
119
+ style.border-radius: 4
120
+ }
121
+ splits: "splits/\ntrain.json · val.json · test.json" {
122
+ style.fill: "#FFF59D"
123
+ style.border-radius: 4
124
+ }
125
+ }
126
+
127
+ # Flow connections
128
+ configs -> src: "configure" { style.stroke-dash: 5; style.stroke: "#9E9E9E" }
129
+ src -> outputs: "train & evaluate" { style.stroke: "#1E88E5" }
130
+ src -> reports: "generate" { style.stroke: "#43A047" }
131
+ data -> src: "feed" { style.stroke: "#FDD835" }
reports/diagrams/repo_structure.png ADDED

Git LFS Details

  • SHA256: 97279f1b9376b113362d22eb7ca2c8603c67443da429350079040b460ba37a04
  • Pointer size: 131 Bytes
  • Size of remote file: 588 kB
reports/diagrams/training.puml ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ @startuml Training Pipeline
2
+ !theme plain
3
+ skinparam backgroundColor #FFFFFF
4
+ skinparam activityBackgroundColor #E3F2FD
5
+ skinparam activityBorderColor #1565C0
6
+
7
+ title CLIPSeg Fine-Tuning Pipeline
8
+
9
+ start
10
+
11
+ :Load pretrained CLIPSeg
12
+ (CIDAS/clipseg-rd64-refined);
13
+
14
+ :Freeze CLIP backbone
15
+ (149.6M params frozen);
16
+
17
+ :Initialize decoder training
18
+ (1.13M params trainable);
19
+
20
+ :Configure optimizer
21
+ AdamW (lr=1e-4, wd=1e-4)
22
+ CosineAnnealingLR scheduler;
23
+
24
+ repeat
25
+ :Forward pass
26
+ image + text prompt → logits (352×352);
27
+
28
+ :Compute BCEDiceLoss
29
+ 0.5×BCE + 0.5×Dice;
30
+
31
+ :Backward pass + optimizer step;
32
+
33
+ if (End of epoch?) then (yes)
34
+ :Validate on val set
35
+ Compute mIoU & Dice;
36
+
37
+ if (val mIoU improved?) then (yes)
38
+ :Save best checkpoint;
39
+ :Reset patience counter;
40
+ else (no)
41
+ :Increment patience counter;
42
+ endif
43
+
44
+ if (patience >= 7?) then (yes)
45
+ :Early stopping;
46
+ break
47
+ endif
48
+ else (no)
49
+ endif
50
+
51
+ repeat while (epochs < 30)
52
+
53
+ :Load best checkpoint;
54
+
55
+ :Evaluate on test set;
56
+
57
+ :Generate prediction masks
58
+ (threshold=0.5, resize to original);
59
+
60
+ :Save metrics & visual comparisons;
61
+
62
+ stop
63
+
64
+ @enduml
reports/figures/best_predictions.png ADDED

Git LFS Details

  • SHA256: 830a35e88e68175e2d9799e3628783c1e8771ea23f98dac69335041c7d5a3a3e
  • Pointer size: 132 Bytes
  • Size of remote file: 1.61 MB
reports/figures/failure_cases.png ADDED

Git LFS Details

  • SHA256: 721c8bef97164440065e3b1417f0ac62c2ed1cc116878bf377ad97ed6376c4a8
  • Pointer size: 132 Bytes
  • Size of remote file: 1.9 MB
reports/figures/readme_showcase.png ADDED

Git LFS Details

  • SHA256: 2fb9b5f5b350964e2391f03066987262d6455bcd6a6fadea210a9b24395f2ada
  • Pointer size: 132 Bytes
  • Size of remote file: 1.45 MB
reports/figures/visual_comparison.png ADDED

Git LFS Details

  • SHA256: 5381e3793a0299aa1ed357d2d12381128f6e1adabfbf20b75bf1b64f519c6107
  • Pointer size: 132 Bytes
  • Size of remote file: 1.24 MB
reports/report.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:048c3df1a28bd8b633df91f1fd161389d9dca37582be2cc3a8b6aa938fddec00
3
+ size 4900758
reports/report.typ ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #set document(title: "Prompted Segmentation for Drywall QA", author: "Karthik M Dani")
2
+ #set page(paper: "a4", margin: (x: 1cm, y: 1cm), numbering: "1")
3
+ #set text(font: "New Computer Modern", size: 11pt)
4
+ #set heading(numbering: "1.0")
5
+ #set par(justify: true)
6
+
7
+ // Title page
8
+ #align(center)[
9
+ #v(3cm)
10
+ #text(size: 24pt, weight: "bold")[Prompted Segmentation for Drywall QA]
11
+ #v(1cm)
12
+ #text(size: 14pt)[Text-Conditioned Binary Mask Prediction]
13
+ #v(0.5cm)
14
+ #text(size: 12pt, fill: gray)[CLIPSeg Fine-Tuning on Construction Datasets]
15
+ #v(2cm)
16
+ #text(size: 12pt)[Karthik M Dani]
17
+ #v(0.3cm)
18
+ #text(size: 11pt, fill: gray)[April 2026]
19
+ #v(3cm)
20
+ ]
21
+
22
+ #pagebreak()
23
+
24
+ // Table of contents
25
+ #outline(indent: 1.5em)
26
+ #pagebreak()
27
+
28
+ = Goal Summary
29
+
30
+ Given an image and a natural-language prompt, produce a binary segmentation mask for:
31
+ - *"segment crack"* — identifying wall cracks
32
+ - *"segment taping area"* — identifying drywall joint/tape regions
33
+
34
+ The model must generalize across varied scenes and respond to text prompts at inference time, enabling flexible QA workflows.
35
+
36
+ = Approach
37
+
38
+ == Why CLIPSeg?
39
+
40
+ We evaluated four text-conditioned segmentation architectures:
41
+
42
+ #table(
43
+ columns: (1fr, 1fr, 1fr, 1fr, 1fr),
44
+ align: (left, center, center, center, center),
45
+ table.header(
46
+ [*Model*], [*Text Input*], [*Small Data*], [*Consumer GPU*], [*HF Support*],
47
+ ),
48
+ [CLIPSeg], [Direct], [Excellent], [Yes], [Native],
49
+ [Grounded SAM], [Via detector], [Moderate], [Decoder only], [Native],
50
+ [SEEM], [Multi-modal], [Difficult], [No], [GitHub],
51
+ [X-Decoder], [Yes], [Not ideal], [No], [Limited],
52
+ )
53
+
54
+ *CLIPSeg* was selected because:
55
+ + Direct text-to-mask conditioning (no bounding box intermediate)
56
+ + Only 1.13M trainable decoder parameters on frozen 149.6M CLIP backbone
57
+ + Proven fine-tuning on small datasets (under 1,000 images)
58
+ + Native HuggingFace `transformers` support
59
+
60
+ == Architecture
61
+
62
+ #figure(
63
+ image("diagrams/architecture.png", width: 90%),
64
+ caption: [CLIPSeg architecture: frozen CLIP backbone with trainable decoder],
65
+ )
66
+
67
+ The model takes an RGB image and a text prompt. The CLIP vision encoder (ViT-B/16) and text encoder produce embeddings. A lightweight 3-block transformer decoder with U-Net skip connections generates logits at 352×352, which are passed through a sigmoid and thresholded at 0.5 to produce binary masks.
68
+
69
+ = Data
70
+
71
+ == Sources
72
+
73
+ #table(
74
+ columns: (1fr, 2fr, 1fr, 1fr),
75
+ align: (left, left, center, center),
76
+ table.header(
77
+ [*Dataset*], [*Source*], [*Images*], [*Annotation*],
78
+ ),
79
+ [Taping], [Roboflow: drywall-join-detect], [1,186], [Bounding boxes],
80
+ [Cracks], [Roboflow: cracks-3ii36], [5,369], [Segmentation polygons],
81
+ )
82
+
83
+ == Data Pipeline
84
+
85
+ #figure(
86
+ image("diagrams/pipeline.png", width: 65%),
87
+ caption: [Data preparation pipeline from download to train/val/test splits],
88
+ )
89
+
90
+ - *Taping dataset*: Bounding box annotations converted to filled-rectangle binary masks
91
+ - *Cracks dataset*: COCO polygon annotations rendered to pixel-accurate binary masks via `pycocotools`
92
+ - Prompt augmentation: 5 synonyms per class, randomly sampled during training
93
+
94
+ == Split Counts
95
+
96
+ #table(
97
+ columns: (1fr, 1fr, 1fr, 1fr),
98
+ align: (left, center, center, center),
99
+ table.header(
100
+ [*Split*], [*Train*], [*Validation*], [*Test*],
101
+ ),
102
+ [Count], [4,588], [982], [985],
103
+ [Ratio], [70%], [15%], [15%],
104
+ )
105
+
106
+ The split is stratified by dataset class (taping vs. cracks) and uses a fixed random seed of 42.
107
+
108
+ = Training
109
+
110
+ == Pipeline
111
+
112
+ #figure(
113
+ image("diagrams/Training Pipeline.png", width: 40%),
114
+ caption: [Training loop with early stopping],
115
+ )
116
+
117
+ == Hyperparameters
118
+
119
+ #table(
120
+ columns: (1fr, 1fr),
121
+ align: (left, left),
122
+ table.header(
123
+ [*Parameter*], [*Value*],
124
+ ),
125
+ [Model], [`CIDAS/clipseg-rd64-refined`],
126
+ [Trainable params], [1,127,009 (decoder only)],
127
+ [Frozen params], [149,620,737 (CLIP backbone)],
128
+ [Optimizer], [AdamW (lr=1e-4, weight_decay=1e-4)],
129
+ [Scheduler], [CosineAnnealingLR],
130
+ [Loss], [0.5 × BCE + 0.5 × Dice],
131
+ [Batch size], [8],
132
+ [Max epochs], [30],
133
+ [Early stopping], [patience = 7 on val mIoU],
134
+ [Seed], [42],
135
+ )
136
+
137
+ == Training Results
138
+
139
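+ Training ran for 18 epochs before early stopping triggered (patience = 7). The best validation mIoU (0.1605) was achieved at epoch 11; the following seven epochs brought no improvement, which exhausted the patience budget.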
140
+
141
+ #table(
142
+ columns: (1fr, 1fr, 1fr, 1fr, 1fr),
143
+ align: (center, center, center, center, center),
144
+ table.header(
145
+ [*Epoch*], [*Train Loss*], [*Val Loss*], [*Val mIoU*], [*Val Dice*],
146
+ ),
147
+ [1], [0.5512], [0.5339], [0.1186], [0.1895],
148
+ [4], [0.5196], [0.5213], [0.1539], [0.2312],
149
+ [8], [0.5113], [0.5135], [0.1543], [0.2300],
150
+ [*11*], [*0.5085*], [*0.5117*], [*0.1605*], [*0.2370*],
151
+ [14], [0.5056], [0.5077], [0.1501], [0.2237],
152
+ [18], [0.5033], [0.5068], [0.1531], [0.2273],
153
+ )
154
+
155
+ Validation mIoU improved through epoch 11, with most of the gain arriving in the first four epochs, and then fluctuated slightly below its peak even though the training and validation losses continued to decrease marginally. The loss landscape appears relatively flat in this region, which is expected given the frozen backbone and small decoder.
156
+
157
+ = Evaluation Results
158
+
159
+ == Metrics
160
+
161
+ #table(
162
+ columns: (1fr, 1fr, 1fr, 1fr),
163
+ align: (left, center, center, center),
164
+ table.header(
165
+ [*Class*], [*mIoU*], [*Dice*], [*Test Samples*],
166
+ ),
167
+ [Taping], [0.1917], [0.2780], [179],
168
+ [Cracks], [0.1639], [0.2434], [806],
169
+ [*Overall*], [*0.1689*], [*0.2497*], [*985*],
170
+ )
171
+
172
+ Taping detection outperforms crack detection, likely because filled-rectangle masks provide a stronger supervision signal (larger contiguous regions) compared to thin crack annotations. The class imbalance in test samples (179 taping vs 806 cracks) reflects the original dataset sizes.
173
+
174
+ == Visual Examples
175
+
176
+ The best individual predictions reach an IoU of 0.78 for both cracks and taping, demonstrating that the model has learned meaningful text-conditioned segmentation despite the low aggregate scores. The gap between best-sample and mean performance is driven primarily by thin-crack samples, where minor spatial offsets cause disproportionate IoU drops.
177
+
178
+ #figure(
179
+ image("figures/best_predictions.png", width: 70%),
180
+ caption: [Best test-set predictions ranked by IoU: Original | Ground Truth | Model Prediction],
181
+ )
182
+
183
+ = Failure Cases & Potential Solutions
184
+
185
+ == Worst Predictions
186
+
187
+ The following examples show the model's worst test-set predictions (IoU near zero). These failures reveal systematic patterns that inform targeted improvements.
188
+
189
+ #figure(
190
+ image("figures/failure_cases.png", width: 65%),
191
+ caption: [Failure cases — worst test-set predictions ranked by IoU (ascending)],
192
+ )
193
+
194
+ == Root Causes
195
+
196
+ - *Taping annotations are coarse*: The source dataset provides bounding boxes, not pixel-level masks. Filled rectangles include substantial background, teaching the model to predict overly large regions.
197
+ - *Cracks are thin structures*: Even small positional errors in crack predictions cause significant IoU drops. A 1-pixel-wide crack shifted by 2 pixels yields near-zero IoU despite visual similarity.
198
+ - *Resolution bottleneck*: CLIPSeg operates at 352×352 fixed resolution. Fine crack details are lost during downscaling, particularly for high-resolution input images.
199
+ - *Decoder capacity*: With only 1.13M trainable parameters, the decoder has limited capacity to learn domain-specific features for construction imagery.
200
+ - *Domain gap*: The pretrained CLIP backbone was trained on internet images, not construction-specific content. The frozen backbone cannot adapt its feature extraction to this domain.
201
+
202
+ == Potential Solutions
203
+
204
+ #table(
205
+ columns: (1fr, 2fr),
206
+ align: (left, left),
207
+ table.header(
208
+ [*Limitation*], [*Proposed Solution*],
209
+ ),
210
+ [Coarse taping masks], [Use SAM or SAM2 to generate pixel-accurate masks from bounding boxes instead of filled rectangles],
211
+ [Frozen backbone domain gap], [Unfreeze the last 2--3 ViT blocks with a 10× lower learning rate for domain adaptation],
212
+ [352×352 resolution ceiling], [Switch to a higher-resolution architecture (e.g. SAM2 with text-prompt conditioning)],
213
+ [Small decoder], [Add more decoder blocks or increase hidden dimension while monitoring overfitting],
214
+ [Thin-crack IoU sensitivity], [Use boundary-aware metrics (e.g. boundary IoU) or distance-tolerant evaluation],
215
+ )
216
+
217
+ = Runtime & Footprint
218
+
219
+ #table(
220
+ columns: (1fr, 1fr),
221
+ align: (left, left),
222
+ table.header(
223
+ [*Metric*], [*Value*],
224
+ ),
225
+ [Training time], [97.2 minutes (18 epochs)],
226
+ [Training device], [Apple M4 (MPS)],
227
+ [Training speed], [~2.1 iterations/second],
228
+ [Avg inference time], [58.7 ms/image],
229
+ [Model size (full)], [575.1 MB],
230
+ [Trainable parameters], [1.13M (decoder)],
231
+ [Total parameters], [150.7M],
232
+ )
233
+
234
+ = Audit Log
235
+
236
+ #table(
237
+ columns: (1fr, 2fr),
238
+ align: (left, left),
239
+ table.header(
240
+ [*Step*], [*Details*],
241
+ ),
242
+ [Environment], [Python 3.11, PyTorch 2.11, transformers 5.5.3, uv],
243
+ [Datasets], [Roboflow Universe: drywall-join-detect (v1), cracks-3ii36 (raw)],
244
+ [Annotation handling], [Taping: bbox→rectangle masks; Cracks: polygon→binary masks],
245
+ [Model], [CLIPSeg (CIDAS/clipseg-rd64-refined), decoder-only fine-tuning],
246
+ [Loss], [BCEDiceLoss (0.5/0.5)],
247
+ [Device], [Apple M4, MPS backend],
248
+ [Diagrams], [diagrams (Python), d2, PlantUML],
249
+ [Report], [Typst],
250
+ [Seeds], [42 (data splits, torch, numpy)],
251
+ )