Upload fine-tuned CLIPSeg weights from best_model.pt
Browse files
- README.md +84 -0
- config.json +35 -0
- merges.txt +0 -0
- model.safetensors +3 -0
- preprocessor_config.json +23 -0
- special_tokens_map.json +24 -0
- tokenizer.json +0 -0
- tokenizer_config.json +31 -0
- vocab.json +0 -0
README.md
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language: en
|
| 3 |
+
license: mit
|
| 4 |
+
tags:
|
| 5 |
+
- clipseg
|
| 6 |
+
- image-segmentation
|
| 7 |
+
- text-conditioned-segmentation
|
| 8 |
+
- drywall
|
| 9 |
+
- quality-inspection
|
| 10 |
+
- pytorch
|
| 11 |
+
base_model: CIDAS/clipseg-rd64-refined
|
| 12 |
+
datasets:
|
| 13 |
+
- roboflow/drywall-join-detect
|
| 14 |
+
- roboflow/cracks-3ii36
|
| 15 |
+
metrics:
|
| 16 |
+
- iou
|
| 17 |
+
- dice
|
| 18 |
+
---
|
| 19 |
+
|
| 20 |
+
# CLIPSeg — Fine-tuned for Drywall QA
|
| 21 |
+
|
| 22 |
+
Fine-tuned version of [CIDAS/clipseg-rd64-refined](https://huggingface.co/CIDAS/clipseg-rd64-refined)
|
| 23 |
+
for text-conditioned binary segmentation of drywall defects.
|
| 24 |
+
|
| 25 |
+
## Supported Prompts
|
| 26 |
+
|
| 27 |
+
| Prompt | Target Region | Val mIoU | Val Dice |
|
| 28 |
+
|--------|--------------|----------|----------|
|
| 29 |
+
| `segment crack` | Wall cracks | **0.7352** | **0.8336** |
|
| 30 |
+
| `segment taping area` | Joint / tape seam | **0.4985** | **0.6256** |
|
| 31 |
+
|
| 32 |
+
## Training Details
|
| 33 |
+
|
| 34 |
+
| Setting | Value |
|
| 35 |
+
|---------|-------|
|
| 36 |
+
| Base model | `CIDAS/clipseg-rd64-refined` |
|
| 37 |
+
| Epochs | 20 |
|
| 38 |
+
| Batch size | 4 |
|
| 39 |
+
| Learning rate | 1e-4 (AdamW) |
|
| 40 |
+
| Scheduler | CosineAnnealingLR |
|
| 41 |
+
| Loss | 0.5 × BCE + 0.5 × Dice |
|
| 42 |
+
| Image size | 352 × 352 |
|
| 43 |
+
| Threshold | 0.5 |
|
| 44 |
+
| Seed | 42 |
|
| 45 |
+
| Hardware | Tesla T4 (Google Colab) |
|
| 46 |
+
| Train time | ~65.3 min |
|
| 47 |
+
| Avg inference | 13.0 ms / image |
|
| 48 |
+
|
| 49 |
+
## Datasets
|
| 50 |
+
|
| 51 |
+
- **Dataset 1 — Taping area:** [Drywall-Join-Detect](https://universe.roboflow.com/objectdetect-pu6rn/drywall-join-detect)
|
| 52 |
+
- **Dataset 2 — Cracks:** [Cracks](https://universe.roboflow.com/fyp-ny1jt/cracks-3ii36)
|
| 53 |
+
|
| 54 |
+
## Quick Usage
|
| 55 |
+
|
| 56 |
+
```python
|
| 57 |
+
import torch
|
| 58 |
+
from PIL import Image
|
| 59 |
+
from transformers import CLIPSegProcessor, CLIPSegForImageSegmentation
|
| 60 |
+
|
| 61 |
+
processor = CLIPSegProcessor.from_pretrained("S-4-G-4-R/clipseg-drywall-qa")
|
| 62 |
+
model = CLIPSegForImageSegmentation.from_pretrained("S-4-G-4-R/clipseg-drywall-qa")
|
| 63 |
+
model.eval()
|
| 64 |
+
|
| 65 |
+
image = Image.open("your_image.jpg").convert("RGB")
|
| 66 |
+
prompt = "segment crack" # or "segment taping area"
|
| 67 |
+
|
| 68 |
+
inputs = processor(
|
| 69 |
+
text=prompt, images=image,
|
| 70 |
+
return_tensors="pt", padding=True
|
| 71 |
+
)
|
| 72 |
+
|
| 73 |
+
with torch.no_grad():
|
| 74 |
+
logits = model(**inputs).logits
|
| 75 |
+
|
| 76 |
+
mask = (torch.sigmoid(logits[0]) > 0.5).numpy() # boolean H×W mask
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
+
## Test and Validation Results (best checkpoint — epoch 15)
|
| 80 |
+
|
| 81 |
+
| Metric | segment crack | segment taping area |
|
| 82 |
+
|--------|--------------|---------------------|
|
| 83 |
+
| mIoU | 0.6900 (test) / 0.7352 (val) | 0.4985 (val) |
|
| 84 |
+
| Dice | 0.7957 (test) / 0.8336 (val) | 0.6256 (val) |
|
config.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "CIDAS/clipseg-rd64-refined",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"CLIPSegForImageSegmentation"
|
| 5 |
+
],
|
| 6 |
+
"conditional_layer": 0,
|
| 7 |
+
"decoder_attention_dropout": 0.0,
|
| 8 |
+
"decoder_hidden_act": "quick_gelu",
|
| 9 |
+
"decoder_intermediate_size": 2048,
|
| 10 |
+
"decoder_num_attention_heads": 4,
|
| 11 |
+
"extract_layers": [
|
| 12 |
+
3,
|
| 13 |
+
6,
|
| 14 |
+
9
|
| 15 |
+
],
|
| 16 |
+
"initializer_factor": 1.0,
|
| 17 |
+
"logit_scale_init_value": 2.6592,
|
| 18 |
+
"model_type": "clipseg",
|
| 19 |
+
"projection_dim": 512,
|
| 20 |
+
"reduce_dim": 64,
|
| 21 |
+
"text_config": {
|
| 22 |
+
"bos_token_id": 0,
|
| 23 |
+
"dropout": 0.0,
|
| 24 |
+
"eos_token_id": 2,
|
| 25 |
+
"model_type": "clipseg_text_model"
|
| 26 |
+
},
|
| 27 |
+
"torch_dtype": "float32",
|
| 28 |
+
"transformers_version": "4.46.0",
|
| 29 |
+
"use_complex_transposed_convolution": true,
|
| 30 |
+
"vision_config": {
|
| 31 |
+
"dropout": 0.0,
|
| 32 |
+
"model_type": "clipseg_vision_model",
|
| 33 |
+
"patch_size": 16
|
| 34 |
+
}
|
| 35 |
+
}
|
merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:92019ed53bd73b145328a99a2ff12c0c6ae8f70174bac86774704183b0c05c68
|
| 3 |
+
size 603047096
|
preprocessor_config.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"do_normalize": true,
|
| 3 |
+
"do_rescale": true,
|
| 4 |
+
"do_resize": true,
|
| 5 |
+
"image_mean": [
|
| 6 |
+
0.485,
|
| 7 |
+
0.456,
|
| 8 |
+
0.406
|
| 9 |
+
],
|
| 10 |
+
"image_processor_type": "ViTImageProcessor",
|
| 11 |
+
"image_std": [
|
| 12 |
+
0.229,
|
| 13 |
+
0.224,
|
| 14 |
+
0.225
|
| 15 |
+
],
|
| 16 |
+
"processor_class": "CLIPSegProcessor",
|
| 17 |
+
"resample": 2,
|
| 18 |
+
"rescale_factor": 0.00392156862745098,
|
| 19 |
+
"size": {
|
| 20 |
+
"height": 352,
|
| 21 |
+
"width": 352
|
| 22 |
+
}
|
| 23 |
+
}
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<|startoftext|>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": true,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "<|endoftext|>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": true,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "<|endoftext|>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<|endoftext|>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": true,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"added_tokens_decoder": {
|
| 4 |
+
"49406": {
|
| 5 |
+
"content": "<|startoftext|>",
|
| 6 |
+
"lstrip": false,
|
| 7 |
+
"normalized": true,
|
| 8 |
+
"rstrip": false,
|
| 9 |
+
"single_word": false,
|
| 10 |
+
"special": true
|
| 11 |
+
},
|
| 12 |
+
"49407": {
|
| 13 |
+
"content": "<|endoftext|>",
|
| 14 |
+
"lstrip": false,
|
| 15 |
+
"normalized": true,
|
| 16 |
+
"rstrip": false,
|
| 17 |
+
"single_word": false,
|
| 18 |
+
"special": true
|
| 19 |
+
}
|
| 20 |
+
},
|
| 21 |
+
"bos_token": "<|startoftext|>",
|
| 22 |
+
"clean_up_tokenization_spaces": false,
|
| 23 |
+
"do_lower_case": true,
|
| 24 |
+
"eos_token": "<|endoftext|>",
|
| 25 |
+
"errors": "replace",
|
| 26 |
+
"model_max_length": 77,
|
| 27 |
+
"pad_token": "<|endoftext|>",
|
| 28 |
+
"processor_class": "CLIPSegProcessor",
|
| 29 |
+
"tokenizer_class": "CLIPTokenizer",
|
| 30 |
+
"unk_token": "<|endoftext|>"
|
| 31 |
+
}
|
vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|