Upload folder using huggingface_hub
Browse files- .gitattributes +0 -34
- .gitignore +28 -0
- README(zh-tw).md +176 -0
- README.md +176 -3
- dinov3_vitsplus_efficient/README(zh-tw).md +111 -0
- dinov3_vitsplus_efficient/README.md +111 -0
- dinov3_vitsplus_efficient/best_model_20260306_233824.pt +3 -0
- dinov3_vitsplus_efficient/dinov3_vitsplus_tune_02_p2lr5_4ph.yaml +75 -0
- dinov3_vitsplus_efficient/final_model_20260306_233824.pt +3 -0
- e3_01b_dinov2_vitb_best/README(zh-tw).md +104 -0
- e3_01b_dinov2_vitb_best/README.md +104 -0
- e3_01b_dinov2_vitb_best/best_model_20260308_110634.pt +3 -0
- e3_01b_dinov2_vitb_best/e3_01b_same_area_neg_075.yaml +74 -0
- e3_01b_dinov2_vitb_best/final_model_20260308_110634.pt +3 -0
- extract_features.py +486 -0
- legacy/dinov2_coral_best_model_20251015_165008.pt +3 -0
- legacy/dinov2_coral_best_model_20251016_133229.pt +3 -0
- pyproject.toml +19 -0
- uv.lock +0 -0
.gitattributes
CHANGED
|
@@ -1,35 +1 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
*.pt filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*.egg-info/
|
| 5 |
+
|
| 6 |
+
# Virtual environments
|
| 7 |
+
.venv/
|
| 8 |
+
|
| 9 |
+
# Benchmark data and scripts (not for HF)
|
| 10 |
+
2022sample/
|
| 11 |
+
2023sample/
|
| 12 |
+
benchmark.py
|
| 13 |
+
embed_config.py
|
| 14 |
+
results_*.json
|
| 15 |
+
|
| 16 |
+
# Extracted features
|
| 17 |
+
features/
|
| 18 |
+
*.h5
|
| 19 |
+
|
| 20 |
+
# IDE
|
| 21 |
+
.idea/
|
| 22 |
+
.vscode/
|
| 23 |
+
*.swp
|
| 24 |
+
*.swo
|
| 25 |
+
|
| 26 |
+
# OS
|
| 27 |
+
.DS_Store
|
| 28 |
+
Thumbs.db
|
README(zh-tw).md
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: gpl-3.0
|
| 3 |
+
tags:
|
| 4 |
+
- coral-reef
|
| 5 |
+
- re-identification
|
| 6 |
+
- metric-learning
|
| 7 |
+
- dinov2
|
| 8 |
+
- dinov3
|
| 9 |
+
- pytorch
|
| 10 |
+
datasets:
|
| 11 |
+
- custom
|
| 12 |
+
pipeline_tag: image-feature-extraction
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
# 珊瑚個體辨識模型
|
| 16 |
+
|
| 17 |
+
針對水下珊瑚個體跨年辨識的微調模型。
|
| 18 |
+
|
| 19 |
+
本專案包含兩個最佳模型與獨立推論腳本,無需依賴訓練程式碼(`coral_reid`)即可運作。
|
| 20 |
+
|
| 21 |
+
> 原始碼:[GitHub](https://github.com/YuC13600/coral_models)
|
| 22 |
+
|
| 23 |
+
## 最佳模型
|
| 24 |
+
|
| 25 |
+
### 最高精度 — E3-01b DINOv2 ViT-B/14
|
| 26 |
+
|
| 27 |
+
| | |
|
| 28 |
+
|---|---|
|
| 29 |
+
| **N-Benchmark Top-1** | **86.6%** (110/127) |
|
| 30 |
+
| Top-3 / Top-5 / Top-10 | 96.9% / 97.6% / 100.0% |
|
| 31 |
+
| 平均排名 / 最差排名 | 1.30 / 9 |
|
| 32 |
+
| Backbone | DINOv2 ViT-B/14 (86.6M 參數, timm 518×518) |
|
| 33 |
+
| 損失函數 | Triplet (margin=0.3) + Hard Mining |
|
| 34 |
+
| 取樣器 | AreaAwareSampler (area_ratio=0.75) |
|
| 35 |
+
| 訓練 | 4 階段漸進式解凍,56 epochs,約 7.2 小時 |
|
| 36 |
+
| 嵌入維度 | 1280-d,L2 正規化 |
|
| 37 |
+
| 檔案 | `e3_01b_dinov2_vitb_best/` |
|
| 38 |
+
|
| 39 |
+
### 最高效率 — DINOv3 ViT-S+/16
|
| 40 |
+
|
| 41 |
+
| | |
|
| 42 |
+
|---|---|
|
| 43 |
+
| **N-Benchmark Top-1** | **81.1%** (103/127) |
|
| 44 |
+
| Top-3 / Top-5 / Top-10 | 92.1% / 95.3% / 99.2% |
|
| 45 |
+
| 平均排名 | 1.61 |
|
| 46 |
+
| Backbone | DINOv3 ViT-S+/16 (~22M 參數, timm 512×512) |
|
| 47 |
+
| 損失函數 | Triplet (margin=0.3) + Hard Mining |
|
| 48 |
+
| 取樣器 | MPerClassSampler (m=2) |
|
| 49 |
+
| 訓練 | 4 階段漸進式解凍,63 epochs,約 2.0 小時 |
|
| 50 |
+
| 嵌入維度 | 768-d,L2 正規化 |
|
| 51 |
+
| 檔案 | `dinov3_vitsplus_efficient/` |
|
| 52 |
+
|
| 53 |
+
### 模型比較
|
| 54 |
+
|
| 55 |
+
| 指標 | 最高精度 | 最高效率 | 差距 |
|
| 56 |
+
|------|---------|---------|------|
|
| 57 |
+
| Top-1 | 86.6% | 81.1% | -5.5% |
|
| 58 |
+
| 參數量 | ~86.6M | ~22M | **-75%** |
|
| 59 |
+
| 模型大小 | 339 MB | 112 MB | **-67%** |
|
| 60 |
+
| 訓練時間 | ~7.2h | ~2.0h | **-72%** |
|
| 61 |
+
| 推論 tokens | 1369 (patch14) | 1024 (patch16) | -25% |
|
| 62 |
+
|
| 63 |
+
## 快速開始
|
| 64 |
+
|
| 65 |
+
```bash
|
| 66 |
+
# 安裝依賴(獨立環境,不需要 coral_reid)
|
| 67 |
+
uv sync
|
| 68 |
+
|
| 69 |
+
# 提取單張圖片特徵
|
| 70 |
+
uv run python extract_features.py \
|
| 71 |
+
--model e3_01b_dinov2_vitb_best/best_model_20260308_110634.pt \
|
| 72 |
+
--input /path/to/image.jpg
|
| 73 |
+
|
| 74 |
+
# 提取整個目錄的特徵
|
| 75 |
+
uv run python extract_features.py \
|
| 76 |
+
--model e3_01b_dinov2_vitb_best/best_model_20260308_110634.pt \
|
| 77 |
+
--input /path/to/images/ \
|
| 78 |
+
--output features.h5
|
| 79 |
+
|
| 80 |
+
```
|
| 81 |
+
|
| 82 |
+
## N-Benchmark 各區域結果
|
| 83 |
+
|
| 84 |
+
### E3-01b DINOv2 ViT-B/14(最佳)
|
| 85 |
+
|
| 86 |
+
| 區域 | 查詢數 | Top-1 | Top-3 | Top-5 | 平均排名 |
|
| 87 |
+
|------|--------|-------|-------|-------|----------|
|
| 88 |
+
| 37 | 32 | 93.8% | 96.9% | 96.9% | 1.28 |
|
| 89 |
+
| 38 | 31 | 80.6% | 100.0% | 100.0% | 1.19 |
|
| 90 |
+
| 39 | 27 | 85.2% | 92.6% | 96.3% | 1.44 |
|
| 91 |
+
| 40 | 37 | 86.5% | 97.3% | 97.3% | 1.30 |
|
| 92 |
+
| **整體** | **127** | **86.6%** | **96.9%** | **97.6%** | **1.30** |
|
| 93 |
+
|
| 94 |
+
### DINOv3 ViT-S+/16(高效率)
|
| 95 |
+
|
| 96 |
+
| 區域 | 查詢數 | Top-1 | Top-3 | Top-5 | 平均排名 |
|
| 97 |
+
|------|--------|-------|-------|-------|----------|
|
| 98 |
+
| 37 | 32 | 81.2% | 93.8% | 96.9% | 1.56 |
|
| 99 |
+
| 38 | 31 | 77.4% | 90.3% | 93.5% | 1.90 |
|
| 100 |
+
| 39 | 27 | 85.2% | 92.6% | 96.3% | 1.37 |
|
| 101 |
+
| 40 | 37 | 81.1% | 91.9% | 94.6% | 1.57 |
|
| 102 |
+
| **整體** | **127** | **81.1%** | **92.1%** | **95.3%** | **1.61** |
|
| 103 |
+
|
| 104 |
+
## 完整模型歷史
|
| 105 |
+
|
| 106 |
+
### 模型比較表
|
| 107 |
+
|
| 108 |
+
| 模型名稱 | 架構 | Backbone | 損失函數 | 挖掘方式 | 同區域負樣本 | 圖片 | 測試準確率 | 測試損失 | 驗證損失 | N-Bench 平均 | A37 | A38 | A39 | A40 | 時間 |
|
| 109 |
+
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
|
| 110 |
+
| 預訓練 | - | DINOv2-B/14 | - | - | - | bbox | - | - | - | 29.48% | 28.12% | 35.48% | 29.63% | 24.32% | - |
|
| 111 |
+
| 20250812_152526 | 舊 | DINOv2-B/14 | Triplet | 預組三元組 | ❌ | bbox | 92.6% | 0.1659 | - | 48.25% | 50.00% | 51.61% | 48.15% | 43.24% | ~16h |
|
| 112 |
+
| 20251007_133126 | 舊 | DINOv2-B/14 | Triplet | 預組三元組 | ✅ | bbox | 88.8% | 0.2523 | - | 39.32% | 46.88% | 41.94% | 33.33% | 35.14% | ~16h |
|
| 113 |
+
| 20251008_094017 | 舊 | DINOv2-B/14 | Triplet | 預組三元組 | ✅ | bbox | 90.4% | 0.1636 | - | 40.19% | 37.50% | 48.39% | 37.04% | 37.84% | ~16h |
|
| 114 |
+
| 20251014_183603 | 舊 | DINOv2-B/14 | Triplet | 預組三元組 | ❌ | bbox | 92.8% | 0.1012 | - | 40.97% | 37.50% | 38.71% | 44.44% | 43.24% | ~16h |
|
| 115 |
+
| 預訓練 | - | DINOv2-B/14 | - | - | - | whole | - | - | - | 50.88% | 34.38% | 54.84% | 62.96% | 51.35% | - |
|
| 116 |
+
| 20251015_165008 | 舊 | DINOv2-B/14 | Triplet | 預組三元組 | ✅ | whole | 92.7% | 0.1330 | 0.1006 | 64.43% | 62.50% | 61.29% | 55.56% | 78.38% | ~16h |
|
| 117 |
+
| 20251016_133229 | 舊 | DINOv2-B/14 | Triplet | 預組三元組 | ❌ | whole | 97.9% | 0.0429 | - | 63.31% | 56.25% | 58.06% | 74.07% | 64.86% | ~16h |
|
| 118 |
+
| **20260308_110634** | **新** | **DINOv2-B/14 (timm 518)** | **Triplet** | **動態 (PML)** | **AreaAware 0.75** | whole | - | - | **0.1604** | **86.6%** | **93.8%** | **80.6%** | **85.2%** | **86.5%** | **~7.2h** |
|
| 119 |
+
| **20260306_233824** | **新** | **DINOv3-S+/16 (timm 512)** | **Triplet** | **動態 (PML)** | ❌ | whole | - | - | **0.1604** | **81.1%** | **81.2%** | **77.4%** | **85.2%** | **81.1%** | **~2.0h** |
|
| 120 |
+
|
| 121 |
+
### 欄位說明
|
| 122 |
+
|
| 123 |
+
| 欄位 | 說明 |
|
| 124 |
+
| --- | --- |
|
| 125 |
+
| 架構 | `舊` = 舊專案實作,`新` = 重構後的模組化架構 |
|
| 126 |
+
| Backbone | 特徵提取器(DINOv2-B/14、DINOv3-S+/16 等) |
|
| 127 |
+
| 損失函數 | 損失函數類型(Triplet、ArcFace、CosFace、Circle、Contrastive 等) |
|
| 128 |
+
| 挖掘方式 | 樣本挖掘策略:`預組三元組` = 固定三元組,`動態 (PML)` = MPerClassSampler |
|
| 129 |
+
| 同區域負樣本 | 是否限制負樣本來自同一地理區域(`AreaAware 0.75` = 75% 同區域) |
|
| 130 |
+
| 圖片 | `bbox` = EXIF 邊界框裁切,`whole` = 完整圖片 |
|
| 131 |
+
| 測試準確率 | 測試集準確率(僅舊架構,衡量 pos_dist < neg_dist) |
|
| 132 |
+
| 驗證損失 | 訓練期間最佳驗證損失 |
|
| 133 |
+
| N-Bench 平均 | N-Benchmark Top-1 準確率(區域 37-40 平均) |
|
| 134 |
+
|
| 135 |
+
### 架構差異
|
| 136 |
+
|
| 137 |
+
| 特性 | 舊架構 | 新架構 |
|
| 138 |
+
| --- | --- | --- |
|
| 139 |
+
| Dataset 輸出 | `(anchor, pos, neg)` - 3 張圖片 | `(image, label)` - 1 張圖片 |
|
| 140 |
+
| 三元組形成 | 訓練前預先組成 | 每批次動態挖掘 |
|
| 141 |
+
| 批次取樣器 | 隨機 | MPerClassSampler (m=2) |
|
| 142 |
+
| 損失函數 | 自訂 TripletLossWithMining | PML TripletMarginLoss |
|
| 143 |
+
| 每 Epoch 樣本數 | ~50,000 三元組 × 3 張圖片 | ~4,000 張圖片 |
|
| 144 |
+
| 訓練速度 | ~23 分鐘/epoch | ~1.5 分鐘/epoch |
|
| 145 |
+
| 同區域負樣本 | 已實作 | 已實作(AreaAwareSampler) |
|
| 146 |
+
|
| 147 |
+
> **N-Benchmark(最近鄰基準測試)**:在區域 37-40 中,跨 2022 與 2023 年比對珊瑚標本時,正確辨識的 Top-1 準確率。
|
| 148 |
+
|
| 149 |
+
## 專案結構
|
| 150 |
+
|
| 151 |
+
```
|
| 152 |
+
coral_models/
|
| 153 |
+
├── pyproject.toml # uv 環境(獨立)
|
| 154 |
+
├── extract_features.py # 特徵提取腳本
|
| 155 |
+
├── e3_01b_dinov2_vitb_best/ # 最高精度模型 (86.6%)
|
| 156 |
+
│ ├── best_model_20260308_110634.pt
|
| 157 |
+
│ ├── final_model_20260308_110634.pt
|
| 158 |
+
│ ├── e3_01b_same_area_neg_075.yaml
|
| 159 |
+
│ ├── README.md
|
| 160 |
+
│ └── README(zh-tw).md
|
| 161 |
+
├── dinov3_vitsplus_efficient/ # 最高效率模型 (81.1%)
|
| 162 |
+
│ ├── best_model_20260306_233824.pt
|
| 163 |
+
│ ├── final_model_20260306_233824.pt
|
| 164 |
+
│ ├── dinov3_vitsplus_tune_02_p2lr5_4ph.yaml
|
| 165 |
+
│ ├── README.md
|
| 166 |
+
│ └── README(zh-tw).md
|
| 167 |
+
└── legacy/ # 舊架構模型 (torch.hub, 224×224)
|
| 168 |
+
├── dinov2_coral_best_model_20251015_165008.pt # 64.43%(舊最佳,同區域負樣本,完整圖片)
|
| 169 |
+
└── dinov2_coral_best_model_20251016_133229.pt # 63.31%(無同區域負樣本,完整圖片)
|
| 170 |
+
```
|
| 171 |
+
|
| 172 |
+
## 授權條款
|
| 173 |
+
|
| 174 |
+
本專案採用 GPL-3.0 授權。
|
| 175 |
+
|
| 176 |
+
基於 Meta Platforms, Inc. 的 DINOv2 與 DINOv3(Apache License 2.0)。
|
README.md
CHANGED
|
@@ -1,3 +1,176 @@
|
|
| 1 |
-
---
|
| 2 |
-
license: gpl-3.0
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: gpl-3.0
|
| 3 |
+
tags:
|
| 4 |
+
- coral-reef
|
| 5 |
+
- re-identification
|
| 6 |
+
- metric-learning
|
| 7 |
+
- dinov2
|
| 8 |
+
- dinov3
|
| 9 |
+
- pytorch
|
| 10 |
+
datasets:
|
| 11 |
+
- custom
|
| 12 |
+
pipeline_tag: image-feature-extraction
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
# Coral Re-Identification Models
|
| 16 |
+
|
| 17 |
+
Fine-tuned models for underwater coral individual re-identification across multiple years.
|
| 18 |
+
|
| 19 |
+
This repository contains the two best models and standalone inference scripts. No dependency on the training codebase (`coral_reid`) is required.
|
| 20 |
+
|
| 21 |
+
> Source code: [GitHub](https://github.com/YuC13600/coral_models)
|
| 22 |
+
|
| 23 |
+
## Best Models
|
| 24 |
+
|
| 25 |
+
### Best Accuracy — E3-01b DINOv2 ViT-B/14
|
| 26 |
+
|
| 27 |
+
| | |
|
| 28 |
+
|---|---|
|
| 29 |
+
| **N-Benchmark Top-1** | **86.6%** (110/127) |
|
| 30 |
+
| Top-3 / Top-5 / Top-10 | 96.9% / 97.6% / 100.0% |
|
| 31 |
+
| Avg Rank / Worst Rank | 1.30 / 9 |
|
| 32 |
+
| Backbone | DINOv2 ViT-B/14 (86.6M params, timm 518×518) |
|
| 33 |
+
| Loss | Triplet (margin=0.3) + Hard Mining |
|
| 34 |
+
| Sampler | AreaAwareSampler (area_ratio=0.75) |
|
| 35 |
+
| Training | 4-phase progressive unfreezing, 56 epochs, ~7.2h |
|
| 36 |
+
| Embedding | 1280-d, L2-normalized |
|
| 37 |
+
| Files | `e3_01b_dinov2_vitb_best/` |
|
| 38 |
+
|
| 39 |
+
### Most Efficient — DINOv3 ViT-S+/16
|
| 40 |
+
|
| 41 |
+
| | |
|
| 42 |
+
|---|---|
|
| 43 |
+
| **N-Benchmark Top-1** | **81.1%** (103/127) |
|
| 44 |
+
| Top-3 / Top-5 / Top-10 | 92.1% / 95.3% / 99.2% |
|
| 45 |
+
| Avg Rank | 1.61 |
|
| 46 |
+
| Backbone | DINOv3 ViT-S+/16 (~22M params, timm 512×512) |
|
| 47 |
+
| Loss | Triplet (margin=0.3) + Hard Mining |
|
| 48 |
+
| Sampler | MPerClassSampler (m=2) |
|
| 49 |
+
| Training | 4-phase progressive unfreezing, 63 epochs, ~2.0h |
|
| 50 |
+
| Embedding | 768-d, L2-normalized |
|
| 51 |
+
| Files | `dinov3_vitsplus_efficient/` |
|
| 52 |
+
|
| 53 |
+
### Comparison
|
| 54 |
+
|
| 55 |
+
| Metric | Best Accuracy | Most Efficient | Difference |
|
| 56 |
+
|--------|--------------|----------------|------------|
|
| 57 |
+
| Top-1 | 86.6% | 81.1% | -5.5% |
|
| 58 |
+
| Parameters | ~86.6M | ~22M | **-75%** |
|
| 59 |
+
| Model size | 339 MB | 112 MB | **-67%** |
|
| 60 |
+
| Training time | ~7.2h | ~2.0h | **-72%** |
|
| 61 |
+
| Inference tokens | 1369 (patch14) | 1024 (patch16) | -25% |
|
| 62 |
+
|
| 63 |
+
## Quick Start
|
| 64 |
+
|
| 65 |
+
```bash
|
| 66 |
+
# Install dependencies (standalone, no coral_reid needed)
|
| 67 |
+
uv sync
|
| 68 |
+
|
| 69 |
+
# Extract features from a single image
|
| 70 |
+
uv run python extract_features.py \
|
| 71 |
+
--model e3_01b_dinov2_vitb_best/best_model_20260308_110634.pt \
|
| 72 |
+
--input /path/to/image.jpg
|
| 73 |
+
|
| 74 |
+
# Extract features from a directory
|
| 75 |
+
uv run python extract_features.py \
|
| 76 |
+
--model e3_01b_dinov2_vitb_best/best_model_20260308_110634.pt \
|
| 77 |
+
--input /path/to/images/ \
|
| 78 |
+
--output features.h5
|
| 79 |
+
|
| 80 |
+
```
|
| 81 |
+
|
| 82 |
+
## N-Benchmark Per-Area Results
|
| 83 |
+
|
| 84 |
+
### E3-01b DINOv2 ViT-B/14 (Best)
|
| 85 |
+
|
| 86 |
+
| Area | Queries | Top-1 | Top-3 | Top-5 | Avg Rank |
|
| 87 |
+
|------|---------|-------|-------|-------|----------|
|
| 88 |
+
| 37 | 32 | 93.8% | 96.9% | 96.9% | 1.28 |
|
| 89 |
+
| 38 | 31 | 80.6% | 100.0% | 100.0% | 1.19 |
|
| 90 |
+
| 39 | 27 | 85.2% | 92.6% | 96.3% | 1.44 |
|
| 91 |
+
| 40 | 37 | 86.5% | 97.3% | 97.3% | 1.30 |
|
| 92 |
+
| **Overall** | **127** | **86.6%** | **96.9%** | **97.6%** | **1.30** |
|
| 93 |
+
|
| 94 |
+
### DINOv3 ViT-S+/16 (Efficient)
|
| 95 |
+
|
| 96 |
+
| Area | Queries | Top-1 | Top-3 | Top-5 | Avg Rank |
|
| 97 |
+
|------|---------|-------|-------|-------|----------|
|
| 98 |
+
| 37 | 32 | 81.2% | 93.8% | 96.9% | 1.56 |
|
| 99 |
+
| 38 | 31 | 77.4% | 90.3% | 93.5% | 1.90 |
|
| 100 |
+
| 39 | 27 | 85.2% | 92.6% | 96.3% | 1.37 |
|
| 101 |
+
| 40 | 37 | 81.1% | 91.9% | 94.6% | 1.57 |
|
| 102 |
+
| **Overall** | **127** | **81.1%** | **92.1%** | **95.3%** | **1.61** |
|
| 103 |
+
|
| 104 |
+
## Full Model History
|
| 105 |
+
|
| 106 |
+
### Model Comparison Table
|
| 107 |
+
|
| 108 |
+
| Model Name | Arch | Backbone | Loss | Mining | Same Area Neg | Image | Test Acc | Test Loss | Val Loss | N-Bench Avg | A37 | A38 | A39 | A40 | Time |
|
| 109 |
+
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
|
| 110 |
+
| Pre-trained | - | DINOv2-B/14 | - | - | - | bbox | - | - | - | 29.48% | 28.12% | 35.48% | 29.63% | 24.32% | - |
|
| 111 |
+
| 20250812_152526 | old | DINOv2-B/14 | Triplet | pre-composed | ❌ | bbox | 92.6% | 0.1659 | - | 48.25% | 50.00% | 51.61% | 48.15% | 43.24% | ~16h |
|
| 112 |
+
| 20251007_133126 | old | DINOv2-B/14 | Triplet | pre-composed | ✅ | bbox | 88.8% | 0.2523 | - | 39.32% | 46.88% | 41.94% | 33.33% | 35.14% | ~16h |
|
| 113 |
+
| 20251008_094017 | old | DINOv2-B/14 | Triplet | pre-composed | ✅ | bbox | 90.4% | 0.1636 | - | 40.19% | 37.50% | 48.39% | 37.04% | 37.84% | ~16h |
|
| 114 |
+
| 20251014_183603 | old | DINOv2-B/14 | Triplet | pre-composed | ❌ | bbox | 92.8% | 0.1012 | - | 40.97% | 37.50% | 38.71% | 44.44% | 43.24% | ~16h |
|
| 115 |
+
| Pre-trained | - | DINOv2-B/14 | - | - | - | whole | - | - | - | 50.88% | 34.38% | 54.84% | 62.96% | 51.35% | - |
|
| 116 |
+
| 20251015_165008 | old | DINOv2-B/14 | Triplet | pre-composed | ✅ | whole | 92.7% | 0.1330 | 0.1006 | 64.43% | 62.50% | 61.29% | 55.56% | 78.38% | ~16h |
|
| 117 |
+
| 20251016_133229 | old | DINOv2-B/14 | Triplet | pre-composed | ❌ | whole | 97.9% | 0.0429 | - | 63.31% | 56.25% | 58.06% | 74.07% | 64.86% | ~16h |
|
| 118 |
+
| **20260308_110634** | **new** | **DINOv2-B/14 (timm 518)** | **Triplet** | **dynamic (PML)** | **AreaAware 0.75** | whole | - | - | **0.1604** | **86.6%** | **93.8%** | **80.6%** | **85.2%** | **86.5%** | **~7.2h** |
|
| 119 |
+
| **20260306_233824** | **new** | **DINOv3-S+/16 (timm 512)** | **Triplet** | **dynamic (PML)** | ❌ | whole | - | - | **0.1604** | **81.1%** | **81.2%** | **77.4%** | **85.2%** | **81.1%** | **~2.0h** |
|
| 120 |
+
|
| 121 |
+
### Column Descriptions
|
| 122 |
+
|
| 123 |
+
| Column | Description |
|
| 124 |
+
| --- | --- |
|
| 125 |
+
| Arch | `old` = old_repo implementation, `new` = refactored modular architecture |
|
| 126 |
+
| Backbone | Feature extractor (DINOv2-B/14, DINOv3-S+/16, etc.) |
|
| 127 |
+
| Loss | Loss function (Triplet, ArcFace, CosFace, Circle, Contrastive, etc.) |
|
| 128 |
+
| Mining | Sample mining: `pre-composed` = fixed triplets, `dynamic (PML)` = MPerClassSampler |
|
| 129 |
+
| Same Area Neg | Whether negatives restricted to same geographic area (`AreaAware 0.75` = 75% same area) |
|
| 130 |
+
| Image | `bbox` = EXIF bounding box crop, `whole` = full image |
|
| 131 |
+
| Test Acc | Test set accuracy (old arch only, measures pos_dist < neg_dist) |
|
| 132 |
+
| Val Loss | Best validation loss during training |
|
| 133 |
+
| N-Bench Avg | N-Benchmark Top-1 accuracy averaged across areas 37-40 |
|
| 134 |
+
|
| 135 |
+
### Architecture Differences
|
| 136 |
+
|
| 137 |
+
| Feature | Old Architecture | New Architecture |
|
| 138 |
+
| --- | --- | --- |
|
| 139 |
+
| Dataset Output | `(anchor, pos, neg)` - 3 images | `(image, label)` - 1 image |
|
| 140 |
+
| Triplet Formation | Pre-composed before training | Dynamic mining per batch |
|
| 141 |
+
| Batch Sampler | Random | MPerClassSampler (m=2) |
|
| 142 |
+
| Loss Function | Custom TripletLossWithMining | PML TripletMarginLoss |
|
| 143 |
+
| Samples per Epoch | ~50,000 triplets x 3 images | ~4,000 images |
|
| 144 |
+
| Training Speed | ~23 min/epoch | ~1.5 min/epoch |
|
| 145 |
+
| Same Area Negatives | Implemented | Implemented (AreaAwareSampler) |
|
| 146 |
+
|
| 147 |
+
> **N-Benchmark (Nearest Benchmark)**: Top-1 accuracy rate of identifying the correct coral when comparing specimens in areas 37-40 across 2022 and 2023.
|
| 148 |
+
|
| 149 |
+
## Project Structure
|
| 150 |
+
|
| 151 |
+
```
|
| 152 |
+
coral_models/
|
| 153 |
+
├── pyproject.toml # uv environment (standalone)
|
| 154 |
+
├── extract_features.py # Feature extraction script
|
| 155 |
+
├── e3_01b_dinov2_vitb_best/ # Best accuracy model (86.6%)
|
| 156 |
+
│ ├── best_model_20260308_110634.pt
|
| 157 |
+
│ ├── final_model_20260308_110634.pt
|
| 158 |
+
│ ├── e3_01b_same_area_neg_075.yaml
|
| 159 |
+
│ ├── README.md
|
| 160 |
+
│ └── README(zh-tw).md
|
| 161 |
+
├── dinov3_vitsplus_efficient/ # Most efficient model (81.1%)
|
| 162 |
+
│ ├── best_model_20260306_233824.pt
|
| 163 |
+
│ ├── final_model_20260306_233824.pt
|
| 164 |
+
│ ├── dinov3_vitsplus_tune_02_p2lr5_4ph.yaml
|
| 165 |
+
│ ├── README.md
|
| 166 |
+
│ └── README(zh-tw).md
|
| 167 |
+
└── legacy/ # Old architecture models (torch.hub, 224×224)
|
| 168 |
+
├── dinov2_coral_best_model_20251015_165008.pt # 64.43% (old best, same area neg, whole image)
|
| 169 |
+
└── dinov2_coral_best_model_20251016_133229.pt # 63.31% (no same area neg, whole image)
|
| 170 |
+
```
|
| 171 |
+
|
| 172 |
+
## License
|
| 173 |
+
|
| 174 |
+
This project is licensed under GPL-3.0.
|
| 175 |
+
|
| 176 |
+
Based on DINOv2 and DINOv3 by Meta Platforms, Inc. (Apache License 2.0).
|
dinov3_vitsplus_efficient/README(zh-tw).md
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: gpl-3.0
|
| 3 |
+
tags:
|
| 4 |
+
- coral-reef
|
| 5 |
+
- re-identification
|
| 6 |
+
- metric-learning
|
| 7 |
+
- dinov3
|
| 8 |
+
- pytorch
|
| 9 |
+
datasets:
|
| 10 |
+
- custom
|
| 11 |
+
pipeline_tag: image-feature-extraction
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
# 珊瑚個體辨識:DINOv3 ViT-S+/16(高效率)
|
| 15 |
+
|
| 16 |
+
針對水下珊瑚個體辨識微調的 DINOv3 ViT-S+/16 模型。此為本專案中的**最高效率模型**,僅以約 22M 參數和約 2 小時訓練時間,達到 **81.1% N-Benchmark Top-1 準確率**。
|
| 17 |
+
|
| 18 |
+
## 模型規格
|
| 19 |
+
|
| 20 |
+
| | |
|
| 21 |
+
|---|---|
|
| 22 |
+
| **架構** | DINOv3 ViT-S+/16 (~22M 參數) |
|
| 23 |
+
| **Backbone 載入方式** | timm (`vit_small_plus_patch16_dinov3`) |
|
| 24 |
+
| **輸入尺寸** | 512 x 512 |
|
| 25 |
+
| **嵌入維度** | 768 |
|
| 26 |
+
| **Backbone 輸出維度** | 384 |
|
| 27 |
+
| **Head** | MLP (384 → 512 → 768, BatchNorm, Dropout 0.3) |
|
| 28 |
+
|
| 29 |
+
## 訓練配置
|
| 30 |
+
|
| 31 |
+
| | |
|
| 32 |
+
|---|---|
|
| 33 |
+
| **損失函數** | Triplet Loss (margin=0.3) + Hard Mining |
|
| 34 |
+
| **取樣器** | MPerClassSampler (m=2) |
|
| 35 |
+
| **批次大小** | 16(累積步數:8,等效批次:128) |
|
| 36 |
+
| **優化器** | AdamW (weight_decay=1e-4) |
|
| 37 |
+
| **梯度裁剪** | 1.0 |
|
| 38 |
+
| **Early stopping** | patience=6, delta=0.0005 |
|
| 39 |
+
| **總 epochs** | 63 |
|
| 40 |
+
| **訓練時間** | 約 2.0 小時(單 GPU) |
|
| 41 |
+
|
| 42 |
+
### 漸進式解凍(4 階段)
|
| 43 |
+
|
| 44 |
+
| 階段 | 解凍層數 | 學習率 | 最大 Epochs |
|
| 45 |
+
|------|----------|--------|-------------|
|
| 46 |
+
| 1 — 僅 Head | 0(僅 head) | 3e-4 | 20 |
|
| 47 |
+
| 2 — 最後 2 blocks | 2 | 5e-5 | 20 |
|
| 48 |
+
| 3 — 最後 4 blocks | 4 | 1.5e-5 | 15 |
|
| 49 |
+
| 4 — 最後 6 blocks | 6 | 1e-5 | 15 |
|
| 50 |
+
|
| 51 |
+
Phase 2 學習率從預設的 8e-5 降至 5e-5,避免 early stopping 過早觸發,讓 Phase 3 有更好的起點。Phase 4 進一步釋放模型容量。
|
| 52 |
+
|
| 53 |
+
## 評估結果(N-Benchmark)
|
| 54 |
+
|
| 55 |
+
跨年匹配:2022(參考集)vs 2023(查詢集),區域 37-40。
|
| 56 |
+
|
| 57 |
+
| 區域 | 查詢數 | Top-1 | Top-3 | Top-5 | 平均排名 |
|
| 58 |
+
|------|--------|-------|-------|-------|----------|
|
| 59 |
+
| 37 | 32 | 81.2% | 93.8% | 96.9% | 1.56 |
|
| 60 |
+
| 38 | 31 | 77.4% | 90.3% | 93.5% | 1.90 |
|
| 61 |
+
| 39 | 27 | 85.2% | 92.6% | 96.3% | 1.37 |
|
| 62 |
+
| 40 | 37 | 81.1% | 91.9% | 94.6% | 1.57 |
|
| 63 |
+
| **整體** | **127** | **81.1%** | **92.1%** | **95.3%** | **1.61** |
|
| 64 |
+
|
| 65 |
+
- **驗證損失**:0.1604
|
| 66 |
+
|
| 67 |
+
## 與最強模型的比較
|
| 68 |
+
|
| 69 |
+
| 指標 | 最強模型 (DINOv2 ViT-B) | 本模型 | 差距 |
|
| 70 |
+
|------|------------------------|--------|------|
|
| 71 |
+
| Top-1 | 86.6% | 81.1% | -5.5% |
|
| 72 |
+
| 參數量 | ~86.6M | ~22M | **-75%** |
|
| 73 |
+
| 訓練時間 | ~7.2h | ~2.0h | **-72%** |
|
| 74 |
+
| 模型檔案大小 | 339 MB | 112 MB | **-67%** |
|
| 75 |
+
| 推論 tokens | 1369 (patch14) | 1024 (patch16) | -25% |
|
| 76 |
+
|
| 77 |
+
## 檔案說明
|
| 78 |
+
|
| 79 |
+
| 檔案 | 說明 |
|
| 80 |
+
|------|------|
|
| 81 |
+
| `best_model_20260306_233824.pt` | 最佳 checkpoint(訓練期間最低驗證損失) |
|
| 82 |
+
| `final_model_20260306_233824.pt` | 最終 checkpoint(最後一個 epoch) |
|
| 83 |
+
| `dinov3_vitsplus_tune_02_p2lr5_4ph.yaml` | 完整訓練配置 |
|
| 84 |
+
|
| 85 |
+
## 使用方式
|
| 86 |
+
|
| 87 |
+
```python
|
| 88 |
+
import torch
|
| 89 |
+
from coral_reid.config import ExperimentConfig
|
| 90 |
+
from coral_reid.models.coral_model import CoralReIDModel
|
| 91 |
+
|
| 92 |
+
config = ExperimentConfig.from_yaml("dinov3_vitsplus_tune_02_p2lr5_4ph.yaml")
|
| 93 |
+
model = CoralReIDModel.from_config(config.backbone, config.head)
|
| 94 |
+
model.load("best_model_20260306_233824.pt", map_location="cpu")
|
| 95 |
+
model.eval()
|
| 96 |
+
|
| 97 |
+
# 提取嵌入向量
|
| 98 |
+
embedding = model(image_tensor) # (1, 768)
|
| 99 |
+
```
|
| 100 |
+
|
| 101 |
+
或使用獨立腳本(不需要 `coral_reid`):
|
| 102 |
+
|
| 103 |
+
```bash
|
| 104 |
+
uv run python extract_features.py \
|
| 105 |
+
--model dinov3_vitsplus_efficient/best_model_20260306_233824.pt \
|
| 106 |
+
--input /path/to/image.jpg
|
| 107 |
+
```
|
| 108 |
+
|
| 109 |
+
## 引用
|
| 110 |
+
|
| 111 |
+
本模型為珊瑚個體辨識研究的一部分,用於小琉球、綠島及東北角珊瑚礁的長期生態監測。
|
dinov3_vitsplus_efficient/README.md
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: gpl-3.0
|
| 3 |
+
tags:
|
| 4 |
+
- coral-reef
|
| 5 |
+
- re-identification
|
| 6 |
+
- metric-learning
|
| 7 |
+
- dinov3
|
| 8 |
+
- pytorch
|
| 9 |
+
datasets:
|
| 10 |
+
- custom
|
| 11 |
+
pipeline_tag: image-feature-extraction
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
# Coral Re-ID: DINOv3 ViT-S+/16 (Efficient)
|
| 15 |
+
|
| 16 |
+
Fine-tuned DINOv3 ViT-S+/16 for underwater coral individual re-identification. This is the **most efficient model** in the project, achieving **81.1% N-Benchmark Top-1 accuracy** with only ~22M parameters and ~2h training time.
|
| 17 |
+
|
| 18 |
+
## Model Details
|
| 19 |
+
|
| 20 |
+
| | |
|
| 21 |
+
|---|---|
|
| 22 |
+
| **Architecture** | DINOv3 ViT-S+/16 (~22M params) |
|
| 23 |
+
| **Backbone loader** | timm (`vit_small_plus_patch16_dinov3`) |
|
| 24 |
+
| **Input size** | 512 x 512 |
|
| 25 |
+
| **Embedding dim** | 768 |
|
| 26 |
+
| **Backbone output dim** | 384 |
|
| 27 |
+
| **Head** | MLP (384 → 512 → 768, BatchNorm, Dropout 0.3) |
|
| 28 |
+
|
| 29 |
+
## Training Configuration
|
| 30 |
+
|
| 31 |
+
| | |
|
| 32 |
+
|---|---|
|
| 33 |
+
| **Loss** | Triplet Loss (margin=0.3) + Hard Mining |
|
| 34 |
+
| **Sampler** | MPerClassSampler (m=2) |
|
| 35 |
+
| **Batch size** | 16 (accumulation steps: 8, effective batch: 128) |
|
| 36 |
+
| **Optimizer** | AdamW (weight_decay=1e-4) |
|
| 37 |
+
| **Gradient clipping** | 1.0 |
|
| 38 |
+
| **Early stopping** | patience=6, delta=0.0005 |
|
| 39 |
+
| **Total epochs** | 63 |
|
| 40 |
+
| **Training time** | ~2.0 hours (single GPU) |
|
| 41 |
+
|
| 42 |
+
### Progressive Unfreezing (4-phase)
|
| 43 |
+
|
| 44 |
+
| Phase | Layers | LR | Max Epochs |
|
| 45 |
+
|-------|--------|----|------------|
|
| 46 |
+
| 1 — Head only | 0 (head only) | 3e-4 | 20 |
|
| 47 |
+
| 2 — Last 2 blocks | 2 | 5e-5 | 20 |
|
| 48 |
+
| 3 — Last 4 blocks | 4 | 1.5e-5 | 15 |
|
| 49 |
+
| 4 — Last 6 blocks | 6 | 1e-5 | 15 |
|
| 50 |
+
|
| 51 |
+
Phase 2 LR was reduced from the default 8e-5 to 5e-5 to prevent early stopping from triggering too soon, giving Phase 3 a better starting point. Phase 4 then further unlocks the model's capacity.
|
| 52 |
+
|
| 53 |
+
## Evaluation Results (N-Benchmark)
|
| 54 |
+
|
| 55 |
+
Cross-year matching: 2022 (reference) vs 2023 (query), areas 37-40.
|
| 56 |
+
|
| 57 |
+
| Area | Queries | Top-1 | Top-3 | Top-5 | Avg Rank |
|
| 58 |
+
|------|---------|-------|-------|-------|----------|
|
| 59 |
+
| 37 | 32 | 81.2% | 93.8% | 96.9% | 1.56 |
|
| 60 |
+
| 38 | 31 | 77.4% | 90.3% | 93.5% | 1.90 |
|
| 61 |
+
| 39 | 27 | 85.2% | 92.6% | 96.3% | 1.37 |
|
| 62 |
+
| 40 | 37 | 81.1% | 91.9% | 94.6% | 1.57 |
|
| 63 |
+
| **Overall** | **127** | **81.1%** | **92.1%** | **95.3%** | **1.61** |
|
| 64 |
+
|
| 65 |
+
- **Val loss**: 0.1604
|
| 66 |
+
|
| 67 |
+
## Comparison with Best Model
|
| 68 |
+
|
| 69 |
+
| Metric | Best (DINOv2 ViT-B) | This model | Difference |
|
| 70 |
+
|--------|---------------------|------------|------------|
|
| 71 |
+
| Top-1 | 86.6% | 81.1% | -5.5% |
|
| 72 |
+
| Parameters | ~86.6M | ~22M | **-75%** |
|
| 73 |
+
| Training time | ~7.2h | ~2.0h | **-72%** |
|
| 74 |
+
| Model file size | 339 MB | 112 MB | **-67%** |
|
| 75 |
+
| Inference tokens | 1369 (patch14) | 1024 (patch16) | -25% |
|
| 76 |
+
|
| 77 |
+
## Files
|
| 78 |
+
|
| 79 |
+
| File | Description |
|
| 80 |
+
|------|-------------|
|
| 81 |
+
| `best_model_20260306_233824.pt` | Best checkpoint (lowest val loss during training) |
|
| 82 |
+
| `final_model_20260306_233824.pt` | Final checkpoint (last epoch) |
|
| 83 |
+
| `dinov3_vitsplus_tune_02_p2lr5_4ph.yaml` | Full training config |
|
| 84 |
+
|
| 85 |
+
## Usage
|
| 86 |
+
|
| 87 |
+
```python
|
| 88 |
+
import torch
|
| 89 |
+
from coral_reid.config import ExperimentConfig
|
| 90 |
+
from coral_reid.models.coral_model import CoralReIDModel
|
| 91 |
+
|
| 92 |
+
config = ExperimentConfig.from_yaml("dinov3_vitsplus_tune_02_p2lr5_4ph.yaml")
|
| 93 |
+
model = CoralReIDModel.from_config(config.backbone, config.head)
|
| 94 |
+
model.load("best_model_20260306_233824.pt", map_location="cpu")
|
| 95 |
+
model.eval()
|
| 96 |
+
|
| 97 |
+
# Extract embedding
|
| 98 |
+
embedding = model(image_tensor) # (1, 768)
|
| 99 |
+
```
|
| 100 |
+
|
| 101 |
+
Or with the standalone script (no `coral_reid` dependency):
|
| 102 |
+
|
| 103 |
+
```bash
|
| 104 |
+
uv run python extract_features.py \
|
| 105 |
+
--model dinov3_vitsplus_efficient/best_model_20260306_233824.pt \
|
| 106 |
+
--input /path/to/image.jpg
|
| 107 |
+
```
|
| 108 |
+
|
| 109 |
+
## Citation
|
| 110 |
+
|
| 111 |
+
Part of the coral re-identification research for long-term ecological monitoring at Xiaoliuqiu, Green Island, and Northeastern Taiwan.
|
dinov3_vitsplus_efficient/best_model_20260306_233824.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c4f525e9298f0d7b773e4736ad15bffbbfce88f5f5431b7ca4062cc55beed51f
|
| 3 |
+
size 117200249
|
dinov3_vitsplus_efficient/dinov3_vitsplus_tune_02_p2lr5_4ph.yaml
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# DINOv3 ViT-S+ Triplet 調參 Tune-02
|
| 2 |
+
# 問題:同 Tune-01,Phase 2 早收斂 + Phase 3 高基線,且尚未嘗試深化解凍
|
| 3 |
+
# 策略:Tune-01 基礎上新增 Phase 4(6 blocks,LR=1e-5,15ep)
|
| 4 |
+
# 測試在 Phase 2/3 改善後,Phase 4 是否能進一步提升
|
| 5 |
+
# 基準(E1-13):DINOv3 ViT-S+ Triplet 74.0%
|
| 6 |
+
name: dinov3_vitsplus_tune_02_p2lr5_4ph
|
| 7 |
+
seed: 42
|
| 8 |
+
device: cuda
|
| 9 |
+
output_dir: outputs
|
| 10 |
+
|
| 11 |
+
backbone:
|
| 12 |
+
name: timm
|
| 13 |
+
variant: vit_small_plus_patch16_dinov3
|
| 14 |
+
pretrained: true
|
| 15 |
+
freeze: true
|
| 16 |
+
output_dim: 384
|
| 17 |
+
img_size: 512
|
| 18 |
+
|
| 19 |
+
head:
|
| 20 |
+
name: mlp
|
| 21 |
+
input_dim: 384
|
| 22 |
+
hidden_dim: 512
|
| 23 |
+
output_dim: 768
|
| 24 |
+
dropout: 0.3
|
| 25 |
+
use_batchnorm: true
|
| 26 |
+
|
| 27 |
+
loss:
|
| 28 |
+
name: triplet
|
| 29 |
+
margin: 0.3
|
| 30 |
+
mining_strategy: hard
|
| 31 |
+
|
| 32 |
+
data:
|
| 33 |
+
root_dirs:
|
| 34 |
+
- /home/yuc/code/data/coral
|
| 35 |
+
use_whole_image: true
|
| 36 |
+
same_area_negatives: false
|
| 37 |
+
image_size: 512
|
| 38 |
+
train_ratio: 0.7
|
| 39 |
+
val_ratio: 0.15
|
| 40 |
+
test_ratio: 0.15
|
| 41 |
+
num_workers: 4
|
| 42 |
+
|
| 43 |
+
training:
|
| 44 |
+
batch_size: 16
|
| 45 |
+
accumulation_steps: 8
|
| 46 |
+
learning_rate: 0.0003
|
| 47 |
+
weight_decay: 0.0001
|
| 48 |
+
early_stopping_patience: 6
|
| 49 |
+
early_stopping_delta: 0.0005
|
| 50 |
+
scheduler_patience: 3
|
| 51 |
+
scheduler_factor: 0.5
|
| 52 |
+
min_lr: 1.0e-06
|
| 53 |
+
gradient_clip_norm: 1.0
|
| 54 |
+
|
| 55 |
+
phases:
|
| 56 |
+
- name: 'Phase 1: Head Only'
|
| 57 |
+
epochs: 20
|
| 58 |
+
learning_rate: 3.0e-04
|
| 59 |
+
unfreeze_backbone: false
|
| 60 |
+
unfreeze_layers: 0
|
| 61 |
+
- name: 'Phase 2: Head + Last 2 Blocks (LR=5e-5, 20ep)'
|
| 62 |
+
epochs: 20
|
| 63 |
+
learning_rate: 5.0e-05
|
| 64 |
+
unfreeze_backbone: true
|
| 65 |
+
unfreeze_layers: 2
|
| 66 |
+
- name: 'Phase 3: Head + Last 4 Blocks (LR=1.5e-5)'
|
| 67 |
+
epochs: 15
|
| 68 |
+
learning_rate: 1.5e-05
|
| 69 |
+
unfreeze_backbone: true
|
| 70 |
+
unfreeze_layers: 4
|
| 71 |
+
- name: 'Phase 4: Head + Last 6 Blocks (LR=1e-5)'
|
| 72 |
+
epochs: 15
|
| 73 |
+
learning_rate: 1.0e-05
|
| 74 |
+
unfreeze_backbone: true
|
| 75 |
+
unfreeze_layers: 6
|
dinov3_vitsplus_efficient/final_model_20260306_233824.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:58150a259af088911b9e3a03ad50b3feb82a9ee57ef331adc23df61f21f3d3df
|
| 3 |
+
size 117200455
|
e3_01b_dinov2_vitb_best/README(zh-tw).md
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: gpl-3.0
|
| 3 |
+
tags:
|
| 4 |
+
- coral-reef
|
| 5 |
+
- re-identification
|
| 6 |
+
- metric-learning
|
| 7 |
+
- dinov2
|
| 8 |
+
- pytorch
|
| 9 |
+
datasets:
|
| 10 |
+
- custom
|
| 11 |
+
pipeline_tag: image-feature-extraction
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
# 珊瑚個體辨識:DINOv2 ViT-B/14(最佳精度)
|
| 15 |
+
|
| 16 |
+
針對水下珊瑚個體辨識微調的 DINOv2 ViT-B/14 模型。此為本專案中的**最強模型**,達到 **86.6% N-Benchmark Top-1 準確率**。
|
| 17 |
+
|
| 18 |
+
## 模型規格
|
| 19 |
+
|
| 20 |
+
| | |
|
| 21 |
+
|---|---|
|
| 22 |
+
| **架構** | DINOv2 ViT-B/14 (86.6M 參數) |
|
| 23 |
+
| **Backbone 載入方式** | timm (`vit_base_patch14_dinov2`) |
|
| 24 |
+
| **輸入尺寸** | 518 x 518 |
|
| 25 |
+
| **嵌入維度** | 1280 |
|
| 26 |
+
| **Backbone 輸出維度** | 768 |
|
| 27 |
+
| **Head** | MLP (768 → 1024 → 1280, BatchNorm, Dropout 0.3) |
|
| 28 |
+
|
| 29 |
+
## 訓練配置
|
| 30 |
+
|
| 31 |
+
| | |
|
| 32 |
+
|---|---|
|
| 33 |
+
| **損失函數** | Triplet Loss (margin=0.3) + Hard Mining |
|
| 34 |
+
| **取樣器** | AreaAwareSampler (area_ratio=0.75) |
|
| 35 |
+
| **批次大小** | 16(累積步數:8,等效批次:128) |
|
| 36 |
+
| **優化器** | AdamW (weight_decay=1e-4) |
|
| 37 |
+
| **梯度裁剪** | 1.0 |
|
| 38 |
+
| **Early stopping** | patience=6, delta=0.0005 |
|
| 39 |
+
| **總 epochs** | 56 |
|
| 40 |
+
| **訓練時間** | 約 7.2 小時(單 GPU) |
|
| 41 |
+
|
| 42 |
+
### 漸進式解凍(4 階段)
|
| 43 |
+
|
| 44 |
+
| 階段 | 解凍層數 | 學習率 | 最大 Epochs |
|
| 45 |
+
|------|----------|--------|-------------|
|
| 46 |
+
| 1 — 僅 Head | 0(僅 head) | 3e-4 | 20 |
|
| 47 |
+
| 2 — 最後 2 blocks | 2 | 8e-5 | 15 |
|
| 48 |
+
| 3 — 最後 4 blocks | 4 | 3e-5 | 12 |
|
| 49 |
+
| 4 — 最後 6 blocks | 6 | 1e-5 | 15 |
|
| 50 |
+
|
| 51 |
+
### AreaAwareSampler
|
| 52 |
+
|
| 53 |
+
每個訓練批次由 75% 同區域珊瑚與 25% 跨區域珊瑚組成。此設計對齊 N-Benchmark 的評估方式(區域內匹配),提供來自同一珊瑚礁區域中視覺上相似的更困難負樣本。
|
| 54 |
+
|
| 55 |
+
## 評估結果(N-Benchmark)
|
| 56 |
+
|
| 57 |
+
跨年匹配:2022(參考集)vs 2023(查詢集),區域 37-40。
|
| 58 |
+
|
| 59 |
+
| 區域 | 查詢數 | Top-1 | Top-3 | Top-5 | 平均排名 |
|
| 60 |
+
|------|--------|-------|-------|-------|----------|
|
| 61 |
+
| 37 | 32 | 93.8% | 96.9% | 96.9% | 1.28 |
|
| 62 |
+
| 38 | 31 | 80.6% | 100.0% | 100.0% | 1.19 |
|
| 63 |
+
| 39 | 27 | 85.2% | 92.6% | 96.3% | 1.44 |
|
| 64 |
+
| 40 | 37 | 86.5% | 97.3% | 97.3% | 1.30 |
|
| 65 |
+
| **整體** | **127** | **86.6%** | **96.9%** | **97.6%** | **1.30** |
|
| 66 |
+
|
| 67 |
+
- **最差排名**:9(所有正確匹配均在前 9 名內)
|
| 68 |
+
- **驗證損失**:0.1604
|
| 69 |
+
|
| 70 |
+
## 檔案說明
|
| 71 |
+
|
| 72 |
+
| 檔案 | 說明 |
|
| 73 |
+
|------|------|
|
| 74 |
+
| `best_model_20260308_110634.pt` | 最佳 checkpoint(訓練期間最低驗證損失) |
|
| 75 |
+
| `final_model_20260308_110634.pt` | 最終 checkpoint(最後一個 epoch) |
|
| 76 |
+
| `e3_01b_same_area_neg_075.yaml` | 完整訓練配置 |
|
| 77 |
+
|
| 78 |
+
## 使用方式
|
| 79 |
+
|
| 80 |
+
```python
|
| 81 |
+
import torch
|
| 82 |
+
from coral_reid.config import ExperimentConfig
|
| 83 |
+
from coral_reid.models.coral_model import CoralReIDModel
|
| 84 |
+
|
| 85 |
+
config = ExperimentConfig.from_yaml("e3_01b_same_area_neg_075.yaml")
|
| 86 |
+
model = CoralReIDModel.from_config(config.backbone, config.head)
|
| 87 |
+
model.load("best_model_20260308_110634.pt", map_location="cpu")
|
| 88 |
+
model.eval()
|
| 89 |
+
|
| 90 |
+
# 提取嵌入向量
|
| 91 |
+
embedding = model(image_tensor) # (1, 1280)
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
或使用獨立腳本(不需要 `coral_reid`):
|
| 95 |
+
|
| 96 |
+
```bash
|
| 97 |
+
uv run python extract_features.py \
|
| 98 |
+
--model e3_01b_dinov2_vitb_best/best_model_20260308_110634.pt \
|
| 99 |
+
--input /path/to/image.jpg
|
| 100 |
+
```
|
| 101 |
+
|
| 102 |
+
## 引用
|
| 103 |
+
|
| 104 |
+
本模型為珊瑚個體辨識研究的一部分,用於小琉球、綠島及東北角珊瑚礁的長期生態監測。
|
e3_01b_dinov2_vitb_best/README.md
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: gpl-3.0
|
| 3 |
+
tags:
|
| 4 |
+
- coral-reef
|
| 5 |
+
- re-identification
|
| 6 |
+
- metric-learning
|
| 7 |
+
- dinov2
|
| 8 |
+
- pytorch
|
| 9 |
+
datasets:
|
| 10 |
+
- custom
|
| 11 |
+
pipeline_tag: image-feature-extraction
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
# Coral Re-ID: DINOv2 ViT-B/14 (Best Accuracy)
|
| 15 |
+
|
| 16 |
+
Fine-tuned DINOv2 ViT-B/14 for underwater coral individual re-identification. This is the **strongest model** in the project, achieving **86.6% N-Benchmark Top-1 accuracy**.
|
| 17 |
+
|
| 18 |
+
## Model Details
|
| 19 |
+
|
| 20 |
+
| | |
|
| 21 |
+
|---|---|
|
| 22 |
+
| **Architecture** | DINOv2 ViT-B/14 (86.6M params) |
|
| 23 |
+
| **Backbone loader** | timm (`vit_base_patch14_dinov2`) |
|
| 24 |
+
| **Input size** | 518 x 518 |
|
| 25 |
+
| **Embedding dim** | 1280 |
|
| 26 |
+
| **Backbone output dim** | 768 |
|
| 27 |
+
| **Head** | MLP (768 → 1024 → 1280, BatchNorm, Dropout 0.3) |
|
| 28 |
+
|
| 29 |
+
## Training Configuration
|
| 30 |
+
|
| 31 |
+
| | |
|
| 32 |
+
|---|---|
|
| 33 |
+
| **Loss** | Triplet Loss (margin=0.3) + Hard Mining |
|
| 34 |
+
| **Sampler** | AreaAwareSampler (area_ratio=0.75) |
|
| 35 |
+
| **Batch size** | 16 (accumulation steps: 8, effective batch: 128) |
|
| 36 |
+
| **Optimizer** | AdamW (weight_decay=1e-4) |
|
| 37 |
+
| **Gradient clipping** | 1.0 |
|
| 38 |
+
| **Early stopping** | patience=6, delta=0.0005 |
|
| 39 |
+
| **Total epochs** | 56 |
|
| 40 |
+
| **Training time** | ~7.2 hours (single GPU) |
|
| 41 |
+
|
| 42 |
+
### Progressive Unfreezing (4-phase)
|
| 43 |
+
|
| 44 |
+
| Phase | Layers | LR | Max Epochs |
|
| 45 |
+
|-------|--------|----|------------|
|
| 46 |
+
| 1 — Head only | 0 (head only) | 3e-4 | 20 |
|
| 47 |
+
| 2 — Last 2 blocks | 2 | 8e-5 | 15 |
|
| 48 |
+
| 3 — Last 4 blocks | 4 | 3e-5 | 12 |
|
| 49 |
+
| 4 — Last 6 blocks | 6 | 1e-5 | 15 |
|
| 50 |
+
|
| 51 |
+
### AreaAwareSampler
|
| 52 |
+
|
| 53 |
+
Each training batch is composed of 75% same-area corals and 25% cross-area corals. This aligns training distribution with the N-Benchmark evaluation protocol (within-area matching), providing harder negative examples from visually similar corals in the same reef area.
|
| 54 |
+
|
| 55 |
+
## Evaluation Results (N-Benchmark)
|
| 56 |
+
|
| 57 |
+
Cross-year matching: 2022 (reference) vs 2023 (query), areas 37-40.
|
| 58 |
+
|
| 59 |
+
| Area | Queries | Top-1 | Top-3 | Top-5 | Avg Rank |
|
| 60 |
+
|------|---------|-------|-------|-------|----------|
|
| 61 |
+
| 37 | 32 | 93.8% | 96.9% | 96.9% | 1.28 |
|
| 62 |
+
| 38 | 31 | 80.6% | 100.0% | 100.0% | 1.19 |
|
| 63 |
+
| 39 | 27 | 85.2% | 92.6% | 96.3% | 1.44 |
|
| 64 |
+
| 40 | 37 | 86.5% | 97.3% | 97.3% | 1.30 |
|
| 65 |
+
| **Overall** | **127** | **86.6%** | **96.9%** | **97.6%** | **1.30** |
|
| 66 |
+
|
| 67 |
+
- **Worst rank**: 9 (all correct matches within top 9)
|
| 68 |
+
- **Val loss**: 0.1604
|
| 69 |
+
|
| 70 |
+
## Files
|
| 71 |
+
|
| 72 |
+
| File | Description |
|
| 73 |
+
|------|-------------|
|
| 74 |
+
| `best_model_20260308_110634.pt` | Best checkpoint (lowest val loss during training) |
|
| 75 |
+
| `final_model_20260308_110634.pt` | Final checkpoint (last epoch) |
|
| 76 |
+
| `e3_01b_same_area_neg_075.yaml` | Full training config |
|
| 77 |
+
|
| 78 |
+
## Usage
|
| 79 |
+
|
| 80 |
+
```python
|
| 81 |
+
import torch
|
| 82 |
+
from coral_reid.config import ExperimentConfig
|
| 83 |
+
from coral_reid.models.coral_model import CoralReIDModel
|
| 84 |
+
|
| 85 |
+
config = ExperimentConfig.from_yaml("e3_01b_same_area_neg_075.yaml")
|
| 86 |
+
model = CoralReIDModel.from_config(config.backbone, config.head)
|
| 87 |
+
model.load("best_model_20260308_110634.pt", map_location="cpu")
|
| 88 |
+
model.eval()
|
| 89 |
+
|
| 90 |
+
# Extract embedding
|
| 91 |
+
embedding = model(image_tensor) # (1, 1280)
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
Or with the standalone script (no `coral_reid` dependency):
|
| 95 |
+
|
| 96 |
+
```bash
|
| 97 |
+
uv run python extract_features.py \
|
| 98 |
+
--model e3_01b_dinov2_vitb_best/best_model_20260308_110634.pt \
|
| 99 |
+
--input /path/to/image.jpg
|
| 100 |
+
```
|
| 101 |
+
|
| 102 |
+
## Citation
|
| 103 |
+
|
| 104 |
+
Part of the coral re-identification research for long-term ecological monitoring at Xiaoliuqiu, Green Island, and Northeastern Taiwan.
|
e3_01b_dinov2_vitb_best/best_model_20260308_110634.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b77d102cabc4611df9227f4fc1be9ddeb134459265259f0783ff7f28a8323cc0
|
| 3 |
+
size 354830189
|
e3_01b_dinov2_vitb_best/e3_01b_same_area_neg_075.yaml
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# E3-01b: Same Area Negatives (area_ratio=0.75)
|
| 2 |
+
# 基準:dinov2_vitb_tune_02_4ph (84.3%)
|
| 3 |
+
# 策略:AreaAwareSampler,每 batch 75% 同區域 + 25% 跨區域(更強 hard negatives)
|
| 4 |
+
name: e3_01b_same_area_neg_075
|
| 5 |
+
seed: 42
|
| 6 |
+
device: cuda
|
| 7 |
+
output_dir: outputs
|
| 8 |
+
|
| 9 |
+
backbone:
|
| 10 |
+
name: timm
|
| 11 |
+
variant: vit_base_patch14_dinov2
|
| 12 |
+
pretrained: true
|
| 13 |
+
freeze: true
|
| 14 |
+
output_dim: 768
|
| 15 |
+
img_size: 518
|
| 16 |
+
|
| 17 |
+
head:
|
| 18 |
+
name: mlp
|
| 19 |
+
input_dim: 768
|
| 20 |
+
hidden_dim: 1024
|
| 21 |
+
output_dim: 1280
|
| 22 |
+
dropout: 0.3
|
| 23 |
+
use_batchnorm: true
|
| 24 |
+
|
| 25 |
+
loss:
|
| 26 |
+
name: triplet
|
| 27 |
+
margin: 0.3
|
| 28 |
+
mining_strategy: hard
|
| 29 |
+
|
| 30 |
+
data:
|
| 31 |
+
root_dirs:
|
| 32 |
+
- /home/yuc/code/data/coral
|
| 33 |
+
use_whole_image: true
|
| 34 |
+
same_area_negatives: true
|
| 35 |
+
area_ratio: 0.75
|
| 36 |
+
image_size: 518
|
| 37 |
+
train_ratio: 0.7
|
| 38 |
+
val_ratio: 0.15
|
| 39 |
+
test_ratio: 0.15
|
| 40 |
+
num_workers: 4
|
| 41 |
+
|
| 42 |
+
training:
|
| 43 |
+
batch_size: 16
|
| 44 |
+
accumulation_steps: 8
|
| 45 |
+
learning_rate: 0.0003
|
| 46 |
+
weight_decay: 0.0001
|
| 47 |
+
early_stopping_patience: 6
|
| 48 |
+
early_stopping_delta: 0.0005
|
| 49 |
+
scheduler_patience: 3
|
| 50 |
+
scheduler_factor: 0.5
|
| 51 |
+
min_lr: 1.0e-06
|
| 52 |
+
gradient_clip_norm: 1.0
|
| 53 |
+
|
| 54 |
+
phases:
|
| 55 |
+
- name: 'Phase 1: Head Only'
|
| 56 |
+
epochs: 20
|
| 57 |
+
learning_rate: 3.0e-04
|
| 58 |
+
unfreeze_backbone: false
|
| 59 |
+
unfreeze_layers: 0
|
| 60 |
+
- name: 'Phase 2: Head + Last 2 Blocks'
|
| 61 |
+
epochs: 15
|
| 62 |
+
learning_rate: 8.0e-05
|
| 63 |
+
unfreeze_backbone: true
|
| 64 |
+
unfreeze_layers: 2
|
| 65 |
+
- name: 'Phase 3: Head + Last 4 Blocks'
|
| 66 |
+
epochs: 12
|
| 67 |
+
learning_rate: 3.0e-05
|
| 68 |
+
unfreeze_backbone: true
|
| 69 |
+
unfreeze_layers: 4
|
| 70 |
+
- name: 'Phase 4: Head + Last 6 Blocks (LR=1e-5)'
|
| 71 |
+
epochs: 15
|
| 72 |
+
learning_rate: 1.0e-05
|
| 73 |
+
unfreeze_backbone: true
|
| 74 |
+
unfreeze_layers: 6
|
e3_01b_dinov2_vitb_best/final_model_20260308_110634.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b6537022312141ca144fdd3c032826a4cbcb3a0117047df210e1bc07838d5996
|
| 3 |
+
size 354830383
|
extract_features.py
ADDED
|
@@ -0,0 +1,486 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Standalone feature extraction for coral re-identification models.
|
| 2 |
+
|
| 3 |
+
Reconstructs the model architecture from checkpoint metadata (or a YAML config
|
| 4 |
+
as fallback) and loads weights without depending on the coral_reid package.
|
| 5 |
+
|
| 6 |
+
Usage:
|
| 7 |
+
# Extract features from a directory of images
|
| 8 |
+
uv run python extract_features.py \
|
| 9 |
+
--model e3_01b_dinov2_vitb_best/best_model_20260308_110634.pt \
|
| 10 |
+
--input /path/to/images \
|
| 11 |
+
--output features.h5
|
| 12 |
+
|
| 13 |
+
# Extract features for N-Benchmark (by area)
|
| 14 |
+
uv run python extract_features.py \
|
| 15 |
+
--model e3_01b_dinov2_vitb_best/best_model_20260308_110634.pt \
|
| 16 |
+
--input /path/to/2022sample \
|
| 17 |
+
--areas 37 38 39 40 \
|
| 18 |
+
--output features/
|
| 19 |
+
|
| 20 |
+
# Single image embedding (prints to stdout)
|
| 21 |
+
uv run python extract_features.py \
|
| 22 |
+
--model e3_01b_dinov2_vitb_best/best_model_20260308_110634.pt \
|
| 23 |
+
--input /path/to/single_image.jpg
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
from __future__ import annotations
|
| 27 |
+
|
| 28 |
+
import argparse
|
| 29 |
+
import logging
|
| 30 |
+
import os
|
| 31 |
+
from dataclasses import dataclass
|
| 32 |
+
from pathlib import Path
|
| 33 |
+
|
| 34 |
+
import h5py
|
| 35 |
+
import numpy as np
|
| 36 |
+
import timm
|
| 37 |
+
import torch
|
| 38 |
+
import torch.nn as nn
|
| 39 |
+
import torch.nn.functional as F
|
| 40 |
+
import yaml
|
| 41 |
+
from PIL import Image
|
| 42 |
+
from torchvision import transforms
|
| 43 |
+
from tqdm import tqdm
|
| 44 |
+
|
| 45 |
+
logging.basicConfig(
|
| 46 |
+
level=logging.INFO,
|
| 47 |
+
format="%(asctime)s [%(levelname)s] %(message)s",
|
| 48 |
+
)
|
| 49 |
+
logger = logging.getLogger(__name__)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
# ---------------------------------------------------------------------------
|
| 53 |
+
# Configuration
|
| 54 |
+
# ---------------------------------------------------------------------------
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
@dataclass
class ModelConfig:
    """Model configuration for rebuilding the network without coral_reid.

    Populated either from the ``model_config`` dict embedded in a checkpoint
    (:meth:`from_dict`) or from a training YAML file (:meth:`from_yaml`).
    """

    # Backbone settings
    backbone_variant: str
    img_size: int
    backbone_output_dim: int

    # Projection-head settings
    hidden_dim: int
    output_dim: int
    dropout: float
    use_batchnorm: bool

    @classmethod
    def from_dict(cls, d: dict) -> ModelConfig:
        """Build a config from a flat dict (as embedded in a checkpoint)."""
        return cls(
            backbone_variant=d["backbone_variant"],
            img_size=d.get("img_size", 224),
            backbone_output_dim=d["backbone_output_dim"],
            hidden_dim=d["hidden_dim"],
            output_dim=d["output_dim"],
            dropout=d.get("dropout", 0.3),
            use_batchnorm=d.get("use_batchnorm", True),
        )

    @classmethod
    def from_yaml(cls, path: str | Path) -> ModelConfig:
        """Build a config from a training YAML file (``backbone``/``head`` sections)."""
        with open(path) as fh:
            raw = yaml.safe_load(fh)

        backbone_cfg = raw["backbone"]
        head_cfg = raw["head"]

        # Flatten the nested YAML layout into the checkpoint-style dict and
        # reuse from_dict so both paths share the same defaults.
        return cls.from_dict({
            "backbone_variant": backbone_cfg["variant"],
            "img_size": backbone_cfg.get("img_size", 224),
            "backbone_output_dim": backbone_cfg["output_dim"],
            "hidden_dim": head_cfg["hidden_dim"],
            "output_dim": head_cfg["output_dim"],
            "dropout": head_cfg.get("dropout", 0.3),
            "use_batchnorm": head_cfg.get("use_batchnorm", True),
        })
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
# ---------------------------------------------------------------------------
|
| 105 |
+
# Model Architecture (standalone reconstruction)
|
| 106 |
+
# ---------------------------------------------------------------------------
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
class MLPHead(nn.Module):
    """MLP projection head that emits L2-normalized embeddings.

    Flow:
        BatchNorm1d → Dropout(0.2)
        → Linear → ReLU → Dropout → Linear → [BatchNorm1d]
        → L2 Normalize
    """

    def __init__(
        self,
        input_dim: int,
        hidden_dim: int,
        output_dim: int,
        dropout: float = 0.3,
        use_batchnorm: bool = True,
    ) -> None:
        super().__init__()

        # Input conditioning applied before the projection MLP.
        # NOTE: attribute names must match the checkpoint's state-dict keys.
        self.feature_processor = nn.Sequential(
            nn.BatchNorm1d(input_dim),
            nn.Dropout(p=0.2),
        )

        projection_layers: list[nn.Module] = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(hidden_dim, output_dim),
        ]
        if use_batchnorm:
            # Optional output normalization before the L2 step.
            projection_layers.append(nn.BatchNorm1d(output_dim))
        self.projection = nn.Sequential(*projection_layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Project, then place the embedding on the unit hypersphere.
        return F.normalize(self.projection(self.feature_processor(x)), p=2, dim=1)
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
class CoralReIDModel(nn.Module):
    """Coral re-identification model: a timm backbone feeding an MLP head."""

    def __init__(self, config: ModelConfig) -> None:
        super().__init__()

        # num_classes=0 strips timm's classifier so the backbone returns
        # pooled features; pretrained=False because all weights come from
        # the checkpoint loaded afterwards.
        self.backbone = timm.create_model(
            config.backbone_variant,
            pretrained=False,
            num_classes=0,
            img_size=config.img_size,
        )

        # Projection head mapping backbone features to the embedding space.
        self.head = MLPHead(
            input_dim=config.backbone_output_dim,
            hidden_dim=config.hidden_dim,
            output_dim=config.output_dim,
            dropout=config.dropout,
            use_batchnorm=config.use_batchnorm,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.head(self.backbone(x))
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def load_model(
    checkpoint_path: str | Path,
    device: str | torch.device = "cpu",
    config_path: str | Path | None = None,
) -> tuple[CoralReIDModel, ModelConfig]:
    """Load a CoralReIDModel from a checkpoint file.

    The model config is taken from the checkpoint's ``model_config`` entry
    when present; otherwise ``config_path`` (YAML) is used as a fallback.

    Args:
        checkpoint_path: Path to the .pt checkpoint file.
        device: Device to load the model on.
        config_path: Optional path to a YAML config (fallback).

    Returns:
        Tuple of (model, config).

    Raises:
        ValueError: If neither an embedded config nor ``config_path`` exists.
    """
    # weights_only=False is needed because the checkpoint also stores a
    # config dict — only load checkpoints from trusted sources.
    checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)

    checkpoint_is_dict = isinstance(checkpoint, dict)

    # Resolve config: checkpoint-embedded takes precedence over the YAML.
    if checkpoint_is_dict and "model_config" in checkpoint:
        config = ModelConfig.from_dict(checkpoint["model_config"])
    elif config_path is not None:
        config = ModelConfig.from_yaml(config_path)
    else:
        raise ValueError(
            "Checkpoint does not contain model_config and no --config provided. "
            "Use embed_config.py to add config to the checkpoint, or pass --config."
        )

    model = CoralReIDModel(config)

    # Either a wrapped training checkpoint or a raw state_dict.
    if checkpoint_is_dict and "model_state_dict" in checkpoint:
        raw_state = checkpoint["model_state_dict"]
    else:
        raw_state = checkpoint

    # The training code wraps timm ("backbone.model.*") while this script
    # instantiates timm directly ("backbone.*"); strip the extra level.
    remapped: dict[str, torch.Tensor] = {}
    for key, tensor in raw_state.items():
        if key.startswith("backbone.model."):
            remapped[key.replace("backbone.model.", "backbone.", 1)] = tensor
        else:
            remapped[key] = tensor

    model.load_state_dict(remapped)
    model.to(device)
    model.eval()

    logger.info(
        f"Loaded model: {config.backbone_variant} "
        f"({config.img_size}px, {config.output_dim}d embedding)"
    )
    return model, config
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
# ---------------------------------------------------------------------------
|
| 239 |
+
# Inference Transforms
|
| 240 |
+
# ---------------------------------------------------------------------------
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
def get_inference_transforms(image_size: int) -> transforms.Compose:
    """Build the deterministic inference pipeline matching training.

    Bicubic resize to a square, tensor conversion, then ImageNet
    mean/std normalization.
    """
    resize = transforms.Resize(
        (image_size, image_size),
        interpolation=transforms.InterpolationMode.BICUBIC,
    )
    to_tensor = transforms.ToTensor()
    normalize = transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    )
    return transforms.Compose([resize, to_tensor, normalize])
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
# ---------------------------------------------------------------------------
|
| 259 |
+
# Feature Extraction
|
| 260 |
+
# ---------------------------------------------------------------------------
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
@torch.no_grad()
def extract_single(
    model: CoralReIDModel,
    img_path: str | Path,
    transform: transforms.Compose,
    device: str | torch.device,
) -> np.ndarray | None:
    """Return the embedding for one image, or None if it cannot be processed."""
    try:
        image = Image.open(img_path).convert("RGB")
        batch = transform(image).unsqueeze(0).to(device)
        # Flatten (1, D) down to a 1-D vector for convenience.
        return model(batch).cpu().numpy().flatten()
    except Exception as e:
        logger.warning(f"Failed to process {img_path}: {e}")
        return None
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
@torch.no_grad()
def extract_directory(
    model: CoralReIDModel,
    directory: str | Path,
    transform: transforms.Compose,
    device: str | torch.device,
    batch_size: int = 32,
) -> tuple[np.ndarray, list[str]]:
    """Extract features for every .jpg/.jpeg/.png image in a directory.

    Images are processed in batches of ``batch_size``; unreadable files
    are logged and skipped.

    Returns:
        Tuple of (features array [N, D], list of coral names taken from
        the file stems).
    """
    directory = Path(directory)
    image_files = sorted(
        f
        for f in os.listdir(directory)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    )

    if not image_files:
        logger.warning(f"No images found in {directory}")
        return np.array([]), []

    feature_chunks: list[np.ndarray] = []
    coral_names: list[str] = []

    for start in tqdm(range(0, len(image_files), batch_size), desc=str(directory)):
        tensors: list[torch.Tensor] = []
        loaded_names: list[str] = []

        for fname in image_files[start : start + batch_size]:
            try:
                image = Image.open(directory / fname).convert("RGB")
                tensors.append(transform(image))
                # Coral name = filename without extension.
                loaded_names.append(os.path.splitext(fname)[0])
            except Exception as e:
                logger.warning(f"Skipping {fname}: {e}")

        if tensors:
            batch = torch.stack(tensors).to(device)
            feature_chunks.append(model(batch).cpu().numpy())
            coral_names.extend(loaded_names)

    if feature_chunks:
        return np.concatenate(feature_chunks, axis=0), coral_names
    return np.array([]), coral_names
|
| 333 |
+
|
| 334 |
+
|
| 335 |
+
def save_features_h5(
    path: str | Path,
    features: np.ndarray,
    coral_names: list[str],
    metadata: dict[str, str | int | float] | None = None,
) -> None:
    """Write a feature matrix and its coral names (plus optional metadata) to HDF5.

    Parent directories of *path* are created as needed. Non-None metadata
    values are stored as file attributes.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    encoded_names = [name.encode("utf-8") for name in coral_names]
    with h5py.File(path, "w") as handle:
        handle.create_dataset("features", data=features)
        handle.create_dataset("coral_names", data=encoded_names)
        # Shape info as attributes so files can be inspected without loading data.
        handle.attrs["feature_dim"] = features.shape[1] if len(features.shape) > 1 else 0
        handle.attrs["num_samples"] = features.shape[0]

        for key, value in (metadata or {}).items():
            if value is not None:
                handle.attrs[key] = value

    logger.info(f"Saved {len(coral_names)} features to {path}")
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
# ---------------------------------------------------------------------------
|
| 363 |
+
# CLI
|
| 364 |
+
# ---------------------------------------------------------------------------
|
| 365 |
+
|
| 366 |
+
|
| 367 |
+
def parse_args() -> argparse.Namespace:
    """Parse command-line options for standalone feature extraction."""
    p = argparse.ArgumentParser(
        description="Standalone feature extraction for coral re-identification models",
    )
    # Model / config.
    p.add_argument("--model", required=True, help="Path to model checkpoint (.pt)")
    p.add_argument(
        "--config",
        default=None,
        help="Path to YAML config file (optional if config is embedded in checkpoint)",
    )
    # Input / output locations.
    p.add_argument("--input", required=True, help="Path to image file or directory")
    p.add_argument(
        "--output",
        default=None,
        help="Output path (.h5 file or directory for area mode)",
    )
    # Area-mode options.
    p.add_argument(
        "--areas",
        nargs="+",
        default=None,
        help="Area IDs for N-Benchmark extraction (e.g., 37 38 39 40)",
    )
    p.add_argument(
        "--year",
        default=None,
        help="Year label for area mode filenames (e.g., 2022)",
    )
    # Runtime options.
    p.add_argument(
        "--batch-size",
        type=int,
        default=32,
        help="Batch size for extraction (default: 32)",
    )
    p.add_argument(
        "--device",
        default="cuda" if torch.cuda.is_available() else "cpu",
        help="Device (default: cuda if available)",
    )
    return p.parse_args()
|
| 414 |
+
|
| 415 |
+
|
| 416 |
+
def main() -> None:
    """CLI entry point: dispatch to single-image, area, or directory extraction.

    Modes are checked in order: a single image file, N-Benchmark area
    subdirectories (``--areas``), then a flat image directory. A missing
    input path is reported as an error.
    """
    args = parse_args()
    input_path = Path(args.input)

    model, config = load_model(args.model, args.device, config_path=args.config)
    transform = get_inference_transforms(config.img_size)

    if input_path.is_file():
        # --- Single image mode ---
        vec = extract_single(model, input_path, transform, args.device)
        if vec is not None:
            print(f"Image: {input_path.name}")
            print(f"Embedding shape: {vec.shape}")
            print(f"Embedding norm: {np.linalg.norm(vec):.4f}")
            if not args.output:
                print(f"Embedding: {vec[:8]}... (first 8 dims)")
            else:
                np.save(args.output, vec)
                logger.info(f"Saved embedding to {args.output}")
        return

    if args.areas:
        # --- Area mode (N-Benchmark style) ---
        output_dir = Path(args.output) if args.output else Path("features")
        output_dir.mkdir(parents=True, exist_ok=True)

        for area_id in args.areas:
            area_dir = input_path / area_id
            if not area_dir.exists():
                logger.warning(f"Area directory not found: {area_dir}")
                continue

            feats, names = extract_directory(
                model, area_dir, transform, args.device, args.batch_size,
            )
            if len(feats) == 0:
                continue

            # Year label is optional and only affects the output filename.
            if args.year:
                stem = f"features_{args.year}_{area_id}_whole"
            else:
                stem = f"features_{area_id}_whole"
            save_features_h5(
                output_dir / f"{stem}.h5",
                feats,
                names,
                {"area_id": area_id, "source_dir": str(area_dir)},
            )
        return

    if input_path.is_dir():
        # --- Directory mode ---
        feats, names = extract_directory(
            model, input_path, transform, args.device, args.batch_size,
        )
        if len(feats) == 0:
            logger.error("No features extracted")
        else:
            save_features_h5(
                args.output or "features.h5",
                feats,
                names,
                {"source_dir": str(input_path)},
            )
        return

    logger.error(f"Input path not found: {input_path}")
|
| 483 |
+
|
| 484 |
+
|
| 485 |
+
# Script entry point: run the extraction CLI when invoked directly.
if __name__ == "__main__":
    main()
|
legacy/dinov2_coral_best_model_20251015_165008.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6eb998eb8a37c7a539a36c6b32485beb9ff3d8c1fca5d5aacfa8cb9aefbfd47b
|
| 3 |
+
size 354828824
|
legacy/dinov2_coral_best_model_20251016_133229.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b3f8af48b28b22c23591f68a96144ce1fc4c3c597a5248476d59e5e0a2abaf26
|
| 3 |
+
size 354828824
|
pyproject.toml
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "coral-models"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Standalone inference for coral re-identification models"
|
| 5 |
+
requires-python = ">=3.10"
|
| 6 |
+
license = "GPL-3.0-or-later"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"torch>=2.0.0",
|
| 9 |
+
"torchvision>=0.15.0",
|
| 10 |
+
"timm>=1.0.0",
|
| 11 |
+
"h5py>=3.9.0",
|
| 12 |
+
"numpy>=1.24.0",
|
| 13 |
+
"pillow>=10.0.0",
|
| 14 |
+
"pyyaml>=6.0",
|
| 15 |
+
"tqdm>=4.65.0",
|
| 16 |
+
]
|
| 17 |
+
|
| 18 |
+
[tool.uv]
|
| 19 |
+
package = false
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|