Image Segmentation
Transformers
Safetensors
remote-sensing
earth-observation
open-vocabulary
clip
sam3
semantic-segmentation
Instructions to use Dingyi111/SegEarth-OV with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use Dingyi111/SegEarth-OV with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-segmentation", model="Dingyi111/SegEarth-OV")

# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("Dingyi111/SegEarth-OV", dtype="auto")
- Notebooks
- Google Colab
- Kaggle
Commit ·
fabc606
0
Parent(s):
Duplicate from BiliSakura/SegEarth-OV
Browse filesCo-authored-by: Sakura <BiliSakura@users.noreply.huggingface.co>
This view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +13 -0
- OV-2/config.json +12 -0
- OV-2/configs/cls_chn6-cug.txt +2 -0
- OV-2/configs/cls_ddhrnet_korea_sar.txt +5 -0
- OV-2/configs/cls_ddhrnet_shandong_sar.txt +5 -0
- OV-2/configs/cls_ddhrnet_xian_sar.txt +5 -0
- OV-2/configs/cls_deepglobe.txt +7 -0
- OV-2/configs/cls_fusar.txt +5 -0
- OV-2/configs/cls_hrsid.txt +2 -0
- OV-2/configs/cls_iSAID.txt +16 -0
- OV-2/configs/cls_inria.txt +2 -0
- OV-2/configs/cls_loveda.txt +7 -0
- OV-2/configs/cls_massachusetts_building.txt +2 -0
- OV-2/configs/cls_openearthmap.txt +9 -0
- OV-2/configs/cls_openearthmap_sar.txt +9 -0
- OV-2/configs/cls_pie_sar.txt +6 -0
- OV-2/configs/cls_potsdam.txt +6 -0
- OV-2/configs/cls_roadval.txt +2 -0
- OV-2/configs/cls_uavid.txt +7 -0
- OV-2/configs/cls_udd5.txt +5 -0
- OV-2/configs/cls_vaihingen.txt +6 -0
- OV-2/configs/cls_vdd.txt +7 -0
- OV-2/configs/cls_wbs-si.txt +2 -0
- OV-2/configs/cls_whu.txt +2 -0
- OV-2/configs/cls_whu_sar.txt +7 -0
- OV-2/configs/cls_xBD.txt +2 -0
- OV-2/configs/cls_yeseg_sar.txt +6 -0
- OV-2/pipeline.py +38 -0
- OV-2/prompts/imagenet_template.py +97 -0
- OV-2/upsamplers.py +251 -0
- OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/README.md +170 -0
- OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/cls_openearthmap_sar.txt +9 -0
- OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/config.json +42 -0
- OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/configs/cls_ddhrnet_xian_sar.txt +5 -0
- OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/configs/cls_fusar.txt +5 -0
- OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/configs/cls_hrsid.txt +2 -0
- OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/configs/cls_openearthmap_sar.txt +9 -0
- OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/configs/cls_pie_sar.txt +6 -0
- OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/configs/cls_whu_sar.txt +7 -0
- OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/configs/cls_yeseg_sar.txt +6 -0
- OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/conversion_meta.json +12 -0
- OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/demo_YESeg-OPT-SAR/README.md +8 -0
- OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/demo_YESeg-OPT-SAR/cvt_label.png +0 -0
- OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/demo_YESeg-OPT-SAR/label.png +0 -0
- OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/demo_YESeg-OPT-SAR/rgb.png +0 -0
- OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/demo_YESeg-OPT-SAR/sar.png +0 -0
- OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/demo_result.png +3 -0
- OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/eval.py +163 -0
- OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/merges.txt +0 -0
- OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/model.safetensors +3 -0
.gitattributes
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/model.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/demo_result.png filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/simfeatup/xclip_jbu_one_million_aid.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/simfeatup/clip_jbu_stack_cocostuff.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/simfeatup/maskclip_jbu_stack_cocostuff.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
OV-2/weights/featup/xclip_jbu_one_million_aid.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
OV-2/weights/featup/clip_jbu_stack_cocostuff.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
OV-2/weights/featup/maskclip_jbu_stack_cocostuff.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
OV-3/weights/backbone/sam3/model.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
OV/weights/backbone/clip-vit-base-patch16/model.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
OV/weights/featup/xclip_jbu_one_million_aid.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
OV/weights/featup/clip_jbu_stack_cocostuff.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
OV/weights/featup/maskclip_jbu_stack_cocostuff.ckpt filter=lfs diff=lfs merge=lfs -text
|
OV-2/config.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"version": "OV-2",
|
| 3 |
+
"backbone": "AlignEarth",
|
| 4 |
+
"model_id": "BiliSakura/AlignEarth-SAR-ViT-B-16",
|
| 5 |
+
"vit_type": "ViT-B/16",
|
| 6 |
+
"featup": "jbu_one",
|
| 7 |
+
"featup_weights": "weights/featup/xclip_jbu_one_million_aid.ckpt",
|
| 8 |
+
"cls_token_lambda": -0.3,
|
| 9 |
+
"logit_scale": 50.0,
|
| 10 |
+
"notes": "SAR-adapted CLIP via knowledge distillation. Also supports openai/clip-vit-base-patch16.",
|
| 11 |
+
"local_backbone": "weights/backbone/AlignEarth-SAR-ViT-B-16"
|
| 12 |
+
}
|
OV-2/configs/cls_chn6-cug.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
background
|
| 2 |
+
road
|
OV-2/configs/cls_ddhrnet_korea_sar.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
building
|
| 2 |
+
road
|
| 3 |
+
greenery
|
| 4 |
+
water
|
| 5 |
+
farmland,grass
|
OV-2/configs/cls_ddhrnet_shandong_sar.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
farmland,grass
|
| 2 |
+
greenery
|
| 3 |
+
road
|
| 4 |
+
building
|
| 5 |
+
water
|
OV-2/configs/cls_ddhrnet_xian_sar.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
building
|
| 2 |
+
road
|
| 3 |
+
farmland,grass
|
| 4 |
+
greenery
|
| 5 |
+
water
|
OV-2/configs/cls_deepglobe.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
urban
|
| 2 |
+
agriculture
|
| 3 |
+
rangeland
|
| 4 |
+
forest
|
| 5 |
+
water
|
| 6 |
+
barren
|
| 7 |
+
background
|
OV-2/configs/cls_fusar.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
others
|
| 2 |
+
water
|
| 3 |
+
road
|
| 4 |
+
building
|
| 5 |
+
grass
|
OV-2/configs/cls_hrsid.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
background
|
| 2 |
+
ship,boat
|
OV-2/configs/cls_iSAID.txt
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
background
|
| 2 |
+
ship
|
| 3 |
+
store tank
|
| 4 |
+
baseball diamond
|
| 5 |
+
tennis court
|
| 6 |
+
basketball court
|
| 7 |
+
ground track field
|
| 8 |
+
bridge
|
| 9 |
+
large vehicle
|
| 10 |
+
small vehicle
|
| 11 |
+
helicopter
|
| 12 |
+
swimming pool
|
| 13 |
+
roundabout
|
| 14 |
+
soccer ball field
|
| 15 |
+
plane
|
| 16 |
+
harbor
|
OV-2/configs/cls_inria.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
background
|
| 2 |
+
building
|
OV-2/configs/cls_loveda.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
background
|
| 2 |
+
building,roof,house
|
| 3 |
+
road
|
| 4 |
+
water
|
| 5 |
+
barren
|
| 6 |
+
forest
|
| 7 |
+
agricultural
|
OV-2/configs/cls_massachusetts_building.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
background
|
| 2 |
+
building
|
OV-2/configs/cls_openearthmap.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
background
|
| 2 |
+
bareland,barren
|
| 3 |
+
grass
|
| 4 |
+
pavement
|
| 5 |
+
road
|
| 6 |
+
tree,forest
|
| 7 |
+
water,river
|
| 8 |
+
cropland
|
| 9 |
+
building,roof,house
|
OV-2/configs/cls_openearthmap_sar.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
background
|
| 2 |
+
bareland,barren
|
| 3 |
+
grass
|
| 4 |
+
pavement
|
| 5 |
+
road
|
| 6 |
+
tree,forest
|
| 7 |
+
water,river
|
| 8 |
+
cropland
|
| 9 |
+
building,roof,house
|
OV-2/configs/cls_pie_sar.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
background
|
| 2 |
+
city
|
| 3 |
+
road
|
| 4 |
+
water,river
|
| 5 |
+
forest
|
| 6 |
+
cropland
|
OV-2/configs/cls_potsdam.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
road
|
| 2 |
+
building
|
| 3 |
+
grass
|
| 4 |
+
tree
|
| 5 |
+
car
|
| 6 |
+
clutter,background
|
OV-2/configs/cls_roadval.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
background
|
| 2 |
+
road
|
OV-2/configs/cls_uavid.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
background
|
| 2 |
+
building
|
| 3 |
+
road
|
| 4 |
+
car
|
| 5 |
+
tree
|
| 6 |
+
vegetation
|
| 7 |
+
human
|
OV-2/configs/cls_udd5.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
vegetation
|
| 2 |
+
building
|
| 3 |
+
road
|
| 4 |
+
vehicle
|
| 5 |
+
background
|
OV-2/configs/cls_vaihingen.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
road
|
| 2 |
+
building
|
| 3 |
+
grass
|
| 4 |
+
tree
|
| 5 |
+
car
|
| 6 |
+
clutter
|
OV-2/configs/cls_vdd.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
background
|
| 2 |
+
facade
|
| 3 |
+
road
|
| 4 |
+
vegetation
|
| 5 |
+
vehicle
|
| 6 |
+
roof
|
| 7 |
+
water
|
OV-2/configs/cls_wbs-si.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
background
|
| 2 |
+
water
|
OV-2/configs/cls_whu.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
background
|
| 2 |
+
building
|
OV-2/configs/cls_whu_sar.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
farmland
|
| 2 |
+
city
|
| 3 |
+
village
|
| 4 |
+
water
|
| 5 |
+
forest
|
| 6 |
+
road
|
| 7 |
+
others
|
OV-2/configs/cls_xBD.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
background
|
| 2 |
+
building
|
OV-2/configs/cls_yeseg_sar.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
bareground,barren
|
| 2 |
+
grass,farmland
|
| 3 |
+
dense tree cover
|
| 4 |
+
city
|
| 5 |
+
water
|
| 6 |
+
roadway
|
OV-2/pipeline.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SegEarth OV-2 pipeline. Self-contained; uses config.json and weights/featup/ in this folder.
|
| 3 |
+
"""
|
| 4 |
+
import json
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import sys
|
| 8 |
+
_parent = Path(__file__).resolve().parent.parent
|
| 9 |
+
if str(_parent) not in sys.path:
|
| 10 |
+
sys.path.insert(0, str(_parent))
|
| 11 |
+
from pipeline import SegEarthPipelineCLIP
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def load(config_path: Path = None, model_id: str = None, **kwargs):
    """Load the OV-2 pipeline using the configuration stored next to this file.

    Backbone resolution order (first match wins):
      1. the explicit ``model_id`` argument, when truthy;
      2. the ``local_backbone`` folder named in the config, if it exists on disk;
      3. the ``model_id`` entry of the config.

    Args:
        config_path: Optional path to a JSON config. Defaults to
            ``config.json`` in this file's directory.
        model_id: Optional backbone identifier or path overriding the config.
        **kwargs: Forwarded to ``SegEarthPipelineCLIP``. Keys supplied here are
            never overwritten by config-derived defaults.

    Returns:
        The object produced by ``SegEarthPipelineCLIP(**kwargs)``.

    Raises:
        KeyError: If no backbone could be resolved and the config has no
            ``model_id`` entry.
    """
    repo_dir = Path(__file__).parent
    cfg_path = config_path or repo_dir / "config.json"
    with open(cfg_path, encoding="utf-8") as f:
        cfg = json.load(f)

    # An explicit argument must win; previously the bundled backbone was set
    # first, which made ``model_id`` a no-op whenever the local folder existed.
    if model_id:
        kwargs.setdefault("model_id", model_id)

    local_backbone = cfg.get("local_backbone")
    if local_backbone:
        local_path = repo_dir / local_backbone
        if local_path.exists():
            kwargs.setdefault("model_id", str(local_path))

    # Only touch cfg["model_id"] when it is really needed, so configs without
    # that key keep working when a backbone was resolved above.
    if "model_id" not in kwargs:
        kwargs["model_id"] = cfg["model_id"]

    kwargs.setdefault("featup_model", cfg.get("featup") or "jbu_one")
    kwargs.setdefault("cls_token_lambda", cfg.get("cls_token_lambda", -0.3))
    kwargs.setdefault("logit_scale", cfg.get("logit_scale", 50.0))

    # Only the file name of the configured checkpoint is used; it is looked up
    # under this folder's weights/featup directory.
    featup_name = (cfg.get("featup_weights") or "xclip_jbu_one_million_aid.ckpt").split("/")[-1]
    local_featup = repo_dir / "weights" / "featup" / featup_name
    if local_featup.exists():
        kwargs.setdefault("featup_weights_path", local_featup)

    kwargs.setdefault("class_names_path", repo_dir / "configs" / "cls_openearthmap_sar.txt")
    return SegEarthPipelineCLIP(**kwargs)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
SegEarthPipeline = load
|
OV-2/prompts/imagenet_template.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Text prompts for open-vocabulary segmentation.
|
| 3 |
+
From SegEarth-OV-2/prompts/imagenet_template.py
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
def _prompt_from(pattern):
    """Return a one-argument callable that fills ``{c}`` in ``pattern``."""

    def render(c):
        return pattern.format(c=c)

    return render


# One callable per prompt; each takes a class name and returns the full sentence.
openai_imagenet_template = [
    _prompt_from(pattern)
    for pattern in (
        "a bad photo of a {c}.",
        "a photo of many {c}.",
        "a sculpture of a {c}.",
        "a photo of the hard to see {c}.",
        "a low resolution photo of the {c}.",
        "a rendering of a {c}.",
        "graffiti of a {c}.",
        "a bad photo of the {c}.",
        "a cropped photo of the {c}.",
        "a tattoo of a {c}.",
        "the embroidered {c}.",
        "a photo of a hard to see {c}.",
        "a bright photo of a {c}.",
        "a photo of a clean {c}.",
        "a photo of a dirty {c}.",
        "a dark photo of the {c}.",
        "a drawing of a {c}.",
        "a photo of my {c}.",
        "the plastic {c}.",
        "a photo of the cool {c}.",
        "a close-up photo of a {c}.",
        "a black and white photo of the {c}.",
        "a painting of the {c}.",
        "a painting of a {c}.",
        "a pixelated photo of the {c}.",
        "a sculpture of the {c}.",
        "a bright photo of the {c}.",
        "a cropped photo of a {c}.",
        "a plastic {c}.",
        "a photo of the dirty {c}.",
        "a jpeg corrupted photo of a {c}.",
        "a blurry photo of the {c}.",
        "a photo of the {c}.",
        "a good photo of the {c}.",
        "a rendering of the {c}.",
        "a {c} in a video game.",
        "a photo of one {c}.",
        "a doodle of a {c}.",
        "a close-up photo of the {c}.",
        "a photo of a {c}.",
        "the origami {c}.",
        "the {c} in a video game.",
        "a sketch of a {c}.",
        "a doodle of the {c}.",
        "a origami {c}.",
        "a low resolution photo of a {c}.",
        "the toy {c}.",
        "a rendition of the {c}.",
        "a photo of the clean {c}.",
        "a photo of a large {c}.",
        "a rendition of a {c}.",
        "a photo of a nice {c}.",
        "a photo of a weird {c}.",
        "a blurry photo of a {c}.",
        "a cartoon {c}.",
        "art of a {c}.",
        "a sketch of the {c}.",
        "a embroidered {c}.",
        "a pixelated photo of a {c}.",
        "itap of the {c}.",
        "a jpeg corrupted photo of the {c}.",
        "a good photo of a {c}.",
        "a plushie {c}.",
        "a photo of the nice {c}.",
        "a photo of the small {c}.",
        "a photo of the weird {c}.",
        "the cartoon {c}.",
        "art of the {c}.",
        "a drawing of the {c}.",
        "a photo of the large {c}.",
        "a black and white photo of a {c}.",
        "the plushie {c}.",
        "a dark photo of a {c}.",
        "itap of a {c}.",
        "graffiti of the {c}.",
        "a toy {c}.",
        "itap of my {c}.",
        "a photo of a cool {c}.",
        "a photo of a small {c}.",
        "a tattoo of the {c}.",
    )
]
|
| 88 |
+
|
| 89 |
+
def _prompt_from(pattern):
    """Return a one-argument callable that fills ``{c}`` in ``pattern``."""

    def render(c):
        return pattern.format(c=c)

    return render


# Seven-prompt subset; each entry maps a class name to a full sentence.
sub_imagenet_template = [
    _prompt_from(pattern)
    for pattern in (
        "itap of a {c}.",
        "a bad photo of a {c}.",
        "a origami {c}.",
        "a photo of the large {c}.",
        "a {c} in a video game.",
        "art of the {c}.",
        "a photo of the small {c}.",
    )
]
|
OV-2/upsamplers.py
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SimFeatUp upsamplers for dense feature restoration.
|
| 3 |
+
From SegEarth-OV/OV-2 simfeatup_dev. Used by CLIP-based variants (OV, OV-2).
|
| 4 |
+
"""
|
| 5 |
+
import math
|
| 6 |
+
from typing import Optional
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
import torch.nn as nn
|
| 10 |
+
import torch.nn.functional as F
|
| 11 |
+
|
| 12 |
+
try:
|
| 13 |
+
from featup.adaptive_conv_cuda.adaptive_conv import AdaptiveConv
|
| 14 |
+
except Exception:
|
| 15 |
+
AdaptiveConv = None
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def adaptive_conv_py_simple(input, filters):
    """Apply a different square kernel at every output location (pure PyTorch).

    Used as the fallback when the featup CUDA extension is not importable.

    Args:
        input: 4-D tensor unpacked as ``(b, c, h1, w1)``.
        filters: 5-D tensor unpacked as ``(b, h2, w2, f1, f2)`` holding one
            ``f1 x f2`` kernel per output pixel; ``f1`` must equal ``f2``.

    Returns:
        Tensor laid out as ``(b, c, h2, w2)`` per the einsum output spec.
    """
    _, channels, _, _ = input.shape
    batch, out_h, out_w, k_rows, k_cols = filters.shape
    assert k_rows == k_cols
    taps = k_rows * k_cols
    # Every sliding window of the input, flattened along the kernel axis.
    windows = F.unfold(input, k_rows).view((batch, channels, taps, out_h, out_w))
    weights = filters.reshape(batch, out_h, out_w, taps)
    return torch.einsum("bhwf,bcfhw->bchw", weights, windows)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def _meshgrid(device, diameter):
    """Return a ``(2, diameter, diameter)`` grid of coordinates in ``[-1, 1]``.

    Channel 0 varies along rows and channel 1 along columns ("ij" indexing).
    """
    axis = torch.linspace(-1, 1, diameter, device=device)
    rows, cols = torch.meshgrid(axis, axis, indexing="ij")
    return torch.stack([rows, cols], dim=0)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class Bilinear(torch.nn.Module):
    """Parameter-free upsampler: bilinear resize to the guidance resolution."""

    def forward(self, source, guidance):
        """Resize ``source`` to the last two dimensions of 4-D ``guidance``."""
        height, width = guidance.shape[2:]
        return F.interpolate(source, (height, width), mode="bilinear")
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class LayeredResizeConv(torch.nn.Module):
    """Four-stage 2x upsampler: bilinear resize plus a guided conv residual.

    Each stage doubles the resolution, concatenates the resized guidance and
    adds the activated convolution output back onto the resized features.
    The ``dim + 3`` input width means guidance is expected to have 3 channels.
    """

    def __init__(self, dim, kernel_size=1, *args, **kwargs):
        super().__init__(*args, **kwargs)
        in_channels = dim + 3
        self.conv1 = nn.Conv2d(in_channels, dim, kernel_size, padding="same")
        self.conv2 = nn.Conv2d(in_channels, dim, kernel_size, padding="same")
        self.conv3 = nn.Conv2d(in_channels, dim, kernel_size, padding="same")
        self.conv4 = nn.Conv2d(in_channels, dim, kernel_size, padding="same")

    def apply_conv(self, source, guidance, conv, activation):
        """Run one 2x stage and return the refined, upsampled features."""
        upsampled = F.interpolate(source, scale_factor=2, mode="bilinear")
        height, width = upsampled.shape[2:]
        resized_guidance = F.interpolate(guidance, (height, width), mode="bilinear")
        stacked = torch.cat([upsampled, resized_guidance], dim=1)
        residual = activation(conv(stacked))
        return upsampled + residual

    def forward(self, source, guidance):
        """Upsample ``source`` by 16x (four 2x stages); the last stage is linear."""
        stages = (
            (self.conv1, F.relu),
            (self.conv2, F.relu),
            (self.conv3, F.relu),
            (self.conv4, lambda x: x),
        )
        features = source
        for conv, activation in stages:
            features = self.apply_conv(features, guidance, conv, activation)
        return features
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class SimpleImplicitFeaturizer(torch.nn.Module):
    """Prepend sinusoidal encodings of the pixel grid to a feature map.

    For an input with ``c`` channels the output has ``4 * n_freqs + c``
    channels: sines, then cosines, then the untouched input.
    """

    def __init__(self, n_freqs=20):
        super().__init__()
        self.n_freqs = n_freqs
        # Two spatial coordinates (row, column) are encoded per frequency.
        self.dim_multiplier = 2

    def forward(self, x):
        """Return ``cat([sin(enc), cos(enc), x], dim=1)`` for 4-D ``x``."""
        batch, _, height, width = x.shape
        rows = torch.linspace(-1, 1, height, device=x.device, dtype=x.dtype)
        cols = torch.linspace(-1, 1, width, device=x.device, dtype=x.dtype)
        grid = torch.stack(torch.meshgrid(rows, cols, indexing="ij"))[None]
        grid = grid.expand(batch, grid.shape[1], height, width)
        # Frequencies are spaced evenly in log-space from e^-2 to e^10.
        bands = torch.exp(torch.linspace(-2, 10, self.n_freqs, device=x.device)).to(x.dtype)
        bands = bands.reshape(1, self.n_freqs, 1, 1, 1)
        scaled = (grid[:, None] * bands).reshape(
            batch, self.n_freqs * self.dim_multiplier, height, width
        )
        return torch.cat([torch.sin(scaled), torch.cos(scaled), x], dim=1)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class IFA(torch.nn.Module):
    """Upsampler that repeatedly doubles resolution with a coordinate-aware MLP.

    Each 2x step nearest-neighbour upsamples the features, concatenates a
    sinusoidal encoding of the offset between low- and high-resolution
    coordinates, and passes the result through a 1x1-conv MLP.

    Args:
        feat_dim: Number of feature channels in and out.
        num_scales: Number of frequencies in the coordinate encoding. It sizes
            both the featurizer and the first MLP layer so the two stay in
            agreement.
    """

    def __init__(self, feat_dim, num_scales=20):
        super().__init__()
        self.feat_dim = feat_dim
        # The featurizer emits 4 * num_scales + 2 channels for a 2-channel
        # input, which is exactly what the first conv below expects. It was
        # previously always built with its default of 20 frequencies, so any
        # other num_scales produced a channel mismatch.
        self.sin_feats = SimpleImplicitFeaturizer(num_scales)
        self.mlp = nn.Sequential(
            nn.Conv2d(feat_dim + (num_scales * 4) + 2, feat_dim, 1),
            nn.BatchNorm2d(feat_dim),
            nn.LeakyReLU(),
            nn.Conv2d(feat_dim, feat_dim, 1),
        )

    def _upsample_2x(self, source):
        """Double the spatial size of 4-D ``source`` and refine it with the MLP."""
        b, c, h, w = source.shape
        dtype = source.dtype
        up_source = F.interpolate(source, (h * 2, w * 2), mode="nearest")
        # Rows and columns get their own axes. Using the height for both only
        # worked for square inputs and raised a shape error otherwise.
        lr_rows = torch.linspace(0, h, steps=h, device=source.device, dtype=dtype)
        hr_rows = torch.linspace(0, h, steps=2 * h, device=source.device, dtype=dtype)
        lr_cols = torch.linspace(0, w, steps=w, device=source.device, dtype=dtype)
        hr_cols = torch.linspace(0, w, steps=2 * w, device=source.device, dtype=dtype)
        lr_coords = torch.stack(torch.meshgrid(lr_rows, lr_cols, indexing="ij")).unsqueeze(0)
        hr_coords = torch.stack(torch.meshgrid(hr_rows, hr_cols, indexing="ij")).unsqueeze(0)
        up_lr_coords = F.interpolate(lr_coords, (h * 2, w * 2), mode="nearest")
        # Offset of each high-res pixel from the low-res pixel it was copied from.
        coord_diff = up_lr_coords - hr_coords
        coord_diff_feats = self.sin_feats(coord_diff).to(dtype)
        bcast_coord_feats = coord_diff_feats.broadcast_to(
            (b, coord_diff_feats.shape[1], h * 2, w * 2)
        )
        return self.mlp(torch.cat([up_source, bcast_coord_feats], dim=1))

    def forward(self, source, guidance):
        """Upsample ``source`` to the spatial size of 4-D ``guidance``.

        Doubles until both dimensions reach the target, then bilinearly resizes
        if the result overshoots or is not an exact match.
        """
        _, _, gh, gw = guidance.shape
        x = source
        while x.shape[2] < gh or x.shape[3] < gw:
            x = self._upsample_2x(x)
        if x.shape[2] != gh or x.shape[3] != gw:
            x = F.interpolate(x, (gh, gw), mode="bilinear")
        return x
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
class JBULearnedRange(torch.nn.Module):
    """Joint-bilateral-style upsampling layer with a learned range kernel.

    ``source`` is bicubically resized to the guidance resolution and then
    filtered with a per-pixel kernel: the normalised product of a Gaussian
    spatial kernel and a softmax range kernel computed from projected
    guidance, plus a small learned correction.

    Args:
        guidance_dim: Channel count of the guidance tensor.
        feat_dim: Stored on the instance; not read by the code in this class.
        key_dim: Channel count of the projected guidance used for the range kernel.
        scale: Stored on the instance; not read by the code in this class.
        radius: Kernel radius; the kernel is ``2 * radius + 1`` wide.
    """

    def __init__(self, guidance_dim, feat_dim, key_dim, scale=2, radius=3):
        super().__init__()
        self.scale = scale
        self.radius = radius
        self.diameter = self.radius * 2 + 1
        self.guidance_dim = guidance_dim
        self.key_dim = key_dim
        self.feat_dim = feat_dim
        taps = self.diameter ** 2
        # Log-temperature of the range softmax (exponentiated before use).
        self.range_temp = nn.Parameter(torch.tensor(0.0))
        self.range_proj = nn.Sequential(
            nn.Conv2d(guidance_dim, key_dim, 1, 1),
            nn.GELU(),
            nn.Dropout2d(0.1),
            nn.Conv2d(key_dim, key_dim, 1, 1),
        )
        self.fixup_proj = nn.Sequential(
            nn.Conv2d(guidance_dim + taps, taps, 1, 1),
            nn.GELU(),
            nn.Dropout2d(0.1),
            nn.Conv2d(taps, taps, 1, 1),
        )
        self.sigma_spatial = nn.Parameter(torch.tensor(1.0))

    def get_range_kernel(self, x):
        """Return softmax similarities between each pixel and its neighbourhood.

        Output layout is ``(batch, diameter**2, H, W)`` for 4-D ``x``.
        """
        batch, _, height, width = x.shape
        keys = self.range_proj(x)
        padded_keys = F.pad(keys, pad=[self.radius] * 4, mode="reflect")
        neighbours = torch.nn.Unfold(self.diameter)(padded_keys)
        neighbours = neighbours.reshape(
            (batch, self.key_dim, self.diameter * self.diameter, height, width)
        )
        neighbours = neighbours.permute(0, 1, 3, 4, 2)
        # Clamp the learned temperature so the softmax stays numerically sane.
        temperature = self.range_temp.exp().clamp_min(1e-4).clamp_max(1e4)
        similarity = torch.einsum("bchwp,bchw->bphw", neighbours, keys)
        return F.softmax(temperature * similarity, dim=1)

    def get_spatial_kernel(self, device):
        """Return the Gaussian spatial weights as ``(1, diameter**2, 1, 1)``."""
        offsets = _meshgrid(device, self.diameter)
        squared_distance = offsets.square().sum(0)
        weights = torch.exp(-squared_distance / (2 * self.sigma_spatial ** 2))
        return weights.reshape(1, self.diameter * self.diameter, 1, 1)

    def forward(self, source, guidance):
        """Upsample ``source`` to the spatial size of ``guidance``.

        Both inputs must be 4-D with matching batch size; ``guidance`` is cast
        to the dtype of ``source``.
        """
        g_batch, _, g_height, g_width = guidance.shape
        s_batch, _, _, _ = source.shape
        assert s_batch == g_batch
        dtype = source.dtype
        guidance = guidance.to(dtype)

        spatial = self.get_spatial_kernel(source.device).to(dtype)
        range_weights = self.get_range_kernel(guidance).to(dtype)
        kernel = (range_weights * spatial).to(dtype)
        # Normalise across the kernel taps, then add the learned correction.
        kernel /= kernel.sum(1, keepdim=True).clamp(1e-7)
        kernel += 0.1 * self.fixup_proj(torch.cat([kernel, guidance], dim=1))
        kernel = kernel.permute(0, 2, 3, 1).reshape(
            g_batch, g_height, g_width, self.diameter, self.diameter
        )

        resized = F.interpolate(
            source, size=(g_height, g_width), mode="bicubic", align_corners=False
        )
        padded = F.pad(resized, pad=[self.radius] * 4, mode="reflect")
        kernel = kernel.to(padded.dtype)
        # Prefer the featup CUDA op; fall back to the pure PyTorch version.
        if AdaptiveConv is None:
            return adaptive_conv_py_simple(padded, kernel)
        return AdaptiveConv.apply(padded, kernel)
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
class JBUStack(torch.nn.Module):
    """16x upsampler made of four independent ``JBULearnedRange`` 2x stages.

    Guidance is expected to have 3 channels (each stage is built with
    ``guidance_dim=3``). A lightly weighted 1x1 projection is added to the
    final features.
    """

    def __init__(self, feat_dim, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.up1 = JBULearnedRange(3, feat_dim, 32, radius=3)
        self.up2 = JBULearnedRange(3, feat_dim, 32, radius=3)
        self.up3 = JBULearnedRange(3, feat_dim, 32, radius=3)
        self.up4 = JBULearnedRange(3, feat_dim, 32, radius=3)
        self.fixup_proj = nn.Sequential(
            nn.Dropout2d(0.2),
            nn.Conv2d(feat_dim, feat_dim, kernel_size=1),
        )

    def upsample(self, source, guidance, up):
        """Run one stage against guidance pooled to twice the source size."""
        height, width = source.shape[2:]
        pooled_guidance = F.adaptive_avg_pool2d(guidance, (height * 2, width * 2))
        return up(source, pooled_guidance)

    def forward(self, source, guidance):
        """Apply the four stages in order and add the scaled projection."""
        features = source
        for stage in (self.up1, self.up2, self.up3, self.up4):
            features = self.upsample(features, guidance, stage)
        return self.fixup_proj(features) * 0.1 + features
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
class JBUOne(torch.nn.Module):
    """16x upsampler that reuses one ``JBULearnedRange`` stage four times.

    Unlike ``JBUStack`` the weights are shared across all four 2x steps, and
    the stage uses a larger kernel (``radius=5``). Guidance is expected to
    have 3 channels.
    """

    def __init__(self, feat_dim, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.up = JBULearnedRange(3, feat_dim, 32, radius=5)
        self.fixup_proj = nn.Sequential(
            nn.Dropout2d(0.2),
            nn.Conv2d(feat_dim, feat_dim, kernel_size=1),
        )

    def upsample(self, source, guidance, up):
        """Run one stage against guidance pooled to twice the source size."""
        height, width = source.shape[2:]
        pooled_guidance = F.adaptive_avg_pool2d(guidance, (height * 2, width * 2))
        return up(source, pooled_guidance)

    def forward(self, source, guidance):
        """Apply the shared stage four times and add the scaled projection."""
        features = source
        for _ in range(4):
            features = self.upsample(features, guidance, self.up)
        return self.fixup_proj(features) * 0.1 + features
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
# Relative checkpoint path for each pretrained SimFeatUp variant.
FEATUP_CHECKPOINTS = dict(
    jbu_one="simfeatup/xclip_jbu_one_million_aid.ckpt",
    jbu_stack="simfeatup/clip_jbu_stack_cocostuff.ckpt",
    jbu_stack_maskclip="simfeatup/maskclip_jbu_stack_cocostuff.ckpt",
)
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
def get_upsampler(name: str, feat_dim: int):
    """Build the upsampler module registered under ``name``.

    Args:
        name: One of ``bilinear``, ``jbu_one``, ``jbu_stack``, ``resize_conv``
            or ``ifa``.
        feat_dim: Feature channel count passed to the upsampler constructor
            (unused by ``bilinear``).

    Returns:
        A freshly constructed upsampler module.

    Raises:
        ValueError: If ``name`` is not a known upsampler.
    """
    # Factories are deferred so only the requested class is instantiated.
    registry = (
        ("bilinear", lambda: Bilinear()),
        ("jbu_one", lambda: JBUOne(feat_dim)),
        ("jbu_stack", lambda: JBUStack(feat_dim)),
        ("resize_conv", lambda: LayeredResizeConv(feat_dim, 1)),
        ("ifa", lambda: IFA(feat_dim)),
    )
    for key, build in registry:
        if name == key:
            return build()
    raise ValueError(f"Unknown upsampler: {name}. Use: bilinear, jbu_one, jbu_stack, resize_conv, ifa")
|
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/README.md
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
pipeline_tag: image-segmentation
|
| 4 |
+
tags:
|
| 5 |
+
- clip
|
| 6 |
+
- vision
|
| 7 |
+
- remote-sensing
|
| 8 |
+
- sar
|
| 9 |
+
- earth-observation
|
| 10 |
+
- open-vocabulary
|
| 11 |
+
library_name: transformers
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
# AlignEarth-SAR-ViT-B-16
|
| 15 |
+
|
| 16 |
+
CLIP-style vision-language model adapted for **Synthetic Aperture Radar (SAR)** imagery via knowledge distillation from optical VLMs. Enables open-vocabulary semantic segmentation for SAR remote sensing without building SAR foundation models from scratch.
|
| 17 |
+
|
| 18 |
+
This repository provides the model in **Hugging Face Transformers** format, converted from the original [OpenCLIP-style checkpoint](https://huggingface.co/likyoo/AlignEarth-SAR-ViT-B-16) released by the SegEarth-OV-2 authors.
|
| 19 |
+
|
| 20 |
+
## Model Details
|
| 21 |
+
|
| 22 |
+
- **Architecture**: CLIP (ViT-B/16 vision encoder + text encoder)
|
| 23 |
+
- **Vision**: 12-layer ViT, 768 hidden, 16×16 patches, 224×224 input
|
| 24 |
+
- **Text**: 12-layer transformer, 512 hidden, vocab 49408, max length 77
|
| 25 |
+
- **Projection**: 512-dim shared embedding space
|
| 26 |
+
- **Source**: [likyoo/AlignEarth-SAR-ViT-B-16](https://huggingface.co/likyoo/AlignEarth-SAR-ViT-B-16) (OpenCLIP format)
|
| 27 |
+
- **Conversion**: Mapped to `transformers.CLIPModel` for standard HF usage
|
| 28 |
+
- **SimFeatUp**: Full upsampler suite from [SegEarth-OV/simfeatup_dev](https://github.com/likyoo/SegEarth-OV/tree/main/simfeatup_dev):
|
| 29 |
+
- `jbu_one` → `simfeatup/xclip_jbu_one_million_aid.ckpt` (default, remote-sensing)
|
| 30 |
+
- `jbu_stack` → `simfeatup/clip_jbu_stack_cocostuff.ckpt`
|
| 31 |
+
- `jbu_stack_maskclip` → `simfeatup/maskclip_jbu_stack_cocostuff.ckpt`
|
| 32 |
+
- `bilinear`, `resize_conv`, `ifa` (no pretrained weights)
|
| 33 |
+
|
| 34 |
+
## Usage
|
| 35 |
+
|
| 36 |
+
```python
|
| 37 |
+
from transformers import CLIPModel, CLIPProcessor
|
| 38 |
+
from PIL import Image
|
| 39 |
+
|
| 40 |
+
model = CLIPModel.from_pretrained("BiliSakura/AlignEarth-SAR-ViT-B-16")
|
| 41 |
+
processor = CLIPProcessor.from_pretrained("BiliSakura/AlignEarth-SAR-ViT-B-16")
|
| 42 |
+
|
| 43 |
+
image = Image.open("sar_image.tif")
|
| 44 |
+
texts = ["building", "road", "water body", "vegetation"]
|
| 45 |
+
|
| 46 |
+
inputs = processor(
|
| 47 |
+
text=texts,
|
| 48 |
+
images=image,
|
| 49 |
+
return_tensors="pt",
|
| 50 |
+
padding=True,
|
| 51 |
+
)
|
| 52 |
+
|
| 53 |
+
outputs = model(**inputs)
|
| 54 |
+
logits_per_image = outputs.logits_per_image
|
| 55 |
+
probs = logits_per_image.softmax(dim=1)
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
For dense features (e.g., segmentation with SegEarth-OV-2), use the vision encoder:
|
| 59 |
+
|
| 60 |
+
```python
|
| 61 |
+
from transformers import CLIPVisionModelWithProjection, CLIPImageProcessor
|
| 62 |
+
|
| 63 |
+
vision_model = CLIPVisionModelWithProjection.from_pretrained("BiliSakura/AlignEarth-SAR-ViT-B-16")
|
| 64 |
+
processor = CLIPImageProcessor.from_pretrained("BiliSakura/AlignEarth-SAR-ViT-B-16")
|
| 65 |
+
|
| 66 |
+
inputs = processor(images=image, return_tensors="pt")
|
| 67 |
+
outputs = vision_model(**inputs)
|
| 68 |
+
image_embeds = outputs.image_embeds # pooled
|
| 69 |
+
# Or use vision_model.vision_model for patch-level features
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
## Full Pipeline (SegEarth-OV-2 Style)
|
| 73 |
+
|
| 74 |
+
For **open-vocabulary SAR segmentation** with SimFeatUp dense upsampling:
|
| 75 |
+
|
| 76 |
+
```python
|
| 77 |
+
from pathlib import Path
|
| 78 |
+
from PIL import Image
|
| 79 |
+
from pipeline import SegEarthPipeline
|
| 80 |
+
|
| 81 |
+
pipe = SegEarthPipeline(Path("BiliSakura/AlignEarth-SAR-ViT-B-16"))
|
| 82 |
+
image = Image.open("your_sar_image.tif").convert("RGB")
|
| 83 |
+
seg_map = pipe(image) # [H, W] class indices
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
The pipeline combines:
|
| 87 |
+
- **AlignEarth** CLIP encoder (SAR-adapted)
|
| 88 |
+
- **SimFeatUp** upsampler (choose `jbu_one`, `jbu_stack`, `jbu_stack_maskclip`, or `bilinear`)
|
| 89 |
+
- **Global Bias Alleviation** (cls_token_lambda) – subtracts global context from patch features
|
| 90 |
+
- **Logit scaling** and **prob threshold** for robust predictions
|
| 91 |
+
- **Sliding window** for large images
|
| 92 |
+
- OpenEarthMap SAR class names (customize via `cls_openearthmap_sar.txt` or `configs/cls_*.txt`)
|
| 93 |
+
|
| 94 |
+
```python
|
| 95 |
+
# Use different featup models
|
| 96 |
+
pipe = SegEarthPipeline(Path("."), featup_model="jbu_one") # default
|
| 97 |
+
pipe = SegEarthPipeline(Path("."), featup_model="jbu_stack")
|
| 98 |
+
pipe = SegEarthPipeline(Path("."), featup_model="jbu_stack_maskclip")
|
| 99 |
+
pipe = SegEarthPipeline(Path("."), featup_model="bilinear") # no weights
|
| 100 |
+
|
| 101 |
+
# Full SegEarth-OV-2 options
|
| 102 |
+
pipe = SegEarthPipeline(Path("."), cls_token_lambda=-0.3, logit_scale=50, prob_thd=0)
|
| 103 |
+
pipe = SegEarthPipeline(Path("."), slide_crop=224, slide_stride=112) # sliding window for large images
|
| 104 |
+
pipe = SegEarthPipeline(Path("."), class_names_path="configs/cls_whu_sar.txt") # different dataset
|
| 105 |
+
```
|
| 106 |
+
|
| 107 |
+
## Demo / Test
|
| 108 |
+
|
| 109 |
+
A paired demo sample from **YESeg-OPT-SAR** is in `demo_YESeg-OPT-SAR/`: `sar.png`, `rgb.png`, `label.png`. **Note**: This model targets SAR imagery, not optical.
|
| 110 |
+
|
| 111 |
+
```bash
|
| 112 |
+
python test_demo.py # uses demo_YESeg-OPT-SAR, cls_yeseg_sar, prob_thd=0.3
|
| 113 |
+
python test_demo.py --featup jbu_stack # try jbu_stack upsampler
|
| 114 |
+
python test_demo.py --save out.png # save figure
|
| 115 |
+
```
|
| 116 |
+
|
| 117 |
+
The script displays a matplotlib image grid: RGB | SAR | Label (GT) | Prediction.
|
| 118 |
+
|
| 119 |
+
## Evaluation
|
| 120 |
+
|
| 121 |
+
Standalone evaluation (no mmseg) on image/label pairs:
|
| 122 |
+
|
| 123 |
+
```bash
|
| 124 |
+
python eval.py --img-dir data/OpenEarthMap_SAR/test/sar_images \
|
| 125 |
+
--label-dir data/OpenEarthMap_SAR/test/labels \
|
| 126 |
+
--config configs/cls_openearthmap_sar.txt
|
| 127 |
+
```
|
| 128 |
+
|
| 129 |
+
SAR class configs in `configs/`: `cls_openearthmap_sar.txt`, `cls_whu_sar.txt`, `cls_hrsid.txt`, `cls_pie_sar.txt`, `cls_fusar.txt`, `cls_yeseg_sar.txt`, `cls_ddhrnet_xian_sar.txt`.
|
| 130 |
+
|
| 131 |
+
Or from Python:
|
| 132 |
+
|
| 133 |
+
```python
|
| 134 |
+
from pathlib import Path
|
| 135 |
+
from pipeline import SegEarthPipeline
|
| 136 |
+
from PIL import Image
|
| 137 |
+
|
| 138 |
+
pipe = SegEarthPipeline(Path("."))
|
| 139 |
+
image = Image.open("demo_YESeg-OPT-SAR/sar.png").convert("RGB")
|
| 140 |
+
seg = pipe(image)
|
| 141 |
+
```
|
| 142 |
+
|
| 143 |
+
## Citation
|
| 144 |
+
|
| 145 |
+
If you use this model, please cite the SegEarth-OV-2 paper:
|
| 146 |
+
|
| 147 |
+
```bibtex
|
| 148 |
+
@article{li2025segearthov2,
|
| 149 |
+
title={Annotation-Free Open-Vocabulary Segmentation for Remote-Sensing Images},
|
| 150 |
+
author={Li, Kaiyu and Cao, Xiangyong and Liu, Ruixun and Wang, Shihong and Jiang, Zixuan and Wang, Zhi and Meng, Deyu},
|
| 151 |
+
journal={arXiv preprint arXiv:2508.18067},
|
| 152 |
+
year={2025}
|
| 153 |
+
}
|
| 154 |
+
```
|
| 155 |
+
|
| 156 |
+
## License
|
| 157 |
+
|
| 158 |
+
MIT License (inherited from the original AlignEarth release).
|
| 159 |
+
|
| 160 |
+
## Dependencies
|
| 161 |
+
|
| 162 |
+
- `transformers`, `torch`, `torchvision`, `numpy`, `Pillow` (imported as `PIL`)
|
| 163 |
+
- Optional: `featup` for CUDA-accelerated JBU (falls back to pure PyTorch)
|
| 164 |
+
- Optional: `mmcv` for CarafeUpsampler, `sapa` for SAPAUpsampler
|
| 165 |
+
|
| 166 |
+
## Related
|
| 167 |
+
|
| 168 |
+
- **Original weights**: [likyoo/AlignEarth-SAR-ViT-B-16](https://huggingface.co/likyoo/AlignEarth-SAR-ViT-B-16)
|
| 169 |
+
- **Code**: [SegEarth-OV-2](https://github.com/earth-insights/SegEarth-OV-2)
|
| 170 |
+
- **Paper**: [arXiv:2508.18067](https://arxiv.org/abs/2508.18067)
|
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/cls_openearthmap_sar.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
background
|
| 2 |
+
bareland,barren
|
| 3 |
+
grass
|
| 4 |
+
pavement
|
| 5 |
+
road
|
| 6 |
+
tree,forest
|
| 7 |
+
water,river
|
| 8 |
+
cropland
|
| 9 |
+
building,roof,house
|
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/config.json
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"CLIPModel"
|
| 4 |
+
],
|
| 5 |
+
"dtype": "float32",
|
| 6 |
+
"initializer_factor": 1.0,
|
| 7 |
+
"logit_scale_init_value": 2.6592,
|
| 8 |
+
"model_type": "clip",
|
| 9 |
+
"projection_dim": 512,
|
| 10 |
+
"text_config": {
|
| 11 |
+
"attention_dropout": 0.0,
|
| 12 |
+
"hidden_act": "quick_gelu",
|
| 13 |
+
"hidden_size": 512,
|
| 14 |
+
"initializer_factor": 1.0,
|
| 15 |
+
"initializer_range": 0.02,
|
| 16 |
+
"intermediate_size": 2048,
|
| 17 |
+
"layer_norm_eps": 1e-05,
|
| 18 |
+
"max_position_embeddings": 77,
|
| 19 |
+
"model_type": "clip_text_model",
|
| 20 |
+
"num_attention_heads": 8,
|
| 21 |
+
"num_hidden_layers": 12,
|
| 22 |
+
"projection_dim": 512,
|
| 23 |
+
"vocab_size": 49408
|
| 24 |
+
},
|
| 25 |
+
"transformers_version": "4.57.3",
|
| 26 |
+
"vision_config": {
|
| 27 |
+
"attention_dropout": 0.0,
|
| 28 |
+
"hidden_act": "quick_gelu",
|
| 29 |
+
"hidden_size": 768,
|
| 30 |
+
"image_size": 224,
|
| 31 |
+
"initializer_factor": 1.0,
|
| 32 |
+
"initializer_range": 0.02,
|
| 33 |
+
"intermediate_size": 3072,
|
| 34 |
+
"layer_norm_eps": 1e-05,
|
| 35 |
+
"model_type": "clip_vision_model",
|
| 36 |
+
"num_attention_heads": 12,
|
| 37 |
+
"num_channels": 3,
|
| 38 |
+
"num_hidden_layers": 12,
|
| 39 |
+
"patch_size": 16,
|
| 40 |
+
"projection_dim": 512
|
| 41 |
+
}
|
| 42 |
+
}
|
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/configs/cls_ddhrnet_xian_sar.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
building
|
| 2 |
+
road
|
| 3 |
+
farmland,grass
|
| 4 |
+
greenery
|
| 5 |
+
water
|
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/configs/cls_fusar.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
others
|
| 2 |
+
water
|
| 3 |
+
road
|
| 4 |
+
building
|
| 5 |
+
grass
|
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/configs/cls_hrsid.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
background
|
| 2 |
+
ship,boat
|
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/configs/cls_openearthmap_sar.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
background
|
| 2 |
+
bareland,barren
|
| 3 |
+
grass
|
| 4 |
+
pavement
|
| 5 |
+
road
|
| 6 |
+
tree,forest
|
| 7 |
+
water,river
|
| 8 |
+
cropland
|
| 9 |
+
building,roof,house
|
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/configs/cls_pie_sar.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
background
|
| 2 |
+
city
|
| 3 |
+
road
|
| 4 |
+
water,river
|
| 5 |
+
forest
|
| 6 |
+
cropland
|
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/configs/cls_whu_sar.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
farmland
|
| 2 |
+
city
|
| 3 |
+
village
|
| 4 |
+
water
|
| 5 |
+
forest
|
| 6 |
+
road
|
| 7 |
+
others
|
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/configs/cls_yeseg_sar.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
bareground,barren
|
| 2 |
+
grass,farmland
|
| 3 |
+
dense tree cover
|
| 4 |
+
city
|
| 5 |
+
water
|
| 6 |
+
roadway
|
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/conversion_meta.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"source_checkpoint": "/data/projects/models/hf_models/likyoo/AlignEarth-SAR-ViT-B-16/AlignEarth-SAR-ViT-B-16.pt",
|
| 3 |
+
"source_top_level_keys": [
|
| 4 |
+
"epoch",
|
| 5 |
+
"name",
|
| 6 |
+
"state_dict",
|
| 7 |
+
"optimizer",
|
| 8 |
+
"scaler"
|
| 9 |
+
],
|
| 10 |
+
"converted_to": "transformers.CLIPModel",
|
| 11 |
+
"notes": "Mapped OpenCLIP-style state_dict to HF CLIPModel key format (vision+text)."
|
| 12 |
+
}
|
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/demo_YESeg-OPT-SAR/README.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Demo sample from YESeg-OPT-SAR dataset
|
| 2 |
+
|
| 3 |
+
This folder contains a paired sample from the **YESeg-OPT-SAR** dataset:
|
| 4 |
+
- `sar.png` – SAR image (input)
|
| 5 |
+
- `rgb.png` – Optical RGB reference
|
| 6 |
+
- `label.png` / `cvt_label.png` – Ground truth segmentation
|
| 7 |
+
|
| 8 |
+
Use `configs/cls_yeseg_sar.txt` and `--prob-thd 0.3` when running on this demo.
|
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/demo_YESeg-OPT-SAR/cvt_label.png
ADDED
|
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/demo_YESeg-OPT-SAR/label.png
ADDED
|
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/demo_YESeg-OPT-SAR/rgb.png
ADDED
|
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/demo_YESeg-OPT-SAR/sar.png
ADDED
|
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/demo_result.png
ADDED
|
Git LFS Details
|
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/eval.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Standalone evaluation script for AlignEarth segmentation.
|
| 3 |
+
Evaluates on datasets with image/label pairs. Computes mIoU.
|
| 4 |
+
Usage:
|
| 5 |
+
python eval.py --img-dir data/OpenEarthMap_SAR/test/sar_images \\
|
| 6 |
+
--label-dir data/OpenEarthMap_SAR/test/labels \\
|
| 7 |
+
--config configs/cls_openearthmap_sar.txt
|
| 8 |
+
"""
|
| 9 |
+
import argparse
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
import numpy as np
|
| 13 |
+
from PIL import Image
|
| 14 |
+
|
| 15 |
+
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(x, **kw):
        """Stand-in for ``tqdm.tqdm`` when tqdm is not installed.

        Returns the iterable unchanged and ignores progress-bar options
        such as ``desc``.
        """
        return x

    def _tqdm_write(message, **kw):
        """Print ``message``; extra ``tqdm.write`` options are ignored."""
        print(message)

    # The evaluation loop reports skipped samples through tqdm.write();
    # without this attribute the fallback would raise AttributeError there.
    tqdm.write = _tqdm_write
|
| 20 |
+
|
| 21 |
+
from pipeline import SegEarthPipeline
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def load_label(path: Path, reduce_zero_label: bool = False) -> np.ndarray:
    """Load a label image as an int64 class-index map.

    Args:
        path: Label image file readable by PIL.
        reduce_zero_label: When True, label 0 is treated as "ignore" and
            mapped to 255, and the remaining labels 1..N are shifted down
            to 0..N-1. Pixels that already hold the ignore value 255 stay
            255.

    Returns:
        The label array as ``np.int64``. For multi-channel images only the
        first channel is returned.
    """
    # Read the pixels inside a context manager so the file handle is closed.
    with Image.open(path) as img:
        arr = np.array(img)

    if arr.ndim == 3 and arr.shape[2] >= 1:
        # Multi-channel label: the first channel is taken as the index map
        # (any colour -> index conversion is dataset-specific and not done here).
        arr = arr[:, :, 0]

    # Widen BEFORE doing arithmetic: label images are usually unsigned, and
    # subtracting 1 in an unsigned dtype wraps 0 around instead of going
    # negative (e.g. to 65535 for 16-bit labels).
    arr = arr.astype(np.int64)

    if reduce_zero_label:
        # Label 0 becomes "ignore"; pixels already marked 255 must stay
        # ignored rather than turning into 254.
        ignore = (arr == 0) | (arr == 255)
        arr = arr - 1
        arr[ignore] = 255

    return arr
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def compute_iou(pred: np.ndarray, gt: np.ndarray, num_classes: int, ignore_index: int = 255) -> np.ndarray:
    """Compute per-class intersection-over-union for one prediction/label pair.

    Args:
        pred: Predicted class-index map, same shape as ``gt``.
        gt: Ground-truth class-index map.
        num_classes: Number of classes; classes ``0..num_classes-1`` are scored.
        ignore_index: Ground-truth value marking pixels excluded from scoring.
            Those pixels count towards neither intersection nor union,
            whatever was predicted there.

    Returns:
        Float array of length ``num_classes``. An entry is ``nan`` when the
        class has no non-ignored pixel in ``gt``.
    """
    # Pixels labelled ignore_index are dropped from both masks below.
    valid = gt != ignore_index

    ious = np.full(num_classes, np.nan)
    for c in range(num_classes):
        gt_c = (gt == c) & valid
        if not gt_c.any():
            # Class absent from the ground truth: leave as nan.
            continue
        pred_c = (pred == c) & valid
        intersection = np.count_nonzero(pred_c & gt_c)
        # union >= gt_c count > 0 here, so the division is always defined.
        union = np.count_nonzero(pred_c | gt_c)
        ious[c] = intersection / union
    return ious
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def main():
    """Command-line entry point: segment every image in a folder and print IoU.

    Images are matched to labels by file name (image stem + ``--label-suffix``).
    Each image is run through ``SegEarthPipeline``, the prediction is resized
    to the label size with nearest-neighbour sampling if the shapes differ,
    and per-class IoU is computed with label value 255 ignored.

    NOTE: the reported numbers are the mean of per-image IoUs (nan entries
    skipped), not IoU accumulated over the whole dataset.

    Raises:
        FileNotFoundError: If the image directory, label directory or class
            config file is missing, or no image matches ``--img-suffix``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--img-dir", required=True, help="Directory of images")
    parser.add_argument("--label-dir", required=True, help="Directory of labels")
    parser.add_argument("--config", default="configs/cls_openearthmap_sar.txt", help="Class config file")
    parser.add_argument("--img-suffix", default=".png", help="Image suffix")
    parser.add_argument("--label-suffix", default=".png", help="Label suffix")
    parser.add_argument("--reduce-zero-label", action="store_true", help="Label 1..N -> 0..N-1")
    parser.add_argument("--save-pred-dir", help="Save predictions to directory")
    parser.add_argument("--featup", default="jbu_one", help="SimFeatUp model")
    parser.add_argument("--slide-crop", type=int, default=0, help="Sliding window crop size (0=disabled)")
    parser.add_argument("--slide-stride", type=int, default=112, help="Sliding window stride")
    parser.add_argument("--cls-token-lambda", type=float, default=-0.3, help="Global Bias Alleviation")
    parser.add_argument("--logit-scale", type=float, default=50.0, help="Softmax temperature")
    parser.add_argument("--prob-thd", type=float, default=0.0, help="Low-confidence threshold")
    parser.add_argument("--limit", type=int, default=0, help="Limit number of samples (0=all)")
    # Previously hard-coded to "cuda"; the default keeps that behaviour.
    parser.add_argument("--device", default="cuda", help="Device passed to the pipeline (e.g. cuda, cpu)")
    args = parser.parse_args()

    # tqdm.write exists only on the real tqdm class; when the plain-function
    # fallback is in use, fall back to print instead of crashing.
    log = getattr(tqdm, "write", print)

    repo_dir = Path(__file__).resolve().parent
    img_dir = Path(args.img_dir)
    label_dir = Path(args.label_dir)
    config_path = Path(args.config)
    if not config_path.is_absolute():
        # Relative config paths are resolved against this script's directory.
        config_path = repo_dir / config_path

    if not img_dir.exists():
        raise FileNotFoundError(f"Image dir not found: {img_dir}")
    if not label_dir.exists():
        raise FileNotFoundError(f"Label dir not found: {label_dir}")
    if not config_path.exists():
        raise FileNotFoundError(f"Config not found: {config_path}")

    # Collect images
    img_files = sorted(img_dir.glob(f"*{args.img_suffix}"))
    if args.limit:
        img_files = img_files[: args.limit]

    if not img_files:
        raise FileNotFoundError(f"No images in {img_dir}")

    # Build pipeline
    pipe = SegEarthPipeline(
        repo_dir,
        featup_model=args.featup,
        class_names_path=config_path,
        cls_token_lambda=args.cls_token_lambda,
        logit_scale=args.logit_scale,
        prob_thd=args.prob_thd,
        slide_crop=args.slide_crop,
        slide_stride=args.slide_stride,
        device=args.device,
    )
    num_classes = pipe.num_classes

    save_pred_dir = Path(args.save_pred_dir) if args.save_pred_dir else None
    if save_pred_dir:
        save_pred_dir.mkdir(parents=True, exist_ok=True)

    all_ious = []
    for img_path in tqdm(img_files, desc="Evaluating"):
        label_path = label_dir / (img_path.stem + args.label_suffix)
        if not label_path.exists():
            # Multi-part suffixes (e.g. "_sar.png"): strip only the trailing
            # image suffix, never an earlier occurrence inside the name.
            base = img_path.name
            if args.img_suffix and base.endswith(args.img_suffix):
                base = base[: -len(args.img_suffix)]
            label_path = label_dir / (base + args.label_suffix)
        if not label_path.exists():
            log(f"Skipping {img_path.name}: no label")
            continue

        # convert() returns an independent image, so the file can be closed.
        with Image.open(img_path) as raw:
            img = raw.convert("RGB")
        gt = load_label(label_path, reduce_zero_label=args.reduce_zero_label)

        pred = pipe(img)
        pred_np = pred.cpu().numpy()

        # Resize pred to match gt if needed (nearest keeps class indices intact)
        if pred_np.shape != gt.shape:
            pred_pil = Image.fromarray(pred_np.astype(np.uint8))
            pred_pil = pred_pil.resize((gt.shape[1], gt.shape[0]), Image.NEAREST)
            pred_np = np.array(pred_pil)

        # Mask ignore
        valid = gt != 255
        if valid.sum() == 0:
            continue
        # Overwrite predictions on ignored pixels so they match no class id
        # and therefore never enter any class's union.
        pred_m = pred_np.copy()
        pred_m[~valid] = 255

        ious = compute_iou(pred_m, gt, num_classes, ignore_index=255)
        all_ious.append(ious)

        if save_pred_dir:
            out_path = save_pred_dir / (img_path.stem + "_pred.png")
            Image.fromarray(pred_np.astype(np.uint8)).save(out_path)

    if not all_ious:
        print("No valid samples.")
        return

    all_ious = np.array(all_ious)
    # Average each class over the images where it occurs, then over classes.
    mean_iou = np.nanmean(all_ious, axis=0)
    miou = np.nanmean(mean_iou)
    print(f"mIoU: {miou:.4f}")
    print("Per-class IoU:", mean_iou)
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
if __name__ == "__main__":
|
| 163 |
+
main()
|
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:604f0111c635195ec1a723d6a256f476b2c272f330f186a8edeec9f81a4cb560
|
| 3 |
+
size 598530372
|