Dingyi111 BiliSakura commited on
Commit
fabc606
·
0 Parent(s):

Duplicate from BiliSakura/SegEarth-OV

Browse files

Co-authored-by: Sakura <BiliSakura@users.noreply.huggingface.co>

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +13 -0
  2. OV-2/config.json +12 -0
  3. OV-2/configs/cls_chn6-cug.txt +2 -0
  4. OV-2/configs/cls_ddhrnet_korea_sar.txt +5 -0
  5. OV-2/configs/cls_ddhrnet_shandong_sar.txt +5 -0
  6. OV-2/configs/cls_ddhrnet_xian_sar.txt +5 -0
  7. OV-2/configs/cls_deepglobe.txt +7 -0
  8. OV-2/configs/cls_fusar.txt +5 -0
  9. OV-2/configs/cls_hrsid.txt +2 -0
  10. OV-2/configs/cls_iSAID.txt +16 -0
  11. OV-2/configs/cls_inria.txt +2 -0
  12. OV-2/configs/cls_loveda.txt +7 -0
  13. OV-2/configs/cls_massachusetts_building.txt +2 -0
  14. OV-2/configs/cls_openearthmap.txt +9 -0
  15. OV-2/configs/cls_openearthmap_sar.txt +9 -0
  16. OV-2/configs/cls_pie_sar.txt +6 -0
  17. OV-2/configs/cls_potsdam.txt +6 -0
  18. OV-2/configs/cls_roadval.txt +2 -0
  19. OV-2/configs/cls_uavid.txt +7 -0
  20. OV-2/configs/cls_udd5.txt +5 -0
  21. OV-2/configs/cls_vaihingen.txt +6 -0
  22. OV-2/configs/cls_vdd.txt +7 -0
  23. OV-2/configs/cls_wbs-si.txt +2 -0
  24. OV-2/configs/cls_whu.txt +2 -0
  25. OV-2/configs/cls_whu_sar.txt +7 -0
  26. OV-2/configs/cls_xBD.txt +2 -0
  27. OV-2/configs/cls_yeseg_sar.txt +6 -0
  28. OV-2/pipeline.py +38 -0
  29. OV-2/prompts/imagenet_template.py +97 -0
  30. OV-2/upsamplers.py +251 -0
  31. OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/README.md +170 -0
  32. OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/cls_openearthmap_sar.txt +9 -0
  33. OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/config.json +42 -0
  34. OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/configs/cls_ddhrnet_xian_sar.txt +5 -0
  35. OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/configs/cls_fusar.txt +5 -0
  36. OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/configs/cls_hrsid.txt +2 -0
  37. OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/configs/cls_openearthmap_sar.txt +9 -0
  38. OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/configs/cls_pie_sar.txt +6 -0
  39. OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/configs/cls_whu_sar.txt +7 -0
  40. OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/configs/cls_yeseg_sar.txt +6 -0
  41. OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/conversion_meta.json +12 -0
  42. OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/demo_YESeg-OPT-SAR/README.md +8 -0
  43. OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/demo_YESeg-OPT-SAR/cvt_label.png +0 -0
  44. OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/demo_YESeg-OPT-SAR/label.png +0 -0
  45. OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/demo_YESeg-OPT-SAR/rgb.png +0 -0
  46. OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/demo_YESeg-OPT-SAR/sar.png +0 -0
  47. OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/demo_result.png +3 -0
  48. OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/eval.py +163 -0
  49. OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/merges.txt +0 -0
  50. OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/model.safetensors +3 -0
.gitattributes ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/model.safetensors filter=lfs diff=lfs merge=lfs -text
2
+ OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/demo_result.png filter=lfs diff=lfs merge=lfs -text
3
+ OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/simfeatup/xclip_jbu_one_million_aid.ckpt filter=lfs diff=lfs merge=lfs -text
4
+ OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/simfeatup/clip_jbu_stack_cocostuff.ckpt filter=lfs diff=lfs merge=lfs -text
5
+ OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/simfeatup/maskclip_jbu_stack_cocostuff.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ OV-2/weights/featup/xclip_jbu_one_million_aid.ckpt filter=lfs diff=lfs merge=lfs -text
7
+ OV-2/weights/featup/clip_jbu_stack_cocostuff.ckpt filter=lfs diff=lfs merge=lfs -text
8
+ OV-2/weights/featup/maskclip_jbu_stack_cocostuff.ckpt filter=lfs diff=lfs merge=lfs -text
9
+ OV-3/weights/backbone/sam3/model.safetensors filter=lfs diff=lfs merge=lfs -text
10
+ OV/weights/backbone/clip-vit-base-patch16/model.safetensors filter=lfs diff=lfs merge=lfs -text
11
+ OV/weights/featup/xclip_jbu_one_million_aid.ckpt filter=lfs diff=lfs merge=lfs -text
12
+ OV/weights/featup/clip_jbu_stack_cocostuff.ckpt filter=lfs diff=lfs merge=lfs -text
13
+ OV/weights/featup/maskclip_jbu_stack_cocostuff.ckpt filter=lfs diff=lfs merge=lfs -text
OV-2/config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "OV-2",
3
+ "backbone": "AlignEarth",
4
+ "model_id": "BiliSakura/AlignEarth-SAR-ViT-B-16",
5
+ "vit_type": "ViT-B/16",
6
+ "featup": "jbu_one",
7
+ "featup_weights": "weights/featup/xclip_jbu_one_million_aid.ckpt",
8
+ "cls_token_lambda": -0.3,
9
+ "logit_scale": 50.0,
10
+ "notes": "SAR-adapted CLIP via knowledge distillation. Also supports openai/clip-vit-base-patch16.",
11
+ "local_backbone": "weights/backbone/AlignEarth-SAR-ViT-B-16"
12
+ }
OV-2/configs/cls_chn6-cug.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ background
2
+ road
OV-2/configs/cls_ddhrnet_korea_sar.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ building
2
+ road
3
+ greenery
4
+ water
5
+ farmland,grass
OV-2/configs/cls_ddhrnet_shandong_sar.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ farmland,grass
2
+ greenery
3
+ road
4
+ building
5
+ water
OV-2/configs/cls_ddhrnet_xian_sar.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ building
2
+ road
3
+ farmland,grass
4
+ greenery
5
+ water
OV-2/configs/cls_deepglobe.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ urban
2
+ agriculture
3
+ rangeland
4
+ forest
5
+ water
6
+ barren
7
+ background
OV-2/configs/cls_fusar.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ others
2
+ water
3
+ road
4
+ building
5
+ grass
OV-2/configs/cls_hrsid.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ background
2
+ ship,boat
OV-2/configs/cls_iSAID.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ background
2
+ ship
3
+ store tank
4
+ baseball diamond
5
+ tennis court
6
+ basketball court
7
+ ground track field
8
+ bridge
9
+ large vehicle
10
+ small vehicle
11
+ helicopter
12
+ swimming pool
13
+ roundabout
14
+ soccer ball field
15
+ plane
16
+ harbor
OV-2/configs/cls_inria.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ background
2
+ building
OV-2/configs/cls_loveda.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ background
2
+ building,roof,house
3
+ road
4
+ water
5
+ barren
6
+ forest
7
+ agricultural
OV-2/configs/cls_massachusetts_building.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ background
2
+ building
OV-2/configs/cls_openearthmap.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ background
2
+ bareland,barren
3
+ grass
4
+ pavement
5
+ road
6
+ tree,forest
7
+ water,river
8
+ cropland
9
+ building,roof,house
OV-2/configs/cls_openearthmap_sar.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ background
2
+ bareland,barren
3
+ grass
4
+ pavement
5
+ road
6
+ tree,forest
7
+ water,river
8
+ cropland
9
+ building,roof,house
OV-2/configs/cls_pie_sar.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ background
2
+ city
3
+ road
4
+ water,river
5
+ forest
6
+ cropland
OV-2/configs/cls_potsdam.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ road
2
+ building
3
+ grass
4
+ tree
5
+ car
6
+ clutter,background
OV-2/configs/cls_roadval.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ background
2
+ road
OV-2/configs/cls_uavid.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ background
2
+ building
3
+ road
4
+ car
5
+ tree
6
+ vegetation
7
+ human
OV-2/configs/cls_udd5.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ vegetation
2
+ building
3
+ road
4
+ vehicle
5
+ background
OV-2/configs/cls_vaihingen.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ road
2
+ building
3
+ grass
4
+ tree
5
+ car
6
+ clutter
OV-2/configs/cls_vdd.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ background
2
+ facade
3
+ road
4
+ vegetation
5
+ vehicle
6
+ roof
7
+ water
OV-2/configs/cls_wbs-si.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ background
2
+ water
OV-2/configs/cls_whu.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ background
2
+ building
OV-2/configs/cls_whu_sar.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ farmland
2
+ city
3
+ village
4
+ water
5
+ forest
6
+ road
7
+ others
OV-2/configs/cls_xBD.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ background
2
+ building
OV-2/configs/cls_yeseg_sar.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ bareground,barren
2
+ grass,farmland
3
+ dense tree cover
4
+ city
5
+ water
6
+ roadway
OV-2/pipeline.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SegEarth OV-2 pipeline. Self-contained; uses config.json and weights/featup/ in this folder.
3
+ """
4
+ import json
5
+ from pathlib import Path
6
+
7
+ import sys
8
+ _parent = Path(__file__).resolve().parent.parent
9
+ if str(_parent) not in sys.path:
10
+ sys.path.insert(0, str(_parent))
11
+ from pipeline import SegEarthPipelineCLIP
12
+
13
+
14
def load(config_path: Path = None, model_id: str = None, **kwargs):
    """Build the OV-2 pipeline using the config stored next to this file.

    Backbone selection order:
      1. ``model_id`` passed by the caller (previously this was silently
         ignored whenever the bundled local backbone directory existed);
      2. the ``local_backbone`` directory named in the config, if it exists;
      3. the config's ``model_id`` entry.

    Args:
        config_path: Optional path to a JSON config. Defaults to
            ``config.json`` in this folder.
        model_id: Optional backbone identifier or path; overrides the config.
        **kwargs: Forwarded to ``SegEarthPipelineCLIP``. Keys supplied here
            take precedence over the config-derived defaults below.

    Returns:
        The object produced by ``SegEarthPipelineCLIP(**kwargs)``.

    Raises:
        KeyError: If no backbone can be resolved because the config has no
            ``model_id`` entry.
    """
    repo_dir = Path(__file__).parent
    cfg_path = config_path or repo_dir / "config.json"
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(cfg_path, encoding="utf-8") as f:
        cfg = json.load(f)

    if not model_id:
        local_backbone = cfg.get("local_backbone")
        local_path = repo_dir / local_backbone if local_backbone else None
        if local_path is not None and local_path.exists():
            model_id = str(local_path)
        else:
            model_id = cfg["model_id"]
    kwargs.setdefault("model_id", model_id)

    kwargs.setdefault("featup_model", cfg.get("featup") or "jbu_one")
    kwargs.setdefault("cls_token_lambda", cfg.get("cls_token_lambda", -0.3))
    kwargs.setdefault("logit_scale", cfg.get("logit_scale", 50.0))

    # Only the file name from the config is used; the checkpoint is always
    # looked up under this folder's weights/featup/ directory.
    # NOTE(review): the checkpoint comes from the config even when the caller
    # overrides ``featup_model`` -- confirm the pipeline copes with a mismatch.
    featup_name = (cfg.get("featup_weights") or "xclip_jbu_one_million_aid.ckpt").split("/")[-1]
    local_featup = repo_dir / "weights" / "featup" / featup_name
    if local_featup.exists():
        kwargs.setdefault("featup_weights_path", local_featup)

    kwargs.setdefault("class_names_path", repo_dir / "configs" / "cls_openearthmap_sar.txt")
    return SegEarthPipelineCLIP(**kwargs)
36
+
37
+
38
+ SegEarthPipeline = load
OV-2/prompts/imagenet_template.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Text prompts for open-vocabulary segmentation.
3
+ From SegEarth-OV-2/prompts/imagenet_template.py
4
+ """
5
+
6
+ openai_imagenet_template = [
7
+ lambda c: f"a bad photo of a {c}.",
8
+ lambda c: f"a photo of many {c}.",
9
+ lambda c: f"a sculpture of a {c}.",
10
+ lambda c: f"a photo of the hard to see {c}.",
11
+ lambda c: f"a low resolution photo of the {c}.",
12
+ lambda c: f"a rendering of a {c}.",
13
+ lambda c: f"graffiti of a {c}.",
14
+ lambda c: f"a bad photo of the {c}.",
15
+ lambda c: f"a cropped photo of the {c}.",
16
+ lambda c: f"a tattoo of a {c}.",
17
+ lambda c: f"the embroidered {c}.",
18
+ lambda c: f"a photo of a hard to see {c}.",
19
+ lambda c: f"a bright photo of a {c}.",
20
+ lambda c: f"a photo of a clean {c}.",
21
+ lambda c: f"a photo of a dirty {c}.",
22
+ lambda c: f"a dark photo of the {c}.",
23
+ lambda c: f"a drawing of a {c}.",
24
+ lambda c: f"a photo of my {c}.",
25
+ lambda c: f"the plastic {c}.",
26
+ lambda c: f"a photo of the cool {c}.",
27
+ lambda c: f"a close-up photo of a {c}.",
28
+ lambda c: f"a black and white photo of the {c}.",
29
+ lambda c: f"a painting of the {c}.",
30
+ lambda c: f"a painting of a {c}.",
31
+ lambda c: f"a pixelated photo of the {c}.",
32
+ lambda c: f"a sculpture of the {c}.",
33
+ lambda c: f"a bright photo of the {c}.",
34
+ lambda c: f"a cropped photo of a {c}.",
35
+ lambda c: f"a plastic {c}.",
36
+ lambda c: f"a photo of the dirty {c}.",
37
+ lambda c: f"a jpeg corrupted photo of a {c}.",
38
+ lambda c: f"a blurry photo of the {c}.",
39
+ lambda c: f"a photo of the {c}.",
40
+ lambda c: f"a good photo of the {c}.",
41
+ lambda c: f"a rendering of the {c}.",
42
+ lambda c: f"a {c} in a video game.",
43
+ lambda c: f"a photo of one {c}.",
44
+ lambda c: f"a doodle of a {c}.",
45
+ lambda c: f"a close-up photo of the {c}.",
46
+ lambda c: f"a photo of a {c}.",
47
+ lambda c: f"the origami {c}.",
48
+ lambda c: f"the {c} in a video game.",
49
+ lambda c: f"a sketch of a {c}.",
50
+ lambda c: f"a doodle of the {c}.",
51
+ lambda c: f"a origami {c}.",
52
+ lambda c: f"a low resolution photo of a {c}.",
53
+ lambda c: f"the toy {c}.",
54
+ lambda c: f"a rendition of the {c}.",
55
+ lambda c: f"a photo of the clean {c}.",
56
+ lambda c: f"a photo of a large {c}.",
57
+ lambda c: f"a rendition of a {c}.",
58
+ lambda c: f"a photo of a nice {c}.",
59
+ lambda c: f"a photo of a weird {c}.",
60
+ lambda c: f"a blurry photo of a {c}.",
61
+ lambda c: f"a cartoon {c}.",
62
+ lambda c: f"art of a {c}.",
63
+ lambda c: f"a sketch of the {c}.",
64
+ lambda c: f"a embroidered {c}.",
65
+ lambda c: f"a pixelated photo of a {c}.",
66
+ lambda c: f"itap of the {c}.",
67
+ lambda c: f"a jpeg corrupted photo of the {c}.",
68
+ lambda c: f"a good photo of a {c}.",
69
+ lambda c: f"a plushie {c}.",
70
+ lambda c: f"a photo of the nice {c}.",
71
+ lambda c: f"a photo of the small {c}.",
72
+ lambda c: f"a photo of the weird {c}.",
73
+ lambda c: f"the cartoon {c}.",
74
+ lambda c: f"art of the {c}.",
75
+ lambda c: f"a drawing of the {c}.",
76
+ lambda c: f"a photo of the large {c}.",
77
+ lambda c: f"a black and white photo of a {c}.",
78
+ lambda c: f"the plushie {c}.",
79
+ lambda c: f"a dark photo of a {c}.",
80
+ lambda c: f"itap of a {c}.",
81
+ lambda c: f"graffiti of the {c}.",
82
+ lambda c: f"a toy {c}.",
83
+ lambda c: f"itap of my {c}.",
84
+ lambda c: f"a photo of a cool {c}.",
85
+ lambda c: f"a photo of a small {c}.",
86
+ lambda c: f"a tattoo of the {c}.",
87
+ ]
88
+
89
+ sub_imagenet_template = [
90
+ lambda c: f"itap of a {c}.",
91
+ lambda c: f"a bad photo of a {c}.",
92
+ lambda c: f"a origami {c}.",
93
+ lambda c: f"a photo of the large {c}.",
94
+ lambda c: f"a {c} in a video game.",
95
+ lambda c: f"art of the {c}.",
96
+ lambda c: f"a photo of the small {c}.",
97
+ ]
OV-2/upsamplers.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SimFeatUp upsamplers for dense feature restoration.
3
+ From SegEarth-OV/OV-2 simfeatup_dev. Used by CLIP-based variants (OV, OV-2).
4
+ """
5
+ import math
6
+ from typing import Optional
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+
12
+ try:
13
+ from featup.adaptive_conv_cuda.adaptive_conv import AdaptiveConv
14
+ except Exception:
15
+ AdaptiveConv = None
16
+
17
+
18
def adaptive_conv_py_simple(input, filters):
    """Apply a different square filter at every output location (pure PyTorch).

    ``input`` is unpacked as (batch, channels, H, W) and ``filters`` as
    (batch, out_h, out_w, k, k). Every k x k window of ``input`` is weighted by
    the filter stored for that output position and summed, giving a tensor of
    shape (batch, channels, out_h, out_w). Used when the featup CUDA kernel is
    not importable.
    """
    batch, channels, _, _ = input.shape
    batch, out_h, out_w, k_rows, k_cols = filters.shape
    assert k_rows == k_cols
    taps = k_rows * k_cols
    flat_filters = filters.reshape(batch, out_h, out_w, taps)
    # Unfold lays the k*k window entries out per channel, one column per
    # output position; reshape so each tap lines up with its filter weight.
    unfold = torch.nn.Unfold(k_rows)
    windows = unfold(input).view((batch, channels, taps, out_h, out_w))
    return torch.einsum("bhwf,bcfhw->bchw", flat_filters, windows)
26
+
27
+
28
def _meshgrid(device, diameter):
    """Return a (2, diameter, diameter) grid of coordinates spanning [-1, 1].

    Channel 0 varies along the first spatial axis and channel 1 along the
    second ("ij" indexing).
    """
    axis = torch.linspace(-1, 1, diameter, device=device)
    rows, cols = torch.meshgrid(axis, axis, indexing="ij")
    return torch.stack((rows, cols), dim=0)
32
+
33
+
34
class Bilinear(torch.nn.Module):
    """Parameter-free upsampler: bilinear resize to the guidance resolution."""

    def forward(self, source, guidance):
        # Only the spatial size of ``guidance`` is used; its values are ignored.
        _, _, out_h, out_w = guidance.shape
        return F.interpolate(source, size=(out_h, out_w), mode="bilinear")
38
+
39
+
40
class LayeredResizeConv(torch.nn.Module):
    """Four successive 2x bilinear upsampling stages, each with a residual conv.

    Every stage concatenates the upsampled features with the guidance tensor
    resized to the same resolution (the ``dim + 3`` input channels imply a
    3-channel guidance) and adds the convolution output back onto the
    upsampled features.
    """

    def __init__(self, dim, kernel_size=1, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Registered as conv1..conv4, in that order, so parameter names and
        # initialisation order stay stable.
        for stage in range(1, 5):
            setattr(self, f"conv{stage}", nn.Conv2d(dim + 3, dim, kernel_size, padding="same"))

    def apply_conv(self, source, guidance, conv, activation):
        """Upsample ``source`` by 2x and add ``activation(conv(...))`` as a residual."""
        upsampled = F.interpolate(source, scale_factor=2, mode="bilinear")
        target_size = tuple(upsampled.shape[-2:])
        resized_guidance = F.interpolate(guidance, target_size, mode="bilinear")
        stacked = torch.cat([upsampled, resized_guidance], dim=1)
        return upsampled + activation(conv(stacked))

    def forward(self, source, guidance):
        # The final stage uses an identity activation; the first three use ReLU.
        stages = (
            (self.conv1, F.relu),
            (self.conv2, F.relu),
            (self.conv3, F.relu),
            (self.conv4, lambda x: x),
        )
        feats = source
        for conv, activation in stages:
            feats = self.apply_conv(feats, guidance, conv, activation)
        return feats
61
+
62
+
63
class SimpleImplicitFeaturizer(torch.nn.Module):
    """Prepend sinusoidal encodings of the pixel grid to the input tensor.

    For an input of shape (b, c, h, w) the output has
    ``2 * n_freqs * 2 + c`` channels: sines, then cosines, then the input.
    """

    def __init__(self, n_freqs=20):
        super().__init__()
        self.n_freqs = n_freqs
        self.dim_multiplier = 2  # two grid coordinates per pixel

    def forward(self, x):
        batch, _, height, width = x.shape
        ys = torch.linspace(-1, 1, height, device=x.device, dtype=x.dtype)
        xs = torch.linspace(-1, 1, width, device=x.device, dtype=x.dtype)
        coords = torch.stack(torch.meshgrid(ys, xs, indexing="ij"))
        coords = coords.unsqueeze(0).broadcast_to((batch, coords.shape[0], height, width))
        # Frequencies are exp() of values evenly spaced in [-2, 10].
        freqs = torch.exp(torch.linspace(-2, 10, self.n_freqs, device=x.device)).to(x.dtype)
        scaled = coords.unsqueeze(1) * freqs.reshape(1, self.n_freqs, 1, 1, 1)
        scaled = scaled.reshape(batch, self.n_freqs * self.dim_multiplier, height, width)
        return torch.cat([torch.sin(scaled), torch.cos(scaled), x], dim=1)
81
+
82
+
83
class IFA(torch.nn.Module):
    """Implicit-function style upsampler applied in repeated 2x steps.

    Each step nearest-upsamples the features, encodes the offset between every
    high-resolution coordinate and the low-resolution coordinate it was copied
    from, and feeds features plus encoding through a small 1x1-conv MLP.

    Args:
        feat_dim: Number of feature channels (kept unchanged by the module).
        num_scales: Number of sinusoidal frequencies used for the offset
            encoding. The MLP input width is ``feat_dim + 4 * num_scales + 2``.
    """

    def __init__(self, feat_dim, num_scales=20):
        super().__init__()
        self.feat_dim = feat_dim
        # Pass num_scales through so the encoding width always matches the MLP
        # input below (it was previously fixed at the featurizer default of 20).
        self.sin_feats = SimpleImplicitFeaturizer(num_scales)
        self.mlp = nn.Sequential(
            nn.Conv2d(feat_dim + (num_scales * 4) + 2, feat_dim, 1),
            nn.BatchNorm2d(feat_dim),
            nn.LeakyReLU(),
            nn.Conv2d(feat_dim, feat_dim, 1),
        )

    def _upsample_2x(self, source):
        """Return ``source`` upsampled by 2x along both spatial axes."""
        b, c, h, w = source.shape
        dtype = source.dtype
        device = source.device
        up_source = F.interpolate(source, (h * 2, w * 2), mode="nearest")
        # Build each axis from its own extent; using the height for both axes
        # only works for square inputs and fails with a shape mismatch otherwise.
        lr_rows = torch.linspace(0, h, steps=h, device=device, dtype=dtype)
        lr_cols = torch.linspace(0, w, steps=w, device=device, dtype=dtype)
        hr_rows = torch.linspace(0, h, steps=2 * h, device=device, dtype=dtype)
        hr_cols = torch.linspace(0, w, steps=2 * w, device=device, dtype=dtype)
        lr_coords = torch.stack(torch.meshgrid(lr_rows, lr_cols, indexing="ij")).unsqueeze(0)
        hr_coords = torch.stack(torch.meshgrid(hr_rows, hr_cols, indexing="ij")).unsqueeze(0)
        up_lr_coords = F.interpolate(lr_coords, (h * 2, w * 2), mode="nearest")
        coord_diff = up_lr_coords - hr_coords
        coord_diff_feats = self.sin_feats(coord_diff).to(dtype)
        bcast_coord_feats = coord_diff_feats.broadcast_to((b, coord_diff_feats.shape[1], h * 2, w * 2))
        return self.mlp(torch.cat([up_source, bcast_coord_feats], dim=1))

    def forward(self, source, guidance):
        """Upsample ``source`` to the spatial size of ``guidance``.

        Only the shape of ``guidance`` is used. The features are doubled until
        both axes reach the target, then bilinearly resized if they overshoot.
        """
        _, _, gh, gw = guidance.shape
        x = source
        while x.shape[2] < gh or x.shape[3] < gw:
            x = self._upsample_2x(x)
        if x.shape[2] != gh or x.shape[3] != gw:
            x = F.interpolate(x, (gh, gw), mode="bilinear")
        return x
117
+
118
+
119
class JBULearnedRange(torch.nn.Module):
    """Joint-bilateral-style upsampling step with a learned range kernel.

    A per-pixel (diameter x diameter) kernel is formed as the product of a
    softmax "range" term, computed from projected guidance features, and a
    Gaussian "spatial" term. The source is resized to the guidance resolution
    and filtered with that kernel.
    """

    def __init__(self, guidance_dim, feat_dim, key_dim, scale=2, radius=3):
        super().__init__()
        self.scale = scale
        self.radius = radius
        self.diameter = self.radius * 2 + 1
        self.guidance_dim = guidance_dim
        self.key_dim = key_dim
        self.feat_dim = feat_dim

        window = self.diameter ** 2
        # Log-temperature of the range softmax (exponentiated before use).
        self.range_temp = nn.Parameter(torch.tensor(0.0))
        self.range_proj = nn.Sequential(
            nn.Conv2d(guidance_dim, key_dim, 1, 1),
            nn.GELU(),
            nn.Dropout2d(0.1),
            nn.Conv2d(key_dim, key_dim, 1, 1),
        )
        # Predicts a small additive correction to the combined kernel.
        self.fixup_proj = nn.Sequential(
            nn.Conv2d(guidance_dim + window, window, 1, 1),
            nn.GELU(),
            nn.Dropout2d(0.1),
            nn.Conv2d(window, window, 1, 1),
        )
        self.sigma_spatial = nn.Parameter(torch.tensor(1.0))

    def get_range_kernel(self, x):
        """Softmax over each pixel's neighbourhood of projected-feature dot products."""
        batch, _, height, width = x.shape
        keys = self.range_proj(x)
        padded_keys = F.pad(keys, pad=[self.radius] * 4, mode="reflect")
        neighbours = torch.nn.Unfold(self.diameter)(padded_keys)
        neighbours = neighbours.reshape(
            (batch, self.key_dim, self.diameter * self.diameter, height, width)
        ).permute(0, 1, 3, 4, 2)
        temperature = self.range_temp.exp().clamp_min(1e-4).clamp_max(1e4)
        similarity = torch.einsum("bchwp,bchw->bphw", neighbours, keys)
        return F.softmax(temperature * similarity, dim=1)

    def get_spatial_kernel(self, device):
        """Gaussian weights over the window, shaped (1, diameter**2, 1, 1)."""
        offsets = _meshgrid(device, self.diameter)
        sq_dist = offsets.square().sum(0)
        weights = torch.exp(-sq_dist / (2 * self.sigma_spatial ** 2))
        return weights.reshape(1, self.diameter * self.diameter, 1, 1)

    def forward(self, source, guidance):
        """Resize ``source`` to the guidance resolution and filter it adaptively."""
        batch, _, out_h, out_w = guidance.shape
        src_batch, _, _, _ = source.shape
        assert src_batch == batch
        dtype = source.dtype
        guidance = guidance.to(dtype)

        spatial_kernel = self.get_spatial_kernel(source.device).to(dtype)
        range_kernel = self.get_range_kernel(guidance).to(dtype)
        combined_kernel = (range_kernel * spatial_kernel).to(dtype)
        # Normalise over the window, then add the learned correction.
        combined_kernel /= combined_kernel.sum(1, keepdim=True).clamp(1e-7)
        combined_kernel += 0.1 * self.fixup_proj(torch.cat([combined_kernel, guidance], dim=1))
        combined_kernel = combined_kernel.permute(0, 2, 3, 1).reshape(
            batch, out_h, out_w, self.diameter, self.diameter
        )

        resized = F.interpolate(source, size=(out_h, out_w), mode="bicubic", align_corners=False)
        resized_padded = F.pad(resized, pad=[self.radius] * 4, mode="reflect")
        combined_kernel = combined_kernel.to(resized_padded.dtype)
        # Use the featup CUDA op when it was importable, else the PyTorch fallback.
        if AdaptiveConv is None:
            return adaptive_conv_py_simple(resized_padded, combined_kernel)
        return AdaptiveConv.apply(resized_padded, combined_kernel)
183
+
184
+
185
class JBUStack(torch.nn.Module):
    """16x upsampler built from four independently parameterised 2x JBU steps."""

    def __init__(self, feat_dim, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Registered as up1..up4, in that order, so parameter names and
        # initialisation order stay stable.
        for stage in range(1, 5):
            setattr(self, f"up{stage}", JBULearnedRange(3, feat_dim, 32, radius=3))
        self.fixup_proj = nn.Sequential(
            nn.Dropout2d(0.2),
            nn.Conv2d(feat_dim, feat_dim, kernel_size=1),
        )

    def upsample(self, source, guidance, up):
        """Run one step: pool the guidance to twice the source size, then apply ``up``."""
        _, _, src_h, src_w = source.shape
        pooled_guidance = F.adaptive_avg_pool2d(guidance, (src_h * 2, src_w * 2))
        return up(source, pooled_guidance)

    def forward(self, source, guidance):
        feats = source
        for step in (self.up1, self.up2, self.up3, self.up4):
            feats = self.upsample(feats, guidance, step)
        # Small residual correction on top of the upsampled features.
        return self.fixup_proj(feats) * 0.1 + feats
208
+
209
+
210
class JBUOne(torch.nn.Module):
    """16x upsampler that reuses a single 2x JBU step four times."""

    def __init__(self, feat_dim, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.up = JBULearnedRange(3, feat_dim, 32, radius=5)
        self.fixup_proj = nn.Sequential(
            nn.Dropout2d(0.2),
            nn.Conv2d(feat_dim, feat_dim, kernel_size=1),
        )

    def upsample(self, source, guidance, up):
        """Run one step: pool the guidance to twice the source size, then apply ``up``."""
        _, _, src_h, src_w = source.shape
        pooled_guidance = F.adaptive_avg_pool2d(guidance, (src_h * 2, src_w * 2))
        return up(source, pooled_guidance)

    def forward(self, source, guidance):
        feats = source
        # The same module (shared weights) is applied at every scale.
        for _ in range(4):
            feats = self.upsample(feats, guidance, self.up)
        # Small residual correction on top of the upsampled features.
        return self.fixup_proj(feats) * 0.1 + feats
230
+
231
+
232
+ FEATUP_CHECKPOINTS = {
233
+ "jbu_one": "simfeatup/xclip_jbu_one_million_aid.ckpt",
234
+ "jbu_stack": "simfeatup/clip_jbu_stack_cocostuff.ckpt",
235
+ "jbu_stack_maskclip": "simfeatup/maskclip_jbu_stack_cocostuff.ckpt",
236
+ }
237
+
238
+
239
def get_upsampler(name: str, feat_dim: int) -> nn.Module:
    """Instantiate an upsampler by name.

    Args:
        name: One of ``bilinear``, ``jbu_one``, ``jbu_stack``,
            ``jbu_stack_maskclip``, ``resize_conv`` or ``ifa``.
            ``jbu_stack_maskclip`` is listed in ``FEATUP_CHECKPOINTS`` and
            builds the same ``JBUStack`` architecture as ``jbu_stack``; only
            the checkpoint differs.
        feat_dim: Feature channel count passed to the upsampler (unused by
            ``bilinear``).

    Returns:
        A freshly constructed, randomly initialised upsampler module.

    Raises:
        ValueError: If ``name`` is not a known upsampler.
    """
    # Lambdas defer construction so only the requested class is built.
    factories = {
        "bilinear": lambda: Bilinear(),
        "jbu_one": lambda: JBUOne(feat_dim),
        "jbu_stack": lambda: JBUStack(feat_dim),
        "jbu_stack_maskclip": lambda: JBUStack(feat_dim),
        "resize_conv": lambda: LayeredResizeConv(feat_dim, 1),
        "ifa": lambda: IFA(feat_dim),
    }
    factory = factories.get(name)
    if factory is None:
        raise ValueError(
            f"Unknown upsampler: {name}. Use: bilinear, jbu_one, jbu_stack, "
            "jbu_stack_maskclip, resize_conv, ifa"
        )
    return factory()
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/README.md ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ pipeline_tag: image-segmentation
4
+ tags:
5
+ - clip
6
+ - vision
7
+ - remote-sensing
8
+ - sar
9
+ - earth-observation
10
+ - open-vocabulary
11
+ library_name: transformers
12
+ ---
13
+
14
+ # AlignEarth-SAR-ViT-B-16
15
+
16
+ CLIP-style vision-language model adapted for **Synthetic Aperture Radar (SAR)** imagery via knowledge distillation from optical VLMs. Enables open-vocabulary semantic segmentation for SAR remote sensing without building SAR foundation models from scratch.
17
+
18
+ This repository provides the model in **Hugging Face Transformers** format, converted from the original [OpenCLIP-style checkpoint](https://huggingface.co/likyoo/AlignEarth-SAR-ViT-B-16) released by the SegEarth-OV-2 authors.
19
+
20
+ ## Model Details
21
+
22
+ - **Architecture**: CLIP (ViT-B/16 vision encoder + text encoder)
23
+ - **Vision**: 12-layer ViT, 768 hidden, 16×16 patches, 224×224 input
24
+ - **Text**: 12-layer transformer, 512 hidden, vocab 49408, max length 77
25
+ - **Projection**: 512-dim shared embedding space
26
+ - **Source**: [likyoo/AlignEarth-SAR-ViT-B-16](https://huggingface.co/likyoo/AlignEarth-SAR-ViT-B-16) (OpenCLIP format)
27
+ - **Conversion**: Mapped to `transformers.CLIPModel` for standard HF usage
28
+ - **SimFeatUp**: Full upsampler suite from [SegEarth-OV/simfeatup_dev](https://github.com/likyoo/SegEarth-OV/tree/main/simfeatup_dev):
29
+ - `jbu_one` → `simfeatup/xclip_jbu_one_million_aid.ckpt` (default, remote-sensing)
30
+ - `jbu_stack` → `simfeatup/clip_jbu_stack_cocostuff.ckpt`
31
+ - `jbu_stack_maskclip` → `simfeatup/maskclip_jbu_stack_cocostuff.ckpt`
32
+ - `bilinear`, `resize_conv`, `ifa` (no pretrained weights)
33
+
34
+ ## Usage
35
+
36
+ ```python
37
+ from transformers import CLIPModel, CLIPProcessor
38
+ from PIL import Image
39
+
40
+ model = CLIPModel.from_pretrained("BiliSakura/AlignEarth-SAR-ViT-B-16")
41
+ processor = CLIPProcessor.from_pretrained("BiliSakura/AlignEarth-SAR-ViT-B-16")
42
+
43
+ image = Image.open("sar_image.tif")
44
+ texts = ["building", "road", "water body", "vegetation"]
45
+
46
+ inputs = processor(
47
+ text=texts,
48
+ images=image,
49
+ return_tensors="pt",
50
+ padding=True,
51
+ )
52
+
53
+ outputs = model(**inputs)
54
+ logits_per_image = outputs.logits_per_image
55
+ probs = logits_per_image.softmax(dim=1)
56
+ ```
57
+
58
+ For dense features (e.g., segmentation with SegEarth-OV-2), use the vision encoder:
59
+
60
+ ```python
61
+ from transformers import CLIPVisionModelWithProjection, CLIPImageProcessor
62
+
63
+ vision_model = CLIPVisionModelWithProjection.from_pretrained("BiliSakura/AlignEarth-SAR-ViT-B-16")
64
+ processor = CLIPImageProcessor.from_pretrained("BiliSakura/AlignEarth-SAR-ViT-B-16")
65
+
66
+ inputs = processor(images=image, return_tensors="pt")
67
+ outputs = vision_model(**inputs)
68
+ image_embeds = outputs.image_embeds # pooled
69
+ # Or use vision_model.vision_model for patch-level features
70
+ ```
71
+
72
+ ## Full Pipeline (SegEarth-OV-2 Style)
73
+
74
+ For **open-vocabulary SAR segmentation** with SimFeatUp dense upsampling:
75
+
76
+ ```python
77
+ from pathlib import Path
78
+ from PIL import Image
79
+ from pipeline import SegEarthPipeline
80
+
81
+ pipe = SegEarthPipeline(Path("BiliSakura/AlignEarth-SAR-ViT-B-16"))
82
+ image = Image.open("your_sar_image.tif").convert("RGB")
83
+ seg_map = pipe(image) # [H, W] class indices
84
+ ```
85
+
86
+ The pipeline combines:
87
+ - **AlignEarth** CLIP encoder (SAR-adapted)
88
+ - **SimFeatUp** upsampler (choose `jbu_one`, `jbu_stack`, `jbu_stack_maskclip`, or `bilinear`)
89
+ - **Global Bias Alleviation** (cls_token_lambda) – subtracts global context from patch features
90
+ - **Logit scaling** and **prob threshold** for robust predictions
91
+ - **Sliding window** for large images
92
+ - OpenEarthMap SAR class names (customize via `cls_openearthmap_sar.txt` or `configs/cls_*.txt`)
93
+
94
+ ```python
95
+ # Use different featup models
96
+ pipe = SegEarthPipeline(Path("."), featup_model="jbu_one") # default
97
+ pipe = SegEarthPipeline(Path("."), featup_model="jbu_stack")
98
+ pipe = SegEarthPipeline(Path("."), featup_model="jbu_stack_maskclip")
99
+ pipe = SegEarthPipeline(Path("."), featup_model="bilinear") # no weights
100
+
101
+ # Full SegEarth-OV-2 options
102
+ pipe = SegEarthPipeline(Path("."), cls_token_lambda=-0.3, logit_scale=50, prob_thd=0)
103
+ pipe = SegEarthPipeline(Path("."), slide_crop=224, slide_stride=112) # sliding window for large images
104
+ pipe = SegEarthPipeline(Path("."), class_names_path="configs/cls_whu_sar.txt") # different dataset
105
+ ```
106
+
107
+ ## Demo / Test
108
+
109
+ A paired demo sample from **YESeg-OPT-SAR** is in `demo_YESeg-OPT-SAR/`: `sar.png`, `rgb.png`, `label.png`. **Note**: This model targets SAR imagery, not optical.
110
+
111
+ ```bash
112
+ python test_demo.py # uses demo_YESeg-OPT-SAR, cls_yeseg_sar, prob_thd=0.3
113
+ python test_demo.py --featup jbu_stack # try jbu_stack upsampler
114
+ python test_demo.py --save out.png # save figure
115
+ ```
116
+
117
+ The script displays a matplotlib image grid: RGB | SAR | Label (GT) | Prediction.
118
+
119
+ ## Evaluation
120
+
121
+ Standalone evaluation (no mmseg) on image/label pairs:
122
+
123
+ ```bash
124
+ python eval.py --img-dir data/OpenEarthMap_SAR/test/sar_images \
125
+ --label-dir data/OpenEarthMap_SAR/test/labels \
126
+ --config configs/cls_openearthmap_sar.txt
127
+ ```
128
+
129
+ SAR class configs in `configs/`: `cls_openearthmap_sar.txt`, `cls_whu_sar.txt`, `cls_hrsid.txt`, `cls_pie_sar.txt`, `cls_fusar.txt`, `cls_yeseg_sar.txt`, `cls_ddhrnet_xian_sar.txt`.
130
+
131
+ Or from Python:
132
+
133
+ ```python
134
+ from pathlib import Path
135
+ from pipeline import SegEarthPipeline
136
+ from PIL import Image
137
+
138
+ pipe = SegEarthPipeline(Path("."))
139
+ image = Image.open("demo_YESeg-OPT-SAR/sar.png").convert("RGB")
140
+ seg = pipe(image)
141
+ ```
142
+
143
+ ## Citation
144
+
145
+ If you use this model, please cite the SegEarth-OV-2 paper:
146
+
147
+ ```bibtex
148
+ @article{li2025segearthov2,
149
+ title={Annotation-Free Open-Vocabulary Segmentation for Remote-Sensing Images},
150
+ author={Li, Kaiyu and Cao, Xiangyong and Liu, Ruixun and Wang, Shihong and Jiang, Zixuan and Wang, Zhi and Meng, Deyu},
151
+ journal={arXiv preprint arXiv:2508.18067},
152
+ year={2025}
153
+ }
154
+ ```
155
+
156
+ ## License
157
+
158
+ MIT License (inherited from the original AlignEarth release).
159
+
160
+ ## Dependencies
161
+
162
+ - `transformers`, `torch`, `torchvision`, `Pillow` (imported as `PIL`)
163
+ - Optional: `featup` for CUDA-accelerated JBU (falls back to pure PyTorch)
164
+ - Optional: `mmcv` for CarafeUpsampler, `sapa` for SAPAUpsampler
165
+
166
+ ## Related
167
+
168
+ - **Original weights**: [likyoo/AlignEarth-SAR-ViT-B-16](https://huggingface.co/likyoo/AlignEarth-SAR-ViT-B-16)
169
+ - **Code**: [SegEarth-OV-2](https://github.com/earth-insights/SegEarth-OV-2)
170
+ - **Paper**: [arXiv:2508.18067](https://arxiv.org/abs/2508.18067)
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/cls_openearthmap_sar.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ background
2
+ bareland,barren
3
+ grass
4
+ pavement
5
+ road
6
+ tree,forest
7
+ water,river
8
+ cropland
9
+ building,roof,house
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "CLIPModel"
4
+ ],
5
+ "dtype": "float32",
6
+ "initializer_factor": 1.0,
7
+ "logit_scale_init_value": 2.6592,
8
+ "model_type": "clip",
9
+ "projection_dim": 512,
10
+ "text_config": {
11
+ "attention_dropout": 0.0,
12
+ "hidden_act": "quick_gelu",
13
+ "hidden_size": 512,
14
+ "initializer_factor": 1.0,
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": 2048,
17
+ "layer_norm_eps": 1e-05,
18
+ "max_position_embeddings": 77,
19
+ "model_type": "clip_text_model",
20
+ "num_attention_heads": 8,
21
+ "num_hidden_layers": 12,
22
+ "projection_dim": 512,
23
+ "vocab_size": 49408
24
+ },
25
+ "transformers_version": "4.57.3",
26
+ "vision_config": {
27
+ "attention_dropout": 0.0,
28
+ "hidden_act": "quick_gelu",
29
+ "hidden_size": 768,
30
+ "image_size": 224,
31
+ "initializer_factor": 1.0,
32
+ "initializer_range": 0.02,
33
+ "intermediate_size": 3072,
34
+ "layer_norm_eps": 1e-05,
35
+ "model_type": "clip_vision_model",
36
+ "num_attention_heads": 12,
37
+ "num_channels": 3,
38
+ "num_hidden_layers": 12,
39
+ "patch_size": 16,
40
+ "projection_dim": 512
41
+ }
42
+ }
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/configs/cls_ddhrnet_xian_sar.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ building
2
+ road
3
+ farmland,grass
4
+ greenery
5
+ water
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/configs/cls_fusar.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ others
2
+ water
3
+ road
4
+ building
5
+ grass
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/configs/cls_hrsid.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ background
2
+ ship,boat
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/configs/cls_openearthmap_sar.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ background
2
+ bareland,barren
3
+ grass
4
+ pavement
5
+ road
6
+ tree,forest
7
+ water,river
8
+ cropland
9
+ building,roof,house
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/configs/cls_pie_sar.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ background
2
+ city
3
+ road
4
+ water,river
5
+ forest
6
+ cropland
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/configs/cls_whu_sar.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ farmland
2
+ city
3
+ village
4
+ water
5
+ forest
6
+ road
7
+ others
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/configs/cls_yeseg_sar.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ bareground,barren
2
+ grass,farmland
3
+ dense tree cover
4
+ city
5
+ water
6
+ roadway
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/conversion_meta.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "source_checkpoint": "/data/projects/models/hf_models/likyoo/AlignEarth-SAR-ViT-B-16/AlignEarth-SAR-ViT-B-16.pt",
3
+ "source_top_level_keys": [
4
+ "epoch",
5
+ "name",
6
+ "state_dict",
7
+ "optimizer",
8
+ "scaler"
9
+ ],
10
+ "converted_to": "transformers.CLIPModel",
11
+ "notes": "Mapped OpenCLIP-style state_dict to HF CLIPModel key format (vision+text)."
12
+ }
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/demo_YESeg-OPT-SAR/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Demo sample from YESeg-OPT-SAR dataset
2
+
3
+ This folder contains a paired sample from the **YESeg-OPT-SAR** dataset:
4
+ - `sar.png` – SAR image (input)
5
+ - `rgb.png` – Optical RGB reference
6
+ - `label.png` / `cvt_label.png` – Ground truth segmentation
7
+
8
+ Use `configs/cls_yeseg_sar.txt` and `--prob-thd 0.3` when running on this demo.
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/demo_YESeg-OPT-SAR/cvt_label.png ADDED
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/demo_YESeg-OPT-SAR/label.png ADDED
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/demo_YESeg-OPT-SAR/rgb.png ADDED
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/demo_YESeg-OPT-SAR/sar.png ADDED
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/demo_result.png ADDED

Git LFS Details

  • SHA256: 67150d60f8d6993e88a7c0c0b873a71a43ce13430c8f986f05e3dad94d1acff9
  • Pointer size: 131 Bytes
  • Size of remote file: 654 kB
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/eval.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Standalone evaluation script for AlignEarth segmentation.
3
+ Evaluates on datasets with image/label pairs. Computes mIoU.
4
+ Usage:
5
+ python eval.py --img-dir data/OpenEarthMap_SAR/test/sar_images \\
6
+ --label-dir data/OpenEarthMap_SAR/test/labels \\
7
+ --config configs/cls_openearthmap_sar.txt
8
+ """
9
+ import argparse
10
+ from pathlib import Path
11
+
12
+ import numpy as np
13
+ from PIL import Image
14
+
15
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(x, **kw):
        """Fallback used when tqdm is not installed: return the iterable unchanged.

        Keyword arguments (e.g. ``desc``) are accepted and ignored so call
        sites do not need to care whether the real tqdm is available.
        """
        return x

    # The script reports skipped samples through ``tqdm.write``; the real
    # tqdm exposes that as a class method, so give the fallback the same
    # attribute instead of letting the call fail with AttributeError.
    tqdm.write = print
20
+
21
+ from pipeline import SegEarthPipeline
22
+
23
+
24
def load_label(path: Path, reduce_zero_label: bool = False) -> np.ndarray:
    """Load a segmentation label image as an int64 class-index map.

    Args:
        path: Label image file. Multi-channel images are reduced to their
            first channel (no colour-to-index decoding is performed).
        reduce_zero_label: If true, shift labels ``1..N`` to ``0..N-1``.
            Pixels that were ``0`` become the ignore value ``255`` and
            pixels that already were ``255`` stay ``255``.

    Returns:
        A 2-D ``np.int64`` array of class indices, with ``255`` marking
        ignored pixels when ``reduce_zero_label`` is set.
    """
    # Close the file handle deterministically; np.array() forces the decode.
    with Image.open(path) as im:
        arr = np.array(im)
    if arr.ndim == 3:
        # Multi-channel label: keep the first channel only (dataset-specific).
        arr = arr[:, :, 0]

    # Widen BEFORE doing arithmetic: label images are usually unsigned, so
    # subtracting 1 in the native dtype wraps around instead of going negative.
    arr = arr.astype(np.int64)

    if reduce_zero_label:
        # Record the ignore mask first so an existing 255 does not turn into
        # the (valid-looking) class 254 after the shift.
        ignore = (arr == 0) | (arr == 255)
        arr = arr - 1
        arr[ignore] = 255
    return arr
34
+
35
+
36
def compute_iou(pred: np.ndarray, gt: np.ndarray, num_classes: int, ignore_index: int = 255) -> np.ndarray:
    """Compute per-class intersection-over-union for one prediction.

    Args:
        pred: Predicted class indices, same shape as ``gt``.
        gt: Ground-truth class indices.
        num_classes: Number of classes; classes ``0..num_classes-1`` are scored.
        ignore_index: Ground-truth value marking pixels excluded from scoring.

    Returns:
        Float array of length ``num_classes``. Entry ``c`` is the IoU of
        class ``c``, or ``NaN`` when class ``c`` does not occur in the
        (non-ignored) ground truth.
    """
    pred = np.asarray(pred)
    gt = np.asarray(gt)

    # Drop ignored pixels up front so predictions made there are neither
    # rewarded nor counted as false positives.
    valid = gt != ignore_index
    pred_valid = pred[valid]
    gt_valid = gt[valid]

    ious = np.full(num_classes, np.nan)
    for c in range(num_classes):
        gt_c = gt_valid == c
        if not gt_c.any():
            # Class absent from ground truth: leave NaN so nanmean skips it.
            continue
        pred_c = pred_valid == c
        intersection = np.count_nonzero(pred_c & gt_c)
        # gt_c is non-empty here, so the union is always > 0.
        union = np.count_nonzero(pred_c | gt_c)
        ious[c] = intersection / union
    return ious
52
+
53
+
54
def _find_label_path(label_dir, img_path, img_suffix, label_suffix):
    """Return the label file matching ``img_path`` in ``label_dir``, or None."""
    candidates = [img_path.stem + label_suffix]
    # Second attempt handles compound suffixes (e.g. "_sar.png"): swap only the
    # trailing image suffix instead of replacing every occurrence in the name.
    if img_suffix and img_path.name.endswith(img_suffix):
        candidates.append(img_path.name[: -len(img_suffix)] + label_suffix)
    for name in candidates:
        candidate = label_dir / name
        if candidate.exists():
            return candidate
    return None


def main():
    """Evaluate the segmentation pipeline on a directory of image/label pairs.

    Parses command-line options, builds a ``SegEarthPipeline`` from the
    directory containing this script, runs it on every image matching
    ``--img-suffix`` and prints the mIoU and per-class IoU. Images without
    a matching label, or whose label is entirely ignore (255), are skipped.

    Raises:
        FileNotFoundError: If the image dir, label dir or class config is
            missing, or if no image matches ``--img-suffix``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--img-dir", required=True, help="Directory of images")
    parser.add_argument("--label-dir", required=True, help="Directory of labels")
    parser.add_argument("--config", default="configs/cls_openearthmap_sar.txt", help="Class config file")
    parser.add_argument("--img-suffix", default=".png", help="Image suffix")
    parser.add_argument("--label-suffix", default=".png", help="Label suffix")
    parser.add_argument("--reduce-zero-label", action="store_true", help="Label 1..N -> 0..N-1")
    parser.add_argument("--save-pred-dir", help="Save predictions to directory")
    parser.add_argument("--featup", default="jbu_one", help="SimFeatUp model")
    parser.add_argument("--slide-crop", type=int, default=0, help="Sliding window crop size (0=disabled)")
    parser.add_argument("--slide-stride", type=int, default=112, help="Sliding window stride")
    parser.add_argument("--cls-token-lambda", type=float, default=-0.3, help="Global Bias Alleviation")
    parser.add_argument("--logit-scale", type=float, default=50.0, help="Softmax temperature")
    parser.add_argument("--prob-thd", type=float, default=0.0, help="Low-confidence threshold")
    parser.add_argument("--limit", type=int, default=0, help="Limit number of samples (0=all)")
    parser.add_argument("--device", default="cuda", help="Device passed to the pipeline")
    args = parser.parse_args()

    repo_dir = Path(__file__).resolve().parent
    img_dir = Path(args.img_dir)
    label_dir = Path(args.label_dir)
    config_path = Path(args.config)
    if not config_path.is_absolute():
        # Relative configs are resolved against the script's own directory.
        config_path = repo_dir / config_path

    if not img_dir.exists():
        raise FileNotFoundError(f"Image dir not found: {img_dir}")
    if not label_dir.exists():
        raise FileNotFoundError(f"Label dir not found: {label_dir}")
    if not config_path.exists():
        raise FileNotFoundError(f"Config not found: {config_path}")

    # Collect images
    img_files = sorted(img_dir.glob(f"*{args.img_suffix}"))
    if args.limit:
        img_files = img_files[: args.limit]

    if not img_files:
        raise FileNotFoundError(f"No images in {img_dir}")

    # Build pipeline
    pipe = SegEarthPipeline(
        repo_dir,
        featup_model=args.featup,
        class_names_path=config_path,
        cls_token_lambda=args.cls_token_lambda,
        logit_scale=args.logit_scale,
        prob_thd=args.prob_thd,
        slide_crop=args.slide_crop,
        slide_stride=args.slide_stride,
        device=args.device,
    )
    num_classes = pipe.num_classes

    save_pred_dir = Path(args.save_pred_dir) if args.save_pred_dir else None
    if save_pred_dir:
        save_pred_dir.mkdir(parents=True, exist_ok=True)

    # The no-tqdm fallback may be a plain function without ``write``; fall
    # back to print so a missing label never crashes the run.
    log = getattr(tqdm, "write", print)

    all_ious = []
    for img_path in tqdm(img_files, desc="Evaluating"):
        label_path = _find_label_path(label_dir, img_path, args.img_suffix, args.label_suffix)
        if label_path is None:
            log(f"Skipping {img_path.name}: no label")
            continue

        with Image.open(img_path) as im:
            img = im.convert("RGB")
        gt = load_label(label_path, reduce_zero_label=args.reduce_zero_label)

        pred = pipe(img)
        pred_np = pred.cpu().numpy()

        # Resize pred to match gt if needed (nearest keeps class indices intact)
        if pred_np.shape != gt.shape:
            pred_pil = Image.fromarray(pred_np.astype(np.uint8))
            pred_pil = pred_pil.resize((gt.shape[1], gt.shape[0]), Image.NEAREST)
            pred_np = np.array(pred_pil)

        # Mask ignore: predictions on ignored pixels must not be scored.
        valid = gt != 255
        if valid.sum() == 0:
            continue
        pred_m = pred_np.copy()
        pred_m[~valid] = 255

        ious = compute_iou(pred_m, gt, num_classes, ignore_index=255)
        all_ious.append(ious)

        if save_pred_dir:
            out_path = save_pred_dir / (img_path.stem + "_pred.png")
            Image.fromarray(pred_np.astype(np.uint8)).save(out_path)

    if not all_ious:
        print("No valid samples.")
        return

    # NOTE: IoU is averaged per image (NaN = class absent from that image) and
    # then over classes; this is not the same as accumulating intersections
    # and unions over the whole dataset.
    all_ious = np.array(all_ious)
    mean_iou = np.nanmean(all_ious, axis=0)
    miou = np.nanmean(mean_iou)
    print(f"mIoU: {miou:.4f}")
    print("Per-class IoU:", mean_iou)


if __name__ == "__main__":
    main()
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
OV-2/weights/backbone/AlignEarth-SAR-ViT-B-16/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:604f0111c635195ec1a723d6a256f476b2c272f330f186a8edeec9f81a4cb560
3
+ size 598530372