upload model

Browse files

Files changed (4) hide show

README.md +119 -3
config.json +18 -0
levir_vocab.json +1 -0
pytorch_model.bin +3 -0

README.md CHANGED Viewed

@@ -1,3 +1,119 @@
----
-license: mit
----

+---
+language:
+- en
+license: mit
+library_name: pytorch
+tags:
+- remote-sensing
+- change-detection
+- image-captioning
+- multimodal
+- retrieval
+datasets:
+- levir-cc
+pipeline_tag: image-to-text
+---
+# RSICRC: Multimodal Remote Sensing Image Change Retrieval and Captioning
+**RSICRC** is a multimodal foundation model designed for **bi-temporal remote sensing images**. It jointly performs **change captioning** (describing changes between two images) and **text-image retrieval** (finding image pairs that match a text description).
+The model leverages contrastive learning and a decoupled decoder architecture to handle both tasks simultaneously.
+## 📄 Paper
+**Towards a Multimodal Framework for Remote Sensing Image Change Retrieval and Captioning** *Roger Ferrod, Luigi Di Caro, Dino Ienco* Published at **Discovery Science 2024**
+[**Read the Paper**](https://doi.org/10.1007/978-3-031-78980-9_15) | [**GitHub Repository**](https://github.com/rogerferrod/RSICRC)
+## 🏗️ Model Architecture
+The framework is inspired by **CoCa** but adapted for bi-temporal remote sensing data.
+* **Encoder:** A Siamese network (ResNet-50 or ViT via OpenCLIP) that encodes "before" and "after" images. A Hierarchical Self-Attention (HSA) block and a residual block with a cosine mask fuse the bi-temporal features.
+* **Decoder:** A decoupled Transformer decoder split into:
+    * **Unimodal Layers:** Encode text only (used for contrastive alignment).
+    * **Multimodal Layers:** Apply cross-attention between visual and textual features to generate captions.
+## 💻 Usage
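+This model is not loadable through a standard library class: you need the custom source code from the [GitHub repository](https://github.com/rogerferrod/RSICRC), which provides the `src.model` module imported below.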
+### Inference Code
+```python
+import torch
+import json
+import open_clip
+from huggingface_hub import hf_hub_download
+from src.model import ICCModel
+# 1. Download necessary files from Hugging Face
+repo_id = "rogerferrod/RSICRC"
+config_path = hf_hub_download(repo_id=repo_id, filename="config.json")
+vocab_path = hf_hub_download(repo_id=repo_id, filename="levir_vocab.json")
+weights_path = hf_hub_download(repo_id=repo_id, filename="pytorch_model.bin")
+# 2. Load Configuration and Vocabulary
+with open(config_path, 'r') as f:
+    config = json.load(f)
+with open(vocab_path, 'r') as f:
+    vocab = json.load(f)
+# 3. Setup Device and Backbone (OpenCLIP)
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+clip_model, _, preprocess = open_clip.create_model_and_transforms(config['backbone'])
+# 4. Initialize the Model
+model = ICCModel(
+    device=device,
+    clip=clip_model,
+    backbone=config['backbone'],
+    d_model=config['d_model'],
+    vocab_size=len(vocab),
+    max_len=config['max_len'],
+    num_heads=config['num_heads'],
+    h_dim=config['h_dim'],
+    a_dim=config['a_dim'],
+    encoder_layers=config['encoder_layers'],
+    decoder_layers=config['decoder_layers'],
+    dropout=config['dropout'],
+    learnable=config['learnable'],
+    fine_tune=config['fine_tune'],
+    tie_embeddings=config['tie_embeddings'],
+    prenorm=config['prenorm']
+)
+# 5. Load Weights
+model.load_state_dict(torch.load(weights_path, map_location=device))
+model = model.to(device)
+model.eval()
+print("Model loaded successfully!")
+```
+## 📚 Citation
+If you use this model or code in your research, please cite our paper:
+```bibtex
+@InProceedings{10.1007/978-3-031-78980-9_15,
+author       = {Roger Ferrod and
+                  Luigi Di Caro and
+                  Dino Ienco},
+editor       = {Dino Pedreschi and
+                Anna Monreale and
+                Riccardo Guidotti and
+                Roberto Pellungrini and
+                Francesca Naretto},
+title        = {Towards a Multimodal Framework for Remote Sensing Image Change Retrieval
+                and Captioning},
+booktitle    = {Discovery Science - 27th International Conference, {DS} 2024, Pisa,
+                Italy, October 14-16, 2024, Proceedings, Part {II}},
+series       = {Lecture Notes in Computer Science},
+volume       = {15244},
+pages        = {231--245},
+publisher    = {Springer},
+year         = {2024},
+url          = {https://doi.org/10.1007/978-3-031-78980-9\_15},
+doi          = {10.1007/978-3-031-78980-9\_15}
+}
+```

config.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+  "backbone": "RN50",
+  "d_model": 2048,
+  "max_len": 41,
+  "encoder_layers": 3,
+  "decoder_layers": 1,
+  "num_heads": 8,
+  "h_dim": 512,
+  "a_dim": 2048,
+  "dropout": 0.1,
+  "learnable": false,
+  "fine_tune": true,
+  "tie_embeddings": true,
+  "prenorm": false,
+  "s-transformers": "sentence-transformers/msmarco-distilbert-cos-v5",
+  "s-threshold": 1.0,
+  "fna": true
+}

levir_vocab.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"the": 4, "a": 5, "is": 6, "has": 7, "road": 8, "and": 9, "built": 10, "are": 11, "no": 12, "houses": 13, "on": 14, "of": 15, "two": 16, "scene": 17, "in": 18, "some": 19, "roads": 20, "appear": 21, "as": 22, "almost": 23, "there": 24, "changed": 25, "change": 26, "same": 27, "before": 28, "occurred": 29, "identical": 30, "seem": 31, "scenes": 32, "difference": 33, "nothing": 34, "buildings": 35, "bareland": 36, "many": 37, "with": 38, "appears": 39, "been": 40, "at": 41, "trees": 42, "along": 43, "constructed": 44, "villas": 45, "several": 46, "have": 47, "building": 48, "corner": 49, "removed": 50, "sides": 51, "both": 52, "house": 53, "top": 54, "right": 55, "left": 56, "bottom": 57, "beside": 58, "to": 59, "parking": 60, "woods": 61, "rows": 62, "lot": 63, "by": 64, "area": 65, "forest": 66, "side": 67, "near": 68, "around": 69, "plants": 70, "original": 71, "residential": 72, "desert": 73, "new": 74, "replace": 75, "large": 76, "row": 77, "three": 78, "small": 79, "vegetation": 80, "replaced": 81, "disappear": 82, "grass": 83, "neatly": 84, "upper": 85, "one": 86, "up": 87, "more": 88, "arranged": 89, "appeared": 90, "into": 91, "next": 92, "detached": 93, "open": 94, "space": 95, "disappears": 96, "lots": 97, "wasteland": 98, "villa": 99, "clearing": 100, "it": 101, "path": 102, "meadow": 103, "massive": 104, "crossroad": 105, "across": 106, "show": 107, "center": 108, "between": 109, "middle": 110, "cement": 111, "edge": 112, "lower-right": 113, "four": 114, "becomes": 115, "among": 116, "concrete": 117, "lower-left": 118, "developed": 119, "winding": 120, "former": 121, "grassland": 122, "few": 123, "straight": 124, "completed": 125, "vertical": 126, "an": 127, "rebuilt": 128, "shows": 129, "alongside": 130, "square": 131, "replaces": 132, "t-shaped": 133, "newly": 134, "them": 135, "cross": 136, "erected": 137, "most": 138, "roadside": 139, "connected": 140, "end": 141, "cars": 142, "main": 143, "ground": 144, "big": 145, "crossing": 146, "neat": 147, 
"turning": 148, "lake": 149, "part": 150, "lines": 151, "pool": 152, "another": 153, "added": 154, "this": 155, "old": 156, "bushes": 157, "woodland": 158, "half": 159, "all": 160, "become": 161, "ones": 162, "scattered": 163, "disappeared": 164, "construction": 165, "lower": 166, "other": 167, "above": 168, "replacing": 169, "line": 170, "branch": 171, "through": 172, "paths": 173, "located": 174, "existing": 175, "place": 176, "below": 177, "group": 178, "wide": 179, "street": 180, "surrounded": 181, "lush": 182, "sparse": 183, "dirt": 184, "front": 185, "turned": 186, "field": 187, "huge": 188, "dense": 189, "bare": 190, "surrounding": 191, "extended": 192, "reconstructed": 193, "swimming": 194, "down": 195, "ring": 196, "cut": 197, "track": 198, "runs": 199, "cleared": 200, "giant": 201, "land": 202, "room": 203, "intersecting": 204, "curved": 205, "parallel": 206, "blocks": 207, "widened": 208, "site": 209, "constructions": 210, "situated": 211, "join": 212, "circular": 213, "white": 214, "from": 215, "complex": 216, "parked": 217, "bypass": 218, "five": 219, "while": 220, "narrow": 221, "vanished": 222, "screen": 223, "playground": 224, "turn": 225, "distributed": 226, "long": 227, "wood": 228, "lusher": 229, "grow": 230, "planted": 231, "areas": 232, "turns": 233, "rooms": 234, "green": 235, "crossroads": 236, "nearby": 237, "extends": 238, "vehicles": 239, "either": 240, "water": 241, "single": 242, "round": 243, "grown": 244, "arc": 245, "decreases": 246, "separate": 247, "mansion": 248, "circle": 249, "run": 250, "storage": 251, "pools": 252, "shrubs": 253, "staggered": 254, "withered": 255, "for": 256, "expanded": 257, "grows": 258, "forests": 259, "bungalow": 260, "others": 261, "quantity": 262, "finished": 263, "places": 264, "mansions": 265, "warehouse": 266, "factory": 267, "asphalt": 268, "tanks": 269, "medium": 270, "angle": 271, "intersection": 272, "surround": 273, "picture": 274, "sites": 275, "parts": 276, "corners": 277, "renovated": 278, 
"level": 279, "leading": 280, "bigger": 281, "higher": 282, "filled": 283, "floor": 284, "structures": 285, "bungalows": 286, "roadsides": 287, "covered": 288, "certain": 289, "river": 290, "park": 291, "roundabout": 292, "lined": 293, "pond": 294, "densely": 295, "increased": 296, "branches": 297, "extend": 298, "its": 299, "facilities": 300, "fields": 301, "mall": 302, "greatly": 303, "trucks": 304, "running": 305, "stand": 306, "attached": 307, "yards": 308, "connecting": 309, "larger": 310, "vacant": 311, "wider": 312, "restored": 313, "formed": 314, "unsurfaced": 315, "extra": 316, "reservoir": 317, "number": 318, "under": 319, "basketball": 320, "central": 321, "out": 322, "jungle": 323, "decreased": 324, "vanishes": 325, "much": 326, "risen": 327, "behind": 328, "court": 329, "fill": 330, "converge": 331, "well": 332, "village": 333, "red": 334, "longer": 335, "yard": 336, "block": 337, "playgrounds": 338, "horizontal": 339, "each": 340, "bypasses": 341, "closely": 342, "piece": 343, "greener": 344, "tower": 345, "that": 346, "divided": 347, "tracks": 348, "meadows": 349, "where": 350, "spaced": 351, "build": 352, "tank": 353, "reshaped": 354, "widens": 355, "abandoned": 356, "warehouses": 357, "demolished": 358, "malls": 359, "trails": 360, "moorland": 361, "connects": 362, "squares": 363, "rugged": 364, "t-junction": 365, "only": 366, "containers": 367, "full": 368, "less": 369, "make": 370, "dry": 371, "going": 372, "leveled": 373, "groups": 374, "were": 375, "reduced": 376, "broadened": 377, "uneven": 378, "image": 379, "transformed": 380, "respective": 381, "but": 382, "streets": 383, "yellow": 384, "joint": 385, "viaducts": 386, "flat": 387, "enlarged": 388, "orderly": 389, "foundation": 390, "thicker": 391, "smaller": 392, "trail": 393, "mounds": 394, "realized": 395, "which": 396, "increases": 397, "pasture": 398, "growing": 399, "was": 400, "structure": 401, "courts": 402, "similar": 403, "car": 404, "converted": 405, "surrounds": 406, "form": 407, 
"consisting": 408, "blue": 409, "complexes": 410, "opposite": 411, "shaped": 412, "besides": 413, "depots": 414, "remaining": 415, "brushwoods": 416, "divide": 417, "lane": 418, "intersect": 419, "bank": 420, "roof": 421, "fewer": 422, "u-shaped": 423, "empty": 424, "t": 425, "ponds": 426, "cottage": 427, "sundries": 428, "overpasses": 429, "additional": 430, "denser": 431, "fades": 432, "width": 433, "pulled": 434, "directions": 435, "platform": 436, "luxuriant": 437, "stretches": 438, "vibrant": 439, "rectangular": 440, "vitals": 441, "stadium": 442, "flattened": 443, "playing": 444, "spaces": 445, "stuff": 446, "hole": 447, "viaduct": 448, "raw": 449, "take": 450, "barelands": 451, "stretch": 452, "continues": 453, "tree": 454, "woodlands": 455, "face": 456, "comes": 457, "six": 458, "cluster": 459, "takes": 460, "uncompleted": 461, "moved": 462, "boxes": 463, "missing": 464, "curve": 465, "bridges": 466, "different": 467, "PAD": 0, "START": 1, "UNK": 2, "END": 3}

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1b55e57f2281a6ba81a6bfef363905e06ab6eaf3b0d11152ab70083cf9c7c731
+size 1404020822