BiliSakura committed · verified
Commit c40328c · Parent(s): 95b33b1

Upload GeoRSCLIP-ViT-H-14
GeoRSCLIP-ViT-H-14.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:62a3e79df886875976901b4a9b8d6a1c42b457879a5ce625c8eba9460cbc3a11
+ size 3944697282
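The `.pt` checkpoint is stored as a Git LFS pointer: the repository itself holds only the spec version, a sha256 `oid`, and the byte size, while the ~3.9 GB blob lives in LFS storage. A minimal Python sketch (assuming the checkpoint has already been downloaded into the working directory) to verify a download against the pointer's oid:

```python
# Sketch: verify a downloaded LFS object against the pointer's sha256 oid.
# The filename comes from this commit; the local path is an assumption.
import hashlib

EXPECTED_OID = "62a3e79df886875976901b4a9b8d6a1c42b457879a5ce625c8eba9460cbc3a11"

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream the file in 1 MiB chunks so the ~3.9 GB checkpoint never sits in memory."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

assert sha256_of("GeoRSCLIP-ViT-H-14.pt") == EXPECTED_OID, "checksum mismatch"
```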
README.md ADDED
@@ -0,0 +1,37 @@
+ ---
+ license: mit
+ tags:
+ - clip
+ - feature-extraction
+ - remote-sensing
+ ---
+
+ # GeoRSCLIP-ViT-H-14
+
+ This model is a mirror/redistribution of the original [GeoRSCLIP](https://huggingface.co/Zilun/GeoRSCLIP) model.
+
+ ## Original Repository and Links
+ - **Original Hugging Face Model**: [Zilun/GeoRSCLIP](https://huggingface.co/Zilun/GeoRSCLIP)
+ - **Official GitHub Repository**: [om-ai-lab/RS5M](https://github.com/om-ai-lab/RS5M)
+
+ ## Description
+ GeoRSCLIP is a vision-language foundation model for remote sensing, trained on a large-scale dataset of remote sensing image-text pairs (RS5M). It is based on the CLIP architecture and is designed to handle the unique characteristics of remote sensing imagery.
+
+ ## Citation
+ If you use this model in your research, please cite the original work:
+
+ ```bibtex
+ @article{zhangRS5MGeoRSCLIPLargeScale2024,
+   title = {{RS5M} and {GeoRSCLIP}: A Large-Scale Vision-Language Dataset and a Large Vision-Language Model for Remote Sensing},
+   shorttitle = {{RS5M} and {GeoRSCLIP}},
+   author = {Zhang, Zilun and Zhao, Tiancheng and Guo, Yulong and Yin, Jianwei},
+   year = {2024},
+   journal = {IEEE Transactions on Geoscience and Remote Sensing},
+   volume = {62},
+   pages = {1--23},
+   issn = {1558-0644},
+   doi = {10.1109/TGRS.2024.3449154},
+   urldate = {2024-12-15},
+   keywords = {Computational modeling,Data models,Domain VLM (DVLM),general VLM (GVLM),image-text paired dataset,Location awareness,parameter efficient tuning,Remote sensing,remote sensing (RS),RS cross-modal text-image retrieval (RSCTIR),semantic localization (SeLo),Semantics,Tuning,vision-language model (VLM),Visualization,zero-shot classification (ZSC)}
+ }
+ ```
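The commit ships the weights in three layouts: the original open_clip-style checkpoint (`GeoRSCLIP-ViT-H-14.pt`), a `transformers` CLIPModel under `transformers/`, and a diffusers-style text encoder under `diffusers/text_encoder/`. A loading sketch for the `transformers` layout; the repo id `BiliSakura/GeoRSCLIP-ViT-H-14` is inferred from this commit and may differ, and the example image path is hypothetical:

```python
# Sketch: zero-shot classification with the transformers/ subfolder weights.
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

repo_id = "BiliSakura/GeoRSCLIP-ViT-H-14"  # assumed repo id
model = CLIPModel.from_pretrained(repo_id, subfolder="transformers")
processor = CLIPProcessor.from_pretrained(repo_id, subfolder="transformers")

image = Image.open("scene.png")  # any remote sensing image (hypothetical path)
texts = ["a satellite image of an airport", "a satellite image of a forest"]
inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)

with torch.no_grad():
    outputs = model(**inputs)
probs = outputs.logits_per_image.softmax(dim=-1)  # per-caption scores
print(probs)
```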
diffusers/text_encoder/config.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "architectures": [
+     "CLIPTextModel"
+   ],
+   "attention_dropout": 0.0,
+   "bos_token_id": 49406,
+   "dropout": 0.0,
+   "dtype": "float32",
+   "eos_token_id": 49407,
+   "hidden_act": "quick_gelu",
+   "hidden_size": 1024,
+   "initializer_factor": 1.0,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 77,
+   "model_type": "clip_text_model",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 24,
+   "pad_token_id": 1,
+   "projection_dim": 768,
+   "transformers_version": "5.0.0.dev0",
+   "vocab_size": 49408
+ }
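This config describes a plain `transformers` `CLIPTextModel`, laid out the way diffusion pipelines expect a `text_encoder` component. A minimal sketch of loading it standalone, under the same assumed repo id as above:

```python
# Sketch: load just the text encoder from diffusers/text_encoder.
from transformers import CLIPTextModel

text_encoder = CLIPTextModel.from_pretrained(
    "BiliSakura/GeoRSCLIP-ViT-H-14",    # assumed repo id
    subfolder="diffusers/text_encoder",
)
print(text_encoder.config.hidden_size)  # 1024, per the config above
```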
diffusers/text_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:928493ea6f275b3e7a286fe7a003c888bc64478e044f8363f3270356a8ed919e
+ size 492265168
transformers/config.json ADDED
@@ -0,0 +1,46 @@
+ {
+   "architectures": [
+     "CLIPModel"
+   ],
+   "dtype": "float32",
+   "initializer_factor": 1.0,
+   "logit_scale_init_value": 2.6592,
+   "model_type": "clip",
+   "projection_dim": 768,
+   "text_config": {
+     "attention_dropout": 0.0,
+     "dropout": 0.0,
+     "dtype": "float32",
+     "hidden_act": "quick_gelu",
+     "hidden_size": 1024,
+     "initializer_factor": 1.0,
+     "initializer_range": 0.02,
+     "intermediate_size": 4096,
+     "layer_norm_eps": 1e-05,
+     "max_position_embeddings": 77,
+     "model_type": "clip_text_model",
+     "num_attention_heads": 16,
+     "num_hidden_layers": 24,
+     "projection_dim": 768,
+     "vocab_size": 49408
+   },
+   "transformers_version": "5.0.0.dev0",
+   "vision_config": {
+     "attention_dropout": 0.0,
+     "dropout": 0.0,
+     "dtype": "float32",
+     "hidden_act": "quick_gelu",
+     "hidden_size": 1280,
+     "image_size": 224,
+     "initializer_factor": 1.0,
+     "initializer_range": 0.02,
+     "intermediate_size": 5120,
+     "layer_norm_eps": 1e-05,
+     "model_type": "clip_vision_model",
+     "num_attention_heads": 16,
+     "num_channels": 3,
+     "num_hidden_layers": 32,
+     "patch_size": 14,
+     "projection_dim": 768
+   }
+ }
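The vision tower in this config matches the ViT-H/14 geometry: hidden size 1280, 32 layers, and 14-pixel patches on 224-pixel inputs, which gives (224/14)² = 256 patch tokens plus one class token per image. A quick sanity-check sketch, again under the assumed repo id:

```python
# Sketch: confirm the config describes a ViT-H/14 CLIP vision tower.
from transformers import CLIPConfig

cfg = CLIPConfig.from_pretrained(
    "BiliSakura/GeoRSCLIP-ViT-H-14",  # assumed repo id
    subfolder="transformers",
)
v = cfg.vision_config
num_patches = (v.image_size // v.patch_size) ** 2   # (224 // 14)**2 == 256
print(v.hidden_size, v.num_hidden_layers, num_patches + 1)  # 1280 32 257
```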
transformers/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
transformers/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:827291db8a381538e2bf5b9508d08d8f2e899a0c4600e18f4ad88ce7554cae9e
+ size 1710537716
transformers/processor_config.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "image_processor": {
+     "crop_size": {
+       "height": 224,
+       "width": 224
+     },
+     "do_center_crop": true,
+     "do_convert_rgb": true,
+     "do_normalize": true,
+     "do_rescale": true,
+     "do_resize": true,
+     "image_mean": [
+       0.48145466,
+       0.4578275,
+       0.40821073
+     ],
+     "image_processor_type": "CLIPImageProcessor",
+     "image_std": [
+       0.26862954,
+       0.26130258,
+       0.27577711
+     ],
+     "processor_class": "CLIPProcessor",
+     "resample": 3,
+     "rescale_factor": 0.00392156862745098,
+     "size": {
+       "shortest_edge": 224
+     }
+   },
+   "processor_class": "CLIPProcessor"
+ }
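These settings encode the standard CLIP preprocessing pipeline: convert to RGB, resize the shortest edge to 224 with bicubic resampling (`resample: 3`), center-crop to 224×224, rescale pixel values by 1/255, and normalize with the CLIP mean/std. For anyone preprocessing outside `transformers`, a torchvision sketch of the equivalent transform:

```python
# Sketch: torchvision equivalent of the CLIPImageProcessor settings above.
from torchvision import transforms

clip_preprocess = transforms.Compose([
    # shortest edge -> 224, bicubic (PIL resample code 3)
    transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(224),
    transforms.ToTensor(),  # maps [0, 255] uint8 to [0, 1], i.e. rescale_factor 1/255
    transforms.Normalize(
        mean=[0.48145466, 0.4578275, 0.40821073],
        std=[0.26862954, 0.26130258, 0.27577711],
    ),
])
# Usage: tensor = clip_preprocess(pil_image.convert("RGB"))  # do_convert_rgb
```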
transformers/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<|startoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
transformers/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
transformers/tokenizer_config.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "49406": {
+       "content": "<|startoftext|>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "49407": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<|startoftext|>",
+   "clean_up_tokenization_spaces": false,
+   "do_lower_case": true,
+   "eos_token": "<|endoftext|>",
+   "errors": "replace",
+   "extra_special_tokens": {},
+   "model_max_length": 77,
+   "pad_token": "<|endoftext|>",
+   "processor_class": "CLIPProcessor",
+   "tokenizer_class": "CLIPTokenizer",
+   "unk_token": "<|endoftext|>"
+ }
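This is the standard lower-casing CLIP BPE tokenizer: `<|startoftext|>`/`<|endoftext|>` as bos/eos (ids 49406/49407), `<|endoftext|>` doubling as pad and unk, and a 77-token context window (`model_max_length`). A short usage sketch under the assumed repo id:

```python
# Sketch: tokenize a caption the way the CLIP text encoder expects.
from transformers import CLIPTokenizer

tok = CLIPTokenizer.from_pretrained(
    "BiliSakura/GeoRSCLIP-ViT-H-14",  # assumed repo id
    subfolder="transformers",
)
enc = tok(
    "A satellite image of a river delta",
    padding="max_length", max_length=77, return_tensors="pt",
)
print(enc.input_ids.shape)                          # torch.Size([1, 77])
print(enc.input_ids[0, 0].item(), tok.bos_token_id)  # 49406 49406
```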
transformers/vocab.json ADDED
The diff for this file is too large to render. See raw diff