ValentineKRAFTON committed
Commit acd771b · verified · 1 Parent(s): 61289b0

initial commit
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/photo.jpg filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
+__pycache__/
LICENSE ADDED
@@ -0,0 +1,191 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to the Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by the Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding any notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   Copyright 2024-2026 Raon Vision Team
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
LICENSES/MIT-OpenAI-CLIP.txt ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 OpenAI
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
LICENSES/MIT-OpenCLIP.txt ADDED
@@ -0,0 +1,25 @@
+MIT License
+
+Copyright (c) 2012-2021 Gabriel Ilharco, Mitchell Wortsman,
+Nicholas Carlini, Rohan Taori, Achal Dave, Vaishaal Shankar,
+John Miller, Hongseok Namkoong, Hannaneh Hajishirzi, Ali Farhadi,
+Ludwig Schmidt
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
NOTICE ADDED
@@ -0,0 +1,60 @@
+raon-vision-encoder
+Copyright 2024-2026 Raon Vision Team
+
+This product includes software derived from the following projects:
+
+===============================================================================
+OpenCLIP
+https://github.com/mlfoundations/open_clip
+Licensed under the MIT License (see LICENSES/MIT-OpenCLIP.txt)
+
+Copyright (c) 2012-2021 Gabriel Ilharco, Mitchell Wortsman,
+Nicholas Carlini, Rohan Taori, Achal Dave, Vaishaal Shankar,
+John Miller, Hongseok Namkoong, Hannaneh Hajishirzi, Ali Farhadi,
+Ludwig Schmidt
+
+Used in: model/ and train/ packages (LocCa, CLIP, loss, factory,
+transformer, data pipeline, training loop, etc.)
+
+===============================================================================
+OpenAI CLIP
+https://github.com/openai/CLIP
+Licensed under the MIT License (see LICENSES/MIT-OpenAI-CLIP.txt)
+
+Copyright (c) 2021 OpenAI
+
+Used in: model/tokenizer.py, model/bpe_simple_vocab_16e6.txt.gz
+
+===============================================================================
+Meta Platforms, Inc. (MAE / MoCo v3)
+Licensed under the MIT License via OpenCLIP
+
+Copyright (c) Meta Platforms, Inc. and affiliates
+
+Used in: model/pos_embed.py (sincos position embedding utilities)
+
+===============================================================================
+timm (pytorch-image-models)
+https://github.com/huggingface/pytorch-image-models
+Licensed under the Apache License 2.0
+
+Copyright (c) Ross Wightman
+
+Used in: model/transform.py (ResizeKeepRatio)
+
+===============================================================================
+References
+
+The following papers informed the design and implementation of features
+in this software. Code was independently implemented unless noted above.
+
+- CoCa: Yu et al., "CoCa: Contrastive Captioners are Image-Text Foundation Models", 2022
+- SigLIP: Zhai et al., "Sigmoid Loss for Language Image Pre-Training", 2023
+- SigLIP2: Tschannen et al., "SigLIP 2: Multilingual Vision-Language Encoders", 2025
+- DINO: Caron et al., "Emerging Properties in Self-Supervised Vision Transformers", 2021
+- DINOv2: Oquab et al., "DINOv2: Learning Robust Visual Features without Supervision", 2024
+- SILC: Naeem et al., "SILC: Improving Vision Language Pretraining with Self-Distillation", 2023
+- TIPS: Huang et al., "TIPS: Text-Image Pretraining with Spatial Awareness", 2024
+- Koleo: Sablayrolles et al., "Spreading vectors for similarity search", ICLR 2019
+- Gram Anchoring: Simeoni et al., "DINOv3", 2025 (independently implemented)
+- NaFlex: from SigLIP2 / PaLI (independently implemented in PyTorch)
README.md ADDED
@@ -0,0 +1,83 @@
+---
+library_name: transformers
+tags:
+- vision
+- image-text
+- clip
+- zero-shot
+---
+
+<div align="center">
+<img class="block dark:hidden" src="assets/Raon-VisionEncoder-Gradient-Black.png" alt="Raon VisionEncoder" width="600">
+<img class="hidden dark:block" src="assets/Raon-VisionEncoder-Gradient-White.png" alt="Raon VisionEncoder" width="600">
+</div>
+
+<p align="center">
+<a href="https://www.krafton.ai/ko/"><img src="https://img.shields.io/badge/Homepage-KRAFTON%20AI-blue?style=flat&logo=google-chrome&logoColor=white" alt="Homepage"></a>
+<br>
+<a href="https://huggingface.co/KRAFTON"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-KRAFTON-yellow?style=flat" alt="Hugging Face"></a>
+<a href="https://x.com/Krafton_AI"><img src="https://img.shields.io/badge/X-KRAFTON%20AI-white?style=flat&logo=x&logoColor=black" alt="X"></a>
+<br>
+<a href="https://www.apache.org/licenses/LICENSE-2.0"><img src="https://img.shields.io/badge/License-Apache%202.0-lightgrey?style=flat" alt="License"></a>
+</p>
+
+**Raon-VisionEncoder** is a 1.14B-parameter vision-language foundation model by [KRAFTON](https://www.krafton.com) for image and text feature extraction.
+It supports zero-shot image classification, image-text retrieval, and native aspect ratio inference via NaFlex.
+It is built on [OpenCLIP](https://github.com/mlfoundations/open_clip) with a LocCa (Localized CoCa) architecture and a ViT-SO400M vision encoder.
+
+## Pretrained Models
+
+| Model | Params (Inference) | Vision | Text | Patch Size | NaFlex Default Patches |
+|-------|--------------------|--------|------|------------|------------------------|
+| LocCa ViT-SO400M-16-SigLIP2 | 1.14B | 0.43B | 0.71B | 16x16 | 256 |
+
+## Requirements
+
+```bash
+pip install torch torchvision timm transformers huggingface-hub safetensors ftfy
+```
+
+## Quick Start
+
+```python
+import torch
+from transformers import AutoModel
+from PIL import Image
+
+# Load model + processor
+model = AutoModel.from_pretrained("KRAFTON/Raon-VisionEncoder", trust_remote_code=True)
+model = model.to(dtype=torch.bfloat16).eval()
+processor = model.get_processor("KRAFTON/Raon-VisionEncoder")
+
+# Encode image and text
+img_inputs = processor(images=Image.open("assets/photo.jpg"))
+txt_inputs = processor(text=["a cat", "a dog"])
+
+with torch.no_grad():
+    img_feat = model.encode_image(**img_inputs)
+    txt_feat = model.encode_text(**txt_inputs)
+
+# Compute similarity with learned scale and bias
+logits = model.logit_scale.exp() * (img_feat @ txt_feat.T) + model.logit_bias
+probs = logits.softmax(dim=-1)
+print(probs)
+```
+
+## API Reference
+
+| Method | Input | Output |
+|--------|-------|--------|
+| `model.encode_image(**inputs)` | Processor output (image) | `[B, 1152]` normalized image features |
+| `model.encode_text(**inputs)` | Processor output (text) | `[B, 1152]` normalized text features |
+| `model.logit_scale` | - | Learned temperature parameter |
+| `model.logit_bias` | - | Learned bias parameter |
+| `model.get_processor(repo_id)` | HuggingFace repo ID | Processor instance |
+| `processor(images=img)` | PIL Image | Preprocessed image dict |
+| `processor(text=["a cat"])` | list of strings | Tokenized text dict |
+
+## License
+
+This repository is licensed under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0).
+Third-party notices are listed in [NOTICE](NOTICE).
+
+© 2026 KRAFTON
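The Quick Start's final step (scale the cosine similarities, add the bias, take a softmax over captions) can be traced numerically without loading the model. This is a plain-Python sketch; the scale, bias, and cosine values are made-up placeholders, not outputs of the trained checkpoint:

```python
import math

# One image scored against two captions. Illustrative numbers only.
logit_scale_exp = 100.0          # stands in for model.logit_scale.exp()
logit_bias = -10.0               # stands in for model.logit_bias
cosine = [0.31, 0.12]            # image-text cosine similarities

logits = [logit_scale_exp * c + logit_bias for c in cosine]

# Numerically stable softmax over the caption axis
m = max(logits)
exps = [math.exp(l - m) for l in logits]
probs = [e / sum(exps) for e in exps]
print(probs)  # heavily favors the first caption
```

Because the learned temperature is large, even modest cosine gaps turn into near-one-hot probabilities after the softmax.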
assets/Raon-VisionEncoder-Gradient-Black.png ADDED
assets/Raon-VisionEncoder-Gradient-White.png ADDED
assets/photo.jpg ADDED

Git LFS Details

  • SHA256: 63399d70c550f7e0fb3738d954f04918f0f4a2532e43cc6a1b90ac82d104ed16
  • Pointer size: 131 Bytes
  • Size of remote file: 281 kB
config.json ADDED
@@ -0,0 +1,40 @@
+{
+  "architectures": [
+    "RaonVEModel"
+  ],
+  "model_type": "raon_ve",
+  "auto_map": {
+    "AutoConfig": "configuration_raonve.RaonVEConfig",
+    "AutoModel": "modeling_raonve.RaonVEModel"
+  },
+  "embed_dim": 1152,
+  "init_logit_bias": -10,
+  "vision_config": {
+    "image_size": 256,
+    "timm_model_name": "vit_so400m_patch16_siglip_256",
+    "timm_model_pretrained": false,
+    "timm_pool": "map",
+    "timm_proj": "none"
+  },
+  "text_config": {
+    "context_length": 64,
+    "vocab_size": 256000,
+    "hf_tokenizer_name": "timm/ViT-SO400M-16-SigLIP2-256",
+    "tokenizer_kwargs": {
+      "clean": "canonicalize"
+    },
+    "width": 1152,
+    "heads": 16,
+    "layers": 27,
+    "mlp_ratio": 3.7362,
+    "no_causal_mask": true,
+    "proj_bias": true,
+    "pool_type": "last",
+    "norm_kwargs": {
+      "eps": 1e-06
+    },
+    "act_kwargs": {
+      "approximate": "tanh"
+    }
+  }
+}
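The `"init_logit_bias": -10` entry in config.json is worth a quick numeric check. Assuming a SigLIP-style sigmoid objective (as the SigLIP references in NOTICE suggest), a pair's match probability is `sigmoid(scale * cosine + bias)`, so a strongly negative initial bias makes every image-text pair start out "unmatched". The sketch below uses an arbitrary stand-in scale, not the trained value:

```python
import math

def pair_prob(cosine, scale_exp=10.0, bias=-10.0):
    # sigmoid(scale * cosine + bias): SigLIP-style pairwise match probability.
    # scale_exp is a hypothetical stand-in for exp(logit_scale).
    return 1.0 / (1.0 + math.exp(-(scale_exp * cosine + bias)))

print(pair_prob(0.0))  # sigmoid(-10): roughly 4.5e-05, near zero
print(pair_prob(1.0))  # sigmoid(0): exactly 0.5
```

At initialization, random features have near-zero cosine similarity, so the bias of -10 pins almost all pairwise probabilities near zero, matching the fact that most pairs in a batch are negatives.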
configuration_raonve.py ADDED
@@ -0,0 +1,96 @@
+"""Raon-VisionEncoder configuration."""
+
+from transformers import PretrainedConfig
+
+
+class RaonVEVisionConfig(PretrainedConfig):
+    model_type = "raon_ve_vision"
+
+    def __init__(
+        self,
+        image_size=256,
+        timm_model_name="vit_so400m_patch16_siglip_256",
+        timm_model_pretrained=False,
+        timm_pool="map",
+        timm_proj="none",
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.image_size = image_size
+        self.timm_model_name = timm_model_name
+        self.timm_model_pretrained = timm_model_pretrained
+        self.timm_pool = timm_pool
+        self.timm_proj = timm_proj
+
+
+class RaonVETextConfig(PretrainedConfig):
+    model_type = "raon_ve_text"
+
+    def __init__(
+        self,
+        context_length=64,
+        vocab_size=256000,
+        width=1152,
+        heads=16,
+        layers=27,
+        mlp_ratio=3.7362,
+        no_causal_mask=True,
+        proj_bias=True,
+        pool_type="last",
+        hf_tokenizer_name="timm/ViT-SO400M-16-SigLIP2-256",
+        tokenizer_kwargs=None,
+        norm_kwargs=None,
+        act_kwargs=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.context_length = context_length
+        self.vocab_size = vocab_size
+        self.width = width
+        self.heads = heads
+        self.layers = layers
+        self.mlp_ratio = mlp_ratio
+        self.no_causal_mask = no_causal_mask
+        self.proj_bias = proj_bias
+        self.pool_type = pool_type
+        self.hf_tokenizer_name = hf_tokenizer_name
+        self.tokenizer_kwargs = tokenizer_kwargs or {"clean": "canonicalize"}
+        self.norm_kwargs = norm_kwargs or {"eps": 1e-6}
+        self.act_kwargs = act_kwargs or {"approximate": "tanh"}
+
+
+class RaonVEConfig(PretrainedConfig):
+    model_type = "raon_ve"
+    is_composition = True
+
+    def __init__(
+        self,
+        embed_dim=1152,
+        init_logit_bias=-10,
+        vision_config=None,
+        text_config=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.embed_dim = embed_dim
+        self.init_logit_bias = init_logit_bias
+
+        if isinstance(vision_config, dict):
+            self.vision_config = RaonVEVisionConfig(**vision_config)
+        elif vision_config is None:
+            self.vision_config = RaonVEVisionConfig()
+        else:
+            self.vision_config = vision_config
+
+        if isinstance(text_config, dict):
+            self.text_config = RaonVETextConfig(**text_config)
+        elif text_config is None:
+            self.text_config = RaonVETextConfig()
+        else:
+            self.text_config = text_config
+
+    def to_dict(self):
+        output = super().to_dict()
+        output["vision_config"] = self.vision_config.to_dict()
+        output["text_config"] = self.text_config.to_dict()
+        return output
modeling_raonve.py ADDED
@@ -0,0 +1,235 @@
+"""Raon-VisionEncoder model."""
+
+import importlib
+import os
+import sys
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers import PreTrainedModel
+
+from .configuration_raonve import RaonVEConfig
+
+
+_raon_repo_id = None
+
+def set_repo_id(repo_id):
+    global _raon_repo_id
+    _raon_repo_id = repo_id
+
+def _ensure_raon_package():
+    """Import raon_vision_encoder, downloading from HF Hub if needed."""
+    try:
+        clip_mod = importlib.import_module("raon_vision_encoder.clip")
+        return clip_mod.CustomTextCLIP
+    except (ImportError, ModuleNotFoundError):
+        pass
+
+    from huggingface_hub import snapshot_download
+    repo_id = _raon_repo_id or "KRAFTON/Raon-VisionEncoder"
+    repo_dir = snapshot_download(repo_id, allow_patterns=["raon_vision_encoder/**"])
+    sys.path.insert(0, repo_dir)
+
+    for key in list(sys.modules.keys()):
+        if key.startswith("raon_vision_encoder"):
+            del sys.modules[key]
+
+    clip_mod = importlib.import_module("raon_vision_encoder.clip")
+    return clip_mod.CustomTextCLIP
+
+
+class RaonVEPreTrainedModel(PreTrainedModel):
+    config_class = RaonVEConfig
+    base_model_prefix = ""
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        pass
+
+
+class RaonVEModel(RaonVEPreTrainedModel):
+    config_class = RaonVEConfig
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
+        set_repo_id(str(pretrained_model_name_or_path))
+        return super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
+
+    def __init__(self, config: RaonVEConfig):
+        super().__init__(config)
+
+        vision_cfg = {
+            "image_size": config.vision_config.image_size,
+            "timm_model_name": config.vision_config.timm_model_name,
+            "timm_model_pretrained": config.vision_config.timm_model_pretrained,
+            "timm_pool": config.vision_config.timm_pool,
+            "timm_proj": config.vision_config.timm_proj,
+        }
+        text_cfg = {
+            "context_length": config.text_config.context_length,
+            "vocab_size": config.text_config.vocab_size,
+            "width": config.text_config.width,
+            "heads": config.text_config.heads,
+            "layers": config.text_config.layers,
+            "mlp_ratio": config.text_config.mlp_ratio,
+            "no_causal_mask": config.text_config.no_causal_mask,
+            "proj_bias": config.text_config.proj_bias,
+            "pool_type": config.text_config.pool_type,
+            "hf_tokenizer_name": config.text_config.hf_tokenizer_name,
+            "tokenizer_kwargs": config.text_config.tokenizer_kwargs,
+            "norm_kwargs": config.text_config.norm_kwargs,
+            "act_kwargs": config.text_config.act_kwargs,
+        }
+
+        CustomTextCLIP = _ensure_raon_package()
+        inner = CustomTextCLIP(
+            embed_dim=config.embed_dim,
+            vision_cfg=vision_cfg,
+            text_cfg=text_cfg,
+            init_logit_bias=config.init_logit_bias,
+        )
+
+        self.visual = inner.visual
+        self.text = inner.text
+        self.logit_scale = inner.logit_scale
+        self.logit_bias = inner.logit_bias
+
+        # Enable NaFlex by default
+        self.visual._setup_1d_forward()
+
+        self.post_init()
+
+    def encode_image(self, pixel_values, pixel_attention_mask=None, spatial_shapes=None):
+        """Encode images to normalized feature vectors [B, 1152].
+        Pass the output of processor(images=...) directly via **inputs.
+        """
+        kwargs = {}
+        if pixel_attention_mask is not None:
+            kwargs["patch_valid_mask"] = pixel_attention_mask
+        if spatial_shapes is not None:
+            kwargs["spatial_shapes"] = spatial_shapes
+        features = self.visual(pixel_values, **kwargs) if kwargs else self.visual(pixel_values)
+        return F.normalize(features, dim=-1)
+
+    def encode_text(self, input_ids):
+        """Encode text to normalized feature vectors [B, 1152].
+        Pass the output of processor(text=...) directly via **inputs.
+        """
+        features = self.text(input_ids)
+        return F.normalize(features, dim=-1)
+
+    def forward(self, pixel_values=None, input_ids=None, pixel_attention_mask=None, spatial_shapes=None):
+        image_features = None
+        text_features = None
+
+        if pixel_values is not None:
+            image_features = self.encode_image(
+                pixel_values,
+                pixel_attention_mask=pixel_attention_mask,
+                spatial_shapes=spatial_shapes,
+            )
+        if input_ids is not None:
+            text_features = self.encode_text(input_ids)
+
+        output = {
+            "image_features": image_features,
+            "text_features": text_features,
+            "logit_scale": self.logit_scale,
+            "logit_bias": self.logit_bias,
+        }
+        return output
+
+    @staticmethod
+    def get_processor(pretrained_model_name_or_path, **kwargs):
+        """Get the processor for this model."""
+        return RaonVEProcessor.from_pretrained(pretrained_model_name_or_path, **kwargs)
+
+
+class RaonVEProcessor:
+    """Image and text processor for Raon-VisionEncoder.
+
+    Preprocesses images into NaFlex patch sequences and tokenizes text.
+
+    Args:
+        max_num_patches: Maximum number of patches per image (controls resolution).
+            Higher values preserve more detail. Default: 256.
+    """
+
+    DEFAULT_MAX_PATCHES = 256
+
+    def __init__(self, patch_size=16, mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711), tokenizer=None):
+        from torchvision import transforms as T
+        self.patch_size = patch_size
+        self.mean, self.std = mean, std
+        self.tokenizer = tokenizer
+        self._post = T.Compose([T.ToTensor(), T.Normalize(mean=list(mean), std=list(std))])
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        import json
+        from pathlib import Path as _Path
+        if _Path(pretrained_model_name_or_path).is_dir():
+            cfg_path = _Path(pretrained_model_name_or_path) / "config.json"
+        else:
+            from huggingface_hub import hf_hub_download
+            cfg_path = hf_hub_download(pretrained_model_name_or_path, "config.json")
+        with open(cfg_path) as f:
+            cfg = json.load(f)
+        v = cfg.get("vision_config", {})
+        t = cfg.get("text_config", {})
+        ps = 16
+        for part in v.get("timm_model_name", "").split("_"):
+            if part.startswith("patch") and part[5:].isdigit():
+                ps = int(part[5:])
+                break
+        tokenizer = None
+        if t.get("hf_tokenizer_name"):
+            _ensure_raon_package()
+            tok_mod = importlib.import_module("raon_vision_encoder.tokenizer")
+            tokenizer = tok_mod.HFTokenizer(
+                t["hf_tokenizer_name"], context_length=t.get("context_length", 64),
+                tokenizer_mode=t.get("tokenizer_mode"), **t.get("tokenizer_kwargs", {}),
+            )
+        return cls(patch_size=ps, tokenizer=tokenizer)
+
+    def __call__(self, images=None, text=None, max_num_patches=None, return_tensors="pt"):
+        """Process images and/or text.
+
+        Args:
+            images: PIL Image or list of PIL Images.
+            text: String or list of strings.
+            max_num_patches: Resolution budget (default: 256). Higher = more detail.
+
+        Returns:
+            Dict with 'pixel_values', 'pixel_attention_mask', 'spatial_shapes' for images
+            and/or 'input_ids' for text.
+        """
+        from PIL import Image
+        result = {}
+        if images is not None:
+            mnp = max_num_patches or self.DEFAULT_MAX_PATCHES
+            _ensure_raon_package()
+            transform_mod = importlib.import_module("raon_vision_encoder.transform")
+            get_size = transform_mod.get_image_size_for_max_num_patches
+            imgs = [images] if isinstance(images, Image.Image) else images
+            ps = self.patch_size
+            all_p, all_m, all_s = [], [], []
+            for img in imgs:
+                img = img.convert("RGB")
+                w, h = img.size
+                th, tw = get_size(h, w, ps, mnp)
+ t = self._post(img.resize((tw, th), Image.BICUBIC))
220
+ gh, gw = th // ps, tw // ps
221
+ n = gh * gw
222
+ # [C, gh, ps, gw, ps] -> [gh, gw, C, ps, ps] -> [n, C*ps*ps]
223
+ patches = t.reshape(3, gh, ps, gw, ps).permute(1,3,0,2,4).reshape(n, 3*ps*ps)
224
+ padded = torch.zeros(mnp, ps*ps*3); padded[:n] = patches
225
+ mask = torch.zeros(mnp, dtype=torch.bool); mask[:n] = True
226
+ all_p.append(padded); all_m.append(mask)
227
+ all_s.append(torch.tensor([gh, gw]))
228
+ result["pixel_values"] = torch.stack(all_p)
229
+ result["pixel_attention_mask"] = torch.stack(all_m)
230
+ result["spatial_shapes"] = torch.stack(all_s)
231
+ if text is not None:
232
+ if self.tokenizer is None:
233
+ raise RuntimeError("Tokenizer not initialized.")
234
+ result["input_ids"] = self.tokenizer([text] if isinstance(text, str) else text)
235
+ return result
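The patchification loop above flattens each resized image into a padded NaFlex patch sequence plus a validity mask. A minimal standalone sketch of that reshape/pad logic (plain torch, no PIL or resizing; `patchify` is a hypothetical name for illustration):

```python
import torch

def patchify(t, ps, max_patches):
    """Flatten a [3, H, W] tensor (H, W multiples of ps) into padded patch rows."""
    _, H, W = t.shape
    gh, gw = H // ps, W // ps
    n = gh * gw
    # [C, gh, ps, gw, ps] -> [gh, gw, C, ps, ps] -> [n, C*ps*ps]
    patches = t.reshape(3, gh, ps, gw, ps).permute(1, 3, 0, 2, 4).reshape(n, 3 * ps * ps)
    padded = torch.zeros(max_patches, 3 * ps * ps)
    padded[:n] = patches            # real patches first, zero padding after
    mask = torch.zeros(max_patches, dtype=torch.bool)
    mask[:n] = True                 # marks which rows are valid patches
    return padded, mask, (gh, gw)

x = torch.randn(3, 32, 48)          # a 2x3 grid of 16px patches
pv, m, shape = patchify(x, 16, 256)
print(pv.shape, int(m.sum()), shape)  # torch.Size([256, 768]) 6 (2, 3)
```

The `(gh, gw)` pair is what the processor stores in `spatial_shapes`, so the model can re-interpolate positional embeddings to the original grid.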
raon_vision_encoder/__init__.py ADDED
File without changes
raon_vision_encoder/clip.py ADDED
@@ -0,0 +1,287 @@
+# Originally from OpenCLIP (https://github.com/mlfoundations/open_clip)
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+from functools import partial
+
+from .timm_model import TimmModel
+from .transformer import (
+    LayerNormFp32,
+    LayerNorm,
+    QuickGELU,
+    TextTransformer,
+    text_global_pool,
+)
+from .utils import to_2tuple
+
+
+@dataclass
+class CLIPVisionCfg:
+    layers: Union[Tuple[int, int, int, int], int] = 12
+    width: int = 768
+    head_width: int = 64
+    mlp_ratio: float = 4.0
+    patch_size: int = 16
+    image_size: Union[Tuple[int, int], int] = 224
+
+    ls_init_value: Optional[float] = None
+    patch_dropout: float = 0.0
+    attentional_pool: bool = False
+    attn_pooler_queries: int = 256
+    attn_pooler_heads: int = 8
+    no_ln_pre: bool = False
+    pos_embed_type: str = "learnable"
+    final_ln_after_pool: bool = False
+    pool_type: str = "tok"
+    output_tokens: bool = False
+    act_kwargs: Optional[dict] = None
+    norm_kwargs: Optional[dict] = None
+
+    block_type: Optional[str] = None
+    qk_norm: bool = False
+    scaled_cosine_attn: bool = False
+    scale_heads: bool = False
+    scale_attn_inner: bool = False
+    scale_attn: bool = False
+    scale_fc: bool = False
+
+    timm_model_name: Optional[str] = None
+    timm_model_pretrained: bool = False
+    timm_pool: str = "avg"
+    timm_proj: str = "linear"
+    timm_proj_bias: bool = False
+    timm_drop: float = 0.0
+    timm_drop_path: Optional[float] = None
+    timm_use_rope: bool = False
+    timm_rope_keep_ape: bool = False
+    timm_dynamic_img_size: bool = False
+    timm_norm_pre: bool = False
+
+
+@dataclass
+class CLIPTextCfg:
+    context_length: int = 77
+    vocab_size: int = 49408
+    hf_tokenizer_name: Optional[str] = None
+    tokenizer_mode: Optional[str] = None
+    tokenizer_kwargs: Optional[dict] = None
+
+    width: int = 512
+    heads: int = 8
+    layers: int = 12
+    mlp_ratio: float = 4.0
+    ls_init_value: Optional[float] = None
+    embed_cls: bool = False
+    pad_id: int = 0
+    eos_id: int = 2
+    no_causal_mask: bool = False
+    final_ln_after_pool: bool = False
+    pool_type: str = "argmax"
+    proj_bias: bool = False
+    proj_type: str = "linear"
+    output_tokens: bool = False
+    act_kwargs: Optional[dict] = None
+    norm_kwargs: Optional[dict] = None
+
+    block_type: Optional[str] = None
+    qk_norm: bool = False
+    scaled_cosine_attn: bool = False
+    scale_heads: bool = False
+    scale_attn_inner: bool = False
+    scale_attn: bool = False
+    scale_fc: bool = False
+
+    hf_model_name: Optional[str] = None
+    hf_model_pretrained: bool = True
+    hf_proj_type: str = "mlp"
+    hf_pooler_type: str = "mean_pooler"
+
+
+def get_cast_dtype(precision: str):
+    cast_dtype = None
+    if precision == "bf16":
+        cast_dtype = torch.bfloat16
+    elif precision == "fp16":
+        cast_dtype = torch.float16
+    return cast_dtype
+
+
+def _build_vision_tower(
+    embed_dim: int,
+    vision_cfg: CLIPVisionCfg,
+    quick_gelu: bool = False,
+    cast_dtype: Optional[torch.dtype] = None,
+):
+    if isinstance(vision_cfg, dict):
+        vision_cfg = CLIPVisionCfg(**vision_cfg)
+
+    if not vision_cfg.timm_model_name:
+        raise ValueError(
+            "Only TimmModel-based vision towers are supported in raon-vision-encoder. "
+            "Please set timm_model_name in vision_cfg."
+        )
+
+    visual = TimmModel(
+        vision_cfg.timm_model_name,
+        pretrained=vision_cfg.timm_model_pretrained,
+        pool=vision_cfg.timm_pool,
+        proj=vision_cfg.timm_proj,
+        proj_bias=vision_cfg.timm_proj_bias,
+        drop=vision_cfg.timm_drop,
+        drop_path=vision_cfg.timm_drop_path,
+        patch_drop=vision_cfg.patch_dropout if vision_cfg.patch_dropout > 0 else None,
+        init_values=vision_cfg.ls_init_value,
+        qk_norm=vision_cfg.qk_norm,
+        use_rope=vision_cfg.timm_use_rope,
+        rope_keep_ape=vision_cfg.timm_rope_keep_ape,
+        dynamic_img_size=vision_cfg.timm_dynamic_img_size,
+        norm_pre=vision_cfg.timm_norm_pre,
+        embed_dim=embed_dim,
+        image_size=vision_cfg.image_size,
+        output_tokens=vision_cfg.output_tokens,
+    )
+
+    return visual
+
+
+def _build_text_tower(
+    embed_dim: int,
+    text_cfg: CLIPTextCfg,
+    quick_gelu: bool = False,
+    cast_dtype: Optional[torch.dtype] = None,
+):
+    if isinstance(text_cfg, dict):
+        text_cfg = CLIPTextCfg(**text_cfg)
+
+    act_layer = QuickGELU if quick_gelu else nn.GELU
+    norm_layer = (
+        LayerNormFp32 if cast_dtype in (torch.float16, torch.bfloat16) else LayerNorm
+    )
+    if text_cfg.norm_kwargs:
+        norm_layer = partial(norm_layer, **text_cfg.norm_kwargs)
+    if text_cfg.act_kwargs is not None:
+        act_layer = partial(act_layer, **text_cfg.act_kwargs)
+
+    text = TextTransformer(
+        context_length=text_cfg.context_length,
+        vocab_size=text_cfg.vocab_size,
+        width=text_cfg.width,
+        heads=text_cfg.heads,
+        layers=text_cfg.layers,
+        mlp_ratio=text_cfg.mlp_ratio,
+        ls_init_value=text_cfg.ls_init_value,
+        output_dim=embed_dim,
+        embed_cls=text_cfg.embed_cls,
+        no_causal_mask=text_cfg.no_causal_mask,
+        pad_id=text_cfg.pad_id,
+        eos_id=text_cfg.eos_id,
+        pool_type=text_cfg.pool_type,
+        proj_type=text_cfg.proj_type,
+        proj_bias=text_cfg.proj_bias,
+        output_tokens=text_cfg.output_tokens,
+        act_layer=act_layer,
+        norm_layer=norm_layer,
+        block_type=text_cfg.block_type,
+        qk_norm=text_cfg.qk_norm,
+        scaled_cosine_attn=text_cfg.scaled_cosine_attn,
+        scale_heads=text_cfg.scale_heads,
+        scale_attn_inner=text_cfg.scale_attn_inner,
+        scale_attn=text_cfg.scale_attn,
+        scale_fc=text_cfg.scale_fc,
+    )
+    return text
+
+
+class CustomTextCLIP(nn.Module):
+    output_dict: torch.jit.Final[bool]
+
+    def __init__(
+        self,
+        embed_dim: int,
+        vision_cfg: CLIPVisionCfg,
+        text_cfg: CLIPTextCfg,
+        quick_gelu: bool = False,
+        init_logit_scale: float = np.log(1 / 0.07),
+        init_logit_bias: Optional[float] = None,
+        nonscalar_logit_scale: bool = False,
+        cast_dtype: Optional[torch.dtype] = None,
+        output_dict: bool = False,
+    ):
+        super().__init__()
+        self.output_dict = output_dict
+        self.visual = _build_vision_tower(embed_dim, vision_cfg, quick_gelu, cast_dtype)
+        self.text = _build_text_tower(embed_dim, text_cfg, quick_gelu, cast_dtype)
+        self.context_length = self.text.context_length
+        self.vocab_size = self.text.vocab_size
+
+        lshape = [1] if nonscalar_logit_scale else []
+        self.logit_scale = nn.Parameter(torch.ones(lshape) * init_logit_scale)
+        if init_logit_bias is not None:
+            self.logit_bias = nn.Parameter(torch.ones(lshape) * init_logit_bias)
+        else:
+            self.logit_bias = None
+
+    def encode_image(
+        self, pixel_values, normalize: bool = False, pixel_attention_mask=None, spatial_shapes=None
+    ):
+        kwargs = {}
+        if pixel_attention_mask is not None:
+            kwargs["patch_valid_mask"] = pixel_attention_mask
+        if spatial_shapes is not None:
+            kwargs["spatial_shapes"] = spatial_shapes
+        features = self.visual(pixel_values, **kwargs) if kwargs else self.visual(pixel_values)
+        return F.normalize(features, dim=-1) if normalize else features
+
+    def encode_text(self, input_ids, normalize: bool = False):
+        features = self.text(input_ids)
+        return F.normalize(features, dim=-1) if normalize else features
+
+    def get_logits(self, image, text):
+        image_features = self.encode_image(pixel_values=image, normalize=True)
+        text_features = self.encode_text(input_ids=text, normalize=True)
+        image_logits = self.logit_scale.exp() * image_features @ text_features.T
+        if self.logit_bias is not None:
+            image_logits += self.logit_bias
+        text_logits = image_logits.T
+        return image_logits, text_logits
+
+    def forward(
+        self, image=None, text=None, patch_valid_mask=None, spatial_shapes=None
+    ):
+        image_features = (
+            self.encode_image(
+                pixel_values=image,
+                normalize=True,
+                pixel_attention_mask=patch_valid_mask,
+                spatial_shapes=spatial_shapes,
+            )
+            if image is not None
+            else None
+        )
+        text_features = (
+            self.encode_text(input_ids=text, normalize=True) if text is not None else None
+        )
+
+        if self.output_dict:
+            out_dict = {
+                "image_features": image_features,
+                "text_features": text_features,
+                "logit_scale": self.logit_scale.exp(),
+            }
+            if self.logit_bias is not None:
+                out_dict["logit_bias"] = self.logit_bias
+            return out_dict
+
+        if self.logit_bias is not None:
+            return (
+                image_features,
+                text_features,
+                self.logit_scale.exp(),
+                self.logit_bias,
+            )
+        return image_features, text_features, self.logit_scale.exp()
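`get_logits` computes `logit_scale.exp() * img @ txt.T` and adds `logit_bias` when present; for SigLIP-style training the per-pair match probability is then the sigmoid of that logit. A minimal sketch with synthetic tensors (the scale and bias values here are illustrative assumptions, not the checkpoint's actual parameters):

```python
import torch
import torch.nn.functional as F

# Hypothetical normalized embeddings: 2 images, 3 texts, dim 4.
img = F.normalize(torch.randn(2, 4), dim=-1)
txt = F.normalize(torch.randn(3, 4), dim=-1)

logit_scale = torch.tensor(4.6052)  # stored as a log (cf. init np.log(1/0.07)), exp'd at use
logit_bias = torch.tensor(-10.0)    # SigLIP-style bias; illustrative value

# Mirrors CustomTextCLIP.get_logits: scaled cosine similarity plus bias.
image_logits = logit_scale.exp() * img @ txt.T + logit_bias  # [2, 3]
text_logits = image_logits.T                                  # [3, 2]
probs = torch.sigmoid(image_logits)  # per-pair match probability
print(probs.shape)  # torch.Size([2, 3])
```

Because each image-text pair gets an independent sigmoid, rows do not sum to 1; use a softmax over `image_logits` instead if mutually exclusive labels are wanted.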
raon_vision_encoder/constants.py ADDED
@@ -0,0 +1,5 @@
+# Originally from OpenCLIP (https://github.com/mlfoundations/open_clip)
+OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
+INCEPTION_MEAN = (0.5, 0.5, 0.5)
+INCEPTION_STD = (0.5, 0.5, 0.5)
raon_vision_encoder/timm_model.py ADDED
@@ -0,0 +1,397 @@
+# Originally from OpenCLIP (https://github.com/mlfoundations/open_clip)
+
+import logging
+import types
+from collections import OrderedDict
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+try:
+    import timm
+    from timm.layers import RotAttentionPool2d
+    from timm.layers import AttentionPool2d as AbsAttentionPool2d
+    from timm.layers import Mlp, to_2tuple
+    from timm.layers import AttentionRope, RotaryEmbeddingCat
+except ImportError:
+    timm = None
+
+
+class TimmModel(nn.Module):
+    """timm model adapter"""
+
+    def __init__(
+        self,
+        model_name: str,
+        embed_dim: int,
+        image_size: Union[int, Tuple[int, int]] = 224,
+        pool: str = "avg",
+        proj: str = "linear",
+        proj_bias: bool = False,
+        drop: float = 0.0,
+        drop_path: Optional[float] = None,
+        patch_drop: Optional[float] = None,
+        init_values: Optional[float] = None,
+        qk_norm: bool = False,
+        use_rope: bool = False,
+        rope_keep_ape: bool = False,
+        dynamic_img_size: bool = False,
+        norm_pre: bool = False,
+        pretrained: bool = False,
+        output_tokens: bool = False,
+    ):
+        super().__init__()
+        if timm is None:
+            raise RuntimeError(
+                "Please install the latest timm (`pip install timm`) to use timm based models."
+            )
+        self.image_size = to_2tuple(image_size)
+        self.output_tokens = output_tokens
+
+        timm_kwargs = {}
+        if drop_path is not None:
+            timm_kwargs["drop_path_rate"] = drop_path
+        if patch_drop is not None:
+            timm_kwargs["patch_drop_rate"] = patch_drop
+        if init_values is not None:
+            timm_kwargs["init_values"] = init_values
+        if qk_norm:
+            timm_kwargs["qk_norm"] = True
+        if dynamic_img_size:
+            timm_kwargs["dynamic_img_size"] = True
+        if use_rope:
+
+            class _AttentionRopeNoPrefix(AttentionRope):
+                """AttentionRope with num_prefix_tokens=0 for models without cls token."""
+
+                def __init__(self, *args, **kwargs):
+                    kwargs["num_prefix_tokens"] = 0
+                    super().__init__(*args, **kwargs)
+
+            timm_kwargs["attn_layer"] = _AttentionRopeNoPrefix
+            if not rope_keep_ape:
+                timm_kwargs["pos_embed"] = "none"
+
+        custom_pool = pool in ("abs_attn", "rot_attn")
+        if proj:
+            assert proj in ("linear", "mlp", "none")
+        extra_proj = proj in ("linear", "mlp")
+        if not extra_proj and not custom_pool:
+            proj_dim = 0 if proj == "none" else embed_dim
+            self.trunk = timm.create_model(
+                model_name,
+                num_classes=proj_dim,
+                global_pool=pool,
+                pretrained=pretrained,
+                **timm_kwargs,
+            )
+            prev_chs = embed_dim
+        else:
+            self.trunk = timm.create_model(
+                model_name,
+                pretrained=pretrained,
+                **timm_kwargs,
+            )
+            feat_size = self.trunk.default_cfg.get("pool_size", None)
+            feature_ndim = 1 if not feat_size else 2
+            if custom_pool:
+                assert feature_ndim == 2
+                self.trunk.reset_classifier(0, global_pool="")
+            else:
+                reset_kwargs = dict(global_pool=pool) if pool else {}
+                self.trunk.reset_classifier(0, **reset_kwargs)
+            prev_chs = self.trunk.num_features
+
+        head_layers = OrderedDict()
+
+        if pool == "abs_attn":
+            head_layers["pool"] = AbsAttentionPool2d(
+                prev_chs, feat_size=feat_size, out_features=embed_dim
+            )
+            prev_chs = embed_dim
+        elif pool == "rot_attn":
+            head_layers["pool"] = RotAttentionPool2d(prev_chs, out_features=embed_dim)
+            prev_chs = embed_dim
+
+        if proj == "linear":
+            head_layers["drop"] = nn.Dropout(drop)
+            head_layers["proj"] = nn.Linear(prev_chs, embed_dim, bias=proj_bias)
+        elif proj == "mlp":
+            head_layers["mlp"] = Mlp(
+                prev_chs,
+                2 * embed_dim,
+                embed_dim,
+                drop=(drop, 0),
+                bias=(True, proj_bias),
+            )
+
+        self.head = nn.Sequential(head_layers)
+
+        if (
+            norm_pre
+            and hasattr(self.trunk, "norm_pre")
+            and isinstance(self.trunk.norm_pre, nn.Identity)
+        ):
+            self.trunk.norm_pre = nn.LayerNorm(self.trunk.embed_dim)
+            logging.info(
+                f"Replaced norm_pre Identity with LayerNorm({self.trunk.embed_dim})"
+            )
+
+        self._has_rope = use_rope
+        if use_rope:
+            self._setup_rope()
+
+    def _setup_rope(self):
+        """Inject 2D Rotary Position Embedding into the timm trunk."""
+        num_heads = self.trunk.blocks[0].attn.num_heads
+        head_dim = self.trunk.embed_dim // num_heads
+
+        self.trunk.patch_embed.strict_img_size = False
+
+        self.rope = RotaryEmbeddingCat(
+            dim=head_dim,
+            max_res=max(self.image_size),
+            in_pixels=True,
+        )
+
+        def _block_forward_rope(block_self, x, rope=None, attn_mask=None):
+            x = x + block_self.drop_path1(
+                block_self.ls1(
+                    block_self.attn(block_self.norm1(x), rope=rope, attn_mask=attn_mask)
+                )
+            )
+            x = x + block_self.drop_path2(
+                block_self.ls2(block_self.mlp(block_self.norm2(x)))
+            )
+            return x
+
+        for blk in self.trunk.blocks:
+            blk.forward = types.MethodType(_block_forward_rope, blk)
+
+        timm_model_ref = self
+        _num_prefix = getattr(self.trunk, "num_prefix_tokens", 0)
+
+        def _forward_features_rope(trunk_self, x, attn_mask=None):
+            from torch.utils.checkpoint import checkpoint
+            from timm.layers import resample_abs_pos_embed
+
+            ps = trunk_self.patch_embed.patch_size
+            grid_shape = [x.shape[2] // ps[0], x.shape[3] // ps[1]]
+
+            x = trunk_self.patch_embed(x)
+            if x.ndim == 4:
+                x = x.reshape(x.shape[0], -1, x.shape[-1])
+            if hasattr(trunk_self, "pos_embed") and trunk_self.pos_embed is not None:
+                if x.shape[1] != trunk_self.pos_embed.shape[1]:
+                    x = x + resample_abs_pos_embed(
+                        trunk_self.pos_embed, grid_shape, num_prefix_tokens=_num_prefix
+                    )
+                else:
+                    x = x + trunk_self.pos_embed
+            x = trunk_self.pos_drop(x)
+            x = trunk_self.norm_pre(x)
+
+            rot_pos_embed = timm_model_ref.rope.get_embed(shape=grid_shape)
+
+            _sdpa_mask = None
+            if attn_mask is not None:
+                _sdpa_mask = torch.zeros_like(attn_mask, dtype=x.dtype)
+                _sdpa_mask.masked_fill_(~attn_mask, float("-inf"))
+                _sdpa_mask = _sdpa_mask.unsqueeze(1).unsqueeze(2)
+
+            for blk in trunk_self.blocks:
+                if trunk_self.grad_checkpointing and not torch.jit.is_scripting():
+                    x = checkpoint(
+                        blk,
+                        x,
+                        rope=rot_pos_embed,
+                        attn_mask=_sdpa_mask,
+                        use_reentrant=False,
+                    )
+                else:
+                    x = blk(x, rope=rot_pos_embed, attn_mask=_sdpa_mask)
+
+            x = trunk_self.norm(x)
+            return x
+
+        self.trunk.forward_features = types.MethodType(
+            _forward_features_rope, self.trunk
+        )
+
+    def _setup_dynamic_pos_embed(self):
+        """Patch forward_features for variable-resolution pos_embed interpolation (non-RoPE)."""
+        self.trunk.patch_embed.strict_img_size = False
+        _num_prefix = getattr(self.trunk, "num_prefix_tokens", 0)
+
+        def _forward_features_dynamic(trunk_self, x, patch_valid_mask=None):
+            from torch.utils.checkpoint import checkpoint
+            from timm.layers import resample_abs_pos_embed
+
+            ps = trunk_self.patch_embed.patch_size
+            grid_shape = [x.shape[2] // ps[0], x.shape[3] // ps[1]]
+
+            x = trunk_self.patch_embed(x)
+            if x.ndim == 4:
+                x = x.reshape(x.shape[0], -1, x.shape[-1])
+            if hasattr(trunk_self, "pos_embed") and trunk_self.pos_embed is not None:
+                if x.shape[1] != trunk_self.pos_embed.shape[1]:
+                    x = x + resample_abs_pos_embed(
+                        trunk_self.pos_embed, grid_shape, num_prefix_tokens=_num_prefix
+                    )
+                else:
+                    x = x + trunk_self.pos_embed
+            x = trunk_self.pos_drop(x)
+            x = trunk_self.norm_pre(x)
+
+            _sdpa_mask = None
+            if patch_valid_mask is not None:
+                _sdpa_mask = torch.zeros_like(patch_valid_mask, dtype=x.dtype)
+                _sdpa_mask.masked_fill_(~patch_valid_mask, float("-inf"))
+                _sdpa_mask = _sdpa_mask.unsqueeze(1).unsqueeze(2)
+
+            for blk in trunk_self.blocks:
+                if trunk_self.grad_checkpointing and not torch.jit.is_scripting():
+                    if _sdpa_mask is not None:
+                        x = checkpoint(
+                            blk, x, attn_mask=_sdpa_mask, use_reentrant=False
+                        )
+                    else:
+                        x = checkpoint(blk, x, use_reentrant=False)
+                else:
+                    x = blk(x, attn_mask=_sdpa_mask)
+
+            x = trunk_self.norm(x)
+            return x
+
+        self.trunk.forward_features = types.MethodType(
+            _forward_features_dynamic, self.trunk
+        )
+
+    def _setup_1d_forward(self):
+        """Patch forward_features for NaFlex 1D mode (SigLIP2 style)."""
+        _num_prefix = getattr(self.trunk, "num_prefix_tokens", 0)
+
+        def _forward_features_1d(
+            trunk_self, x, patch_valid_mask=None, spatial_shapes=None
+        ):
+            from torch.utils.checkpoint import checkpoint
+
+            conv = trunk_self.patch_embed.proj
+            D = conv.weight.shape[0]
+            x = torch.nn.functional.linear(
+                x.to(conv.weight.dtype), conv.weight.reshape(D, -1), conv.bias
+            )
+
+            if (
+                hasattr(trunk_self, "pos_embed")
+                and trunk_self.pos_embed is not None
+                and spatial_shapes is not None
+            ):
+                pos_embed = trunk_self.pos_embed
+                base_n = pos_embed.shape[1]
+                base_grid = int(base_n**0.5)
+                pos_2d = (
+                    pos_embed.reshape(1, base_grid, base_grid, -1)
+                    .permute(0, 3, 1, 2)
+                    .float()
+                )
+
+                B, sl, D_emb = x.shape
+                pos_resized = torch.zeros(B, sl, D_emb, device=x.device, dtype=x.dtype)
+
+                for i in range(B):
+                    gh, gw = spatial_shapes[i].tolist()
+                    pe = torch.nn.functional.interpolate(
+                        pos_2d, size=(gh, gw), mode="bilinear", align_corners=False
+                    )
+                    pe = pe.squeeze(0).permute(1, 2, 0).reshape(gh * gw, -1).to(x.dtype)
+                    n_patches = gh * gw
+                    pos_resized[i, :n_patches] = pe
+                    if n_patches < sl:
+                        pos_resized[i, n_patches:] = pe[0]
+
+                x = x + pos_resized
+            elif hasattr(trunk_self, "pos_embed") and trunk_self.pos_embed is not None:
+                x = x + trunk_self.pos_embed
+
+            x = trunk_self.pos_drop(x)
+            x = trunk_self.norm_pre(x)
+
+            _sdpa_mask = None
+            if patch_valid_mask is not None:
+                _sdpa_mask = torch.zeros_like(patch_valid_mask, dtype=x.dtype)
+                _sdpa_mask.masked_fill_(~patch_valid_mask, float("-inf"))
+                _sdpa_mask = _sdpa_mask.unsqueeze(1).unsqueeze(2)
+
+            for blk in trunk_self.blocks:
+                if trunk_self.grad_checkpointing and not torch.jit.is_scripting():
+                    if _sdpa_mask is not None:
+                        x = checkpoint(
+                            blk, x, attn_mask=_sdpa_mask, use_reentrant=False
+                        )
+                    else:
+                        x = checkpoint(blk, x, use_reentrant=False)
+                else:
+                    x = blk(x, attn_mask=_sdpa_mask)
+
+            x = trunk_self.norm(x)
+            return x
+
+        self.trunk._forward_features_1d = types.MethodType(
+            _forward_features_1d, self.trunk
+        )
+        self._has_1d_forward = True
+
+    def forward_patch_features(self, x):
+        """Forward pass returning per-patch features (before pooling/projection)."""
+        return self.trunk.forward_features(x)
+
+    def forward(self, x, patch_valid_mask=None, spatial_shapes=None):
+        if spatial_shapes is not None and getattr(self, "_has_1d_forward", False):
+            patch_features = self.trunk._forward_features_1d(
+                x, patch_valid_mask=patch_valid_mask, spatial_shapes=spatial_shapes
+            )
+        elif patch_valid_mask is not None and self._has_rope:
+            patch_features = self.trunk.forward_features(x, attn_mask=patch_valid_mask)
+        elif patch_valid_mask is not None:
+            patch_features = self.trunk.forward_features(
+                x, patch_valid_mask=patch_valid_mask
+            )
+        else:
+            patch_features = self.trunk.forward_features(x)
+        if patch_valid_mask is not None:
+            mask_f = patch_valid_mask.unsqueeze(-1).to(patch_features.dtype)
+            patch_features = patch_features * mask_f
+        self._cached_patch_features = patch_features
+        if (
+            patch_valid_mask is not None
+            and getattr(self.trunk, "global_pool", "") == "avg"
+        ):
+            pooled = patch_features.sum(dim=1) / mask_f.sum(dim=1).clamp(min=1)
+            pooled = (
+                self.trunk.fc_norm(pooled) if hasattr(self.trunk, "fc_norm") else pooled
+            )
+        elif (
+            patch_valid_mask is not None
+            and getattr(self.trunk, "attn_pool", None) is not None
+        ):
+            attn_mask = torch.zeros(
+                patch_valid_mask.shape,
+                dtype=patch_features.dtype,
+                device=patch_features.device,
+            )
+            attn_mask.masked_fill_(~patch_valid_mask.bool(), float("-inf"))
+            attn_mask = attn_mask.unsqueeze(1).unsqueeze(1)
+            pooled = self.trunk.attn_pool(patch_features, attn_mask=attn_mask)
+            pooled = (
+                self.trunk.fc_norm(pooled) if hasattr(self.trunk, "fc_norm") else pooled
+            )
+        else:
+            pooled = self.trunk.forward_head(patch_features)
+        pooled = self.head(pooled)
+        if self.output_tokens:
+            return pooled, patch_features
+        return pooled
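When `global_pool == "avg"` and a validity mask is supplied, `TimmModel.forward` pools by zeroing padded patches and dividing the sum by the count of valid patches, so padding never dilutes the mean. A minimal sketch of that masked average pooling with hand-picked values:

```python
import torch

# [1, 3, D]: two valid patches plus one padded row that must be ignored.
patch_features = torch.tensor([[[1.0, 2.0], [3.0, 4.0], [9.0, 9.0]]])
patch_valid_mask = torch.tensor([[True, True, False]])

mask_f = patch_valid_mask.unsqueeze(-1).to(patch_features.dtype)
patch_features = patch_features * mask_f                 # zero out padded patches
pooled = patch_features.sum(dim=1) / mask_f.sum(dim=1).clamp(min=1)
print(pooled)  # tensor([[2., 3.]])
```

The `clamp(min=1)` guards against division by zero for an all-padding sequence; a naive `patch_features.mean(dim=1)` would instead give [[4.33, 5.0]] here, contaminated by the pad row.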
raon_vision_encoder/tokenizer.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Originally from OpenCLIP (https://github.com/mlfoundations/open_clip)
2
+
3
+ import html
4
+ import os
5
+ import string
6
+ from typing import List, Optional, Union
7
+ import warnings
8
+
9
+ try:
10
+ import ftfy
11
+ except ImportError:
12
+ ftfy = None
13
+ import torch
14
+
15
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
16
+
17
+ DEFAULT_CONTEXT_LENGTH = 77
18
+
19
+
20
+ def basic_clean(text):
21
+ if ftfy is not None:
22
+ text = ftfy.fix_text(text)
23
+ else:
24
+ text
25
+ text = html.unescape(html.unescape(text))
26
+ return text.strip()
27
+
28
+
29
+ def whitespace_clean(text):
30
+ text = " ".join(text.split())
31
+ text = text.strip()
32
+ return text
33
+
34
+
35
+ def _clean_canonicalize(x):
36
+ return canonicalize_text(basic_clean(x))
37
+
38
+
39
+ def _clean_lower(x):
40
+ return whitespace_clean(basic_clean(x)).lower()
41
+
42
+
43
+ def _clean_whitespace(x):
44
+ return whitespace_clean(basic_clean(x))
45
+
46
+
47
+ def get_clean_fn(type: str):
48
+ if type == "canonicalize":
49
+ return _clean_canonicalize
50
+ elif type == "lower":
51
+ return _clean_lower
52
+ elif type == "whitespace":
53
+ return _clean_whitespace
54
+ else:
55
+ assert False, f"Invalid clean function ({type})."
56
+
57
+
58
+ def canonicalize_text(
59
+ text,
60
+ *,
61
+ keep_punctuation_exact_string=None,
62
+ trans_punctuation: dict = str.maketrans("", "", string.punctuation),
63
+ ):
64
+ """Returns canonicalized `text` (lowercase and punctuation removed)."""
65
+ text = text.replace("_", " ")
66
+ if keep_punctuation_exact_string:
67
+ text = keep_punctuation_exact_string.join(
68
+ part.translate(trans_punctuation)
69
+ for part in text.split(keep_punctuation_exact_string)
70
+ )
71
+ else:
72
+ text = text.translate(trans_punctuation)
73
+ text = text.lower()
74
+ text = " ".join(text.split())
75
+ return text.strip()
76
+
77
+
78
+ class HFTokenizer:
79
+ """HuggingFace tokenizer wrapper with support for custom tokenization modes"""
80
+
81
+ def __init__(
82
+ self,
83
+ tokenizer_name: str,
84
+ context_length: Optional[int] = DEFAULT_CONTEXT_LENGTH,
85
+ clean: str = "whitespace",
86
+ strip_sep_token: bool = False,
87
+ language: Optional[str] = None,
88
+ cache_dir: Optional[str] = None,
89
+ tokenizer_mode: Optional[str] = None,
90
+ **kwargs,
91
+ ):
92
+ self.tokenizer_mode = tokenizer_mode or ""
93
+ self.context_length = context_length
94
+ self.clean_fn = get_clean_fn(clean)
95
+        self.strip_sep_token = strip_sep_token
+
+        from transformers import AutoTokenizer
+
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            tokenizer_name, cache_dir=cache_dir, **kwargs
+        )
+
+        set_lang_fn = getattr(self.tokenizer, "set_src_lang_special_tokens", None)
+        if callable(set_lang_fn):
+            self.set_lang_fn = set_lang_fn
+        if language is not None:
+            self.set_language(language)
+
+    def save_pretrained(self, dest):
+        self.tokenizer.save_pretrained(dest)
+
+    def __call__(
+        self, texts: Union[str, List[str]], context_length: Optional[int] = None
+    ) -> torch.Tensor:
+        if isinstance(texts, str):
+            texts = [texts]
+
+        context_length = context_length or self.context_length
+        assert context_length, (
+            "Please set a valid context length in class init or call."
+        )
+
+        texts = [self.clean_fn(text) for text in texts]
+
+        if self.tokenizer_mode == "clips":
+            return self._clips_tokenize(texts, context_length)
+        else:
+            output = self.tokenizer(
+                texts,
+                return_tensors="pt",
+                max_length=context_length,
+                padding="max_length",
+                truncation=True,
+            )
+            input_ids = output.input_ids
+
+            if self.strip_sep_token:
+                input_ids = torch.where(
+                    input_ids == self.tokenizer.sep_token_id,
+                    torch.zeros_like(input_ids),
+                    input_ids,
+                )
+
+            return input_ids
+
+    def set_language(self, src_lang):
+        if hasattr(self, "set_lang_fn"):
+            self.set_lang_fn(src_lang)
+        else:
+            warnings.warn("Cannot set language for the tokenizer.")
+
+    def _clips_tokenize(self, texts: List[str], context_length: int) -> torch.Tensor:
+        encoded_outputs = self.tokenizer(
+            texts,
+            add_special_tokens=False,
+            padding=False,
+            truncation=False,
+            return_tensors=None,
+        )
+
+        encoded = []
+        for tokens in encoded_outputs["input_ids"]:
+            tokens = tokens[: context_length - 3]
+            tokens = (
+                [self.tokenizer.bos_token_id] + tokens + [self.tokenizer.eos_token_id]
+            )
+            encoded.append(tokens)
+
+        result = torch.zeros(len(encoded), context_length, dtype=torch.long)
+        for i, tokens in enumerate(encoded):
+            padded_tokens = self._pad_and_add_class_token(
+                tokens,
+                max_length=context_length,
+                pad_token_id=self.tokenizer.pad_token_id,
+                cls_token_id=self.tokenizer.cls_token_id,
+            )
+            result[i, : len(padded_tokens)] = torch.tensor(padded_tokens)
+
+        return result
+
+    def _pad_and_add_class_token(
+        self,
+        tokens: List[int],
+        max_length: int,
+        pad_token_id: int = 0,
+        cls_token_id: int = 101,
+    ) -> List[int]:
+        if len(tokens) > max_length - 1:
+            tokens = tokens[: max_length - 1]
+        if len(tokens) < max_length - 1:
+            tokens = tokens + [pad_token_id] * (max_length - 1 - len(tokens))
+        tokens = tokens + [cls_token_id]
+        return tokens
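The CLIPS-style padding always reserves the final slot for the class token: sequences are truncated or right-padded to `max_length - 1` and the `[CLS]` id is appended last. A minimal pure-Python sketch of that helper (standalone function name is ours; no tokenizer required):

```python
from typing import List


def pad_and_add_class_token(
    tokens: List[int],
    max_length: int,
    pad_token_id: int = 0,
    cls_token_id: int = 101,
) -> List[int]:
    # Truncate or right-pad to max_length - 1, then append the CLS id last.
    if len(tokens) > max_length - 1:
        tokens = tokens[: max_length - 1]
    if len(tokens) < max_length - 1:
        tokens = tokens + [pad_token_id] * (max_length - 1 - len(tokens))
    return tokens + [cls_token_id]


# Short sequences are right-padded before the trailing CLS id.
print(pad_and_add_class_token([1, 2, 3], max_length=8))
# Long sequences are truncated so the CLS id still fits in the last slot.
print(pad_and_add_class_token(list(range(20)), max_length=8))
```

Note the output length is always exactly `max_length`, so the tensors built in `_clips_tokenize` never need further padding.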
raon_vision_encoder/transform.py ADDED
@@ -0,0 +1,44 @@
+# Originally from OpenCLIP (https://github.com/mlfoundations/open_clip)
+
+import math
+
+
+def get_image_size_for_max_num_patches(
+    image_height, image_width, patch_size, max_num_patches
+):
+    """Find a target image size that preserves aspect ratio within a patch budget.
+
+    Uses binary search to find the largest scale such that
+    ceil(h * scale / ps) * ceil(w * scale / ps) <= max_num_patches.
+
+    Args:
+        image_height: Original image height.
+        image_width: Original image width.
+        patch_size: Patch size (int).
+        max_num_patches: Maximum number of patches allowed.
+
+    Returns:
+        (target_h, target_w), both multiples of patch_size.
+    """
+    scale_min, scale_max = 1e-6, 100.0
+    eps = 1e-5
+    while (scale_max - scale_min) >= eps:
+        scale = (scale_min + scale_max) / 2
+        target_h = max(
+            patch_size, int(math.ceil(image_height * scale / patch_size) * patch_size)
+        )
+        target_w = max(
+            patch_size, int(math.ceil(image_width * scale / patch_size) * patch_size)
+        )
+        num_patches = (target_h // patch_size) * (target_w // patch_size)
+        if num_patches <= max_num_patches:
+            scale_min = scale
+        else:
+            scale_max = scale
+    target_h = max(
+        patch_size, int(math.ceil(image_height * scale_min / patch_size) * patch_size)
+    )
+    target_w = max(
+        patch_size, int(math.ceil(image_width * scale_min / patch_size) * patch_size)
+    )
+    return target_h, target_w
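The binary search converges on the largest feasible scale, so the returned size satisfies two invariants: both sides are multiples of `patch_size`, and the patch grid fits the budget. Since the function only needs `math`, a self-contained copy makes those invariants easy to check:

```python
import math


def get_image_size_for_max_num_patches(image_height, image_width, patch_size, max_num_patches):
    # Binary-search the resize scale so the rounded-up patch grid fits the budget.
    scale_min, scale_max = 1e-6, 100.0
    eps = 1e-5
    while (scale_max - scale_min) >= eps:
        scale = (scale_min + scale_max) / 2
        target_h = max(patch_size, int(math.ceil(image_height * scale / patch_size) * patch_size))
        target_w = max(patch_size, int(math.ceil(image_width * scale / patch_size) * patch_size))
        if (target_h // patch_size) * (target_w // patch_size) <= max_num_patches:
            scale_min = scale  # feasible: try a larger scale
        else:
            scale_max = scale  # over budget: shrink
    target_h = max(patch_size, int(math.ceil(image_height * scale_min / patch_size) * patch_size))
    target_w = max(patch_size, int(math.ceil(image_width * scale_min / patch_size) * patch_size))
    return target_h, target_w


h, w = get_image_size_for_max_num_patches(1024, 768, patch_size=16, max_num_patches=1024)
# Both sides are multiples of patch_size and the grid stays within budget.
assert h % 16 == 0 and w % 16 == 0
assert (h // 16) * (w // 16) <= 1024
print(h, w)
```

The `max(patch_size, ...)` clamp guarantees at least one patch per axis, so the search is always feasible for any `max_num_patches >= 1`.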
raon_vision_encoder/transformer.py ADDED
@@ -0,0 +1,627 @@
+# Originally from OpenCLIP (https://github.com/mlfoundations/open_clip)
+
+from collections import OrderedDict
+import math
+from typing import Callable, Optional, Type, Union
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+from torch.utils.checkpoint import checkpoint
+
+
+class LayerNormFp32(nn.LayerNorm):
+    """Subclass torch's LayerNorm to handle fp16 (by casting to float32 and back)."""
+
+    def forward(self, x: torch.Tensor):
+        orig_type = x.dtype
+        x = F.layer_norm(
+            x.to(torch.float32), self.normalized_shape, self.weight, self.bias, self.eps
+        )
+        return x.to(orig_type)
+
+
+class LayerNorm(nn.LayerNorm):
+    """Subclass torch's LayerNorm (with cast back to input dtype)."""
+
+    def forward(self, x: torch.Tensor):
+        orig_type = x.dtype
+        x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
+        return x.to(orig_type)
+
+
+class QuickGELU(nn.Module):
+    # NOTE: this is slower than nn.GELU or nn.SiLU and uses more GPU memory
+    def forward(self, x: torch.Tensor):
+        return x * torch.sigmoid(1.702 * x)
+
+
+class LayerScale(nn.Module):
+    def __init__(self, dim, init_values=1e-5, inplace=False):
+        super().__init__()
+        self.inplace = inplace
+        self.gamma = nn.Parameter(init_values * torch.ones(dim))
+
+    def forward(self, x):
+        return x.mul_(self.gamma) if self.inplace else x * self.gamma
+
+
+class Attention(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = True,
+        qk_norm: bool = False,
+        scaled_cosine: bool = False,
+        scale_heads: bool = False,
+        inner_norm: bool = False,
+        logit_scale_max: float = math.log(1.0 / 0.01),
+        norm_layer: Type[nn.Module] = LayerNormFp32,
+        attn_drop: float = 0.0,
+        proj_drop: float = 0.0,
+    ):
+        super().__init__()
+        assert not (scaled_cosine and qk_norm), (
+            "Cannot activate both scaled cosine and QK normalization"
+        )
+        self.scaled_cosine = scaled_cosine
+        self.scale_heads = scale_heads
+        assert dim % num_heads == 0, "dim should be divisible by num_heads"
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = self.head_dim**-0.5
+        self.logit_scale_max = logit_scale_max
+        self.use_fsdpa = hasattr(nn.functional, "scaled_dot_product_attention")
+
+        self.in_proj_weight = nn.Parameter(torch.randn((dim * 3, dim)) * self.scale)
+        if qkv_bias:
+            self.in_proj_bias = nn.Parameter(torch.zeros(dim * 3))
+        else:
+            self.in_proj_bias = None
+
+        if qk_norm:
+            self.ln_q = norm_layer(self.head_dim)
+            self.ln_k = norm_layer(self.head_dim)
+        else:
+            self.ln_q = nn.Identity()
+            self.ln_k = nn.Identity()
+
+        if self.scaled_cosine:
+            self.logit_scale = nn.Parameter(
+                torch.log(10 * torch.ones((num_heads, 1, 1)))
+            )
+        else:
+            self.logit_scale = None
+
+        self.attn_drop = nn.Dropout(attn_drop)
+
+        if self.scale_heads:
+            self.head_scale = nn.Parameter(torch.ones((num_heads, 1, 1)))
+        else:
+            self.head_scale = None
+
+        if inner_norm:
+            self.ln_inner = norm_layer(dim)
+        else:
+            self.ln_inner = nn.Identity()
+
+        self.out_proj = nn.Linear(dim, dim)
+        self.out_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x, attn_mask: Optional[torch.Tensor] = None):
+        N, L, C = x.shape
+        q, k, v = F.linear(x, self.in_proj_weight, self.in_proj_bias).chunk(3, dim=-1)
+        q = q.reshape(N, L, self.num_heads, -1).transpose(1, 2)
+        k = k.reshape(N, L, self.num_heads, -1).transpose(1, 2)
+        v = v.reshape(N, L, self.num_heads, -1).transpose(1, 2)
+
+        if attn_mask is not None:
+            if attn_mask.ndim == 3:
+                attn_mask = attn_mask.reshape(N, self.num_heads, L, L)
+            if attn_mask.dtype == torch.bool:
+                new_attn_mask = torch.zeros_like(attn_mask, dtype=q.dtype)
+                new_attn_mask.masked_fill_(attn_mask, float("-inf"))
+                attn_mask = new_attn_mask
+            else:
+                attn_mask = attn_mask.to(dtype=q.dtype)
+
+        if self.logit_scale is not None:
+            # q, k, v are (N, num_heads, L, head_dim); use matmul since
+            # torch.bmm only accepts 3-D tensors.
+            attn = torch.matmul(
+                F.normalize(q, dim=-1), F.normalize(k, dim=-1).transpose(-1, -2)
+            )
+            logit_scale = torch.clamp(self.logit_scale, max=self.logit_scale_max).exp()
+            attn = attn * logit_scale
+            if attn_mask is not None:
+                attn = attn + attn_mask
+            attn = attn.softmax(dim=-1)
+            attn = self.attn_drop(attn)
+            x = torch.matmul(attn, v)
+        else:
+            q = self.ln_q(q)
+            k = self.ln_k(k)
+            if self.use_fsdpa:
+                x = F.scaled_dot_product_attention(
+                    q,
+                    k,
+                    v,
+                    attn_mask=attn_mask,
+                    dropout_p=self.attn_drop.p if self.training else 0.0,
+                )
+            else:
+                q = q * self.scale
+                attn = torch.matmul(q, k.transpose(-1, -2))
+                if attn_mask is not None:
+                    attn += attn_mask
+                attn = attn.softmax(dim=-1)
+                attn = self.attn_drop(attn)
+                x = torch.matmul(attn, v)
+
+        if self.head_scale is not None:
+            x = x * self.head_scale
+        x = x.transpose(1, 2).reshape(N, L, C)
+        x = self.ln_inner(x)
+        x = self.out_proj(x)
+        x = self.out_drop(x)
+        return x
+
+
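The non-SDPA fallback in `Attention.forward` computes `softmax(q·kᵀ/√d + mask)·v` explicitly. The same arithmetic for a single head in plain Python (torch-free; the helper name `attention_scores` is ours, for illustration only):

```python
import math


def attention_scores(q, k, v, mask=None):
    # q, k, v: lists of equal-length vectors, one per position.
    # Returns one attended output vector per query position.
    d = len(q[0])
    out = []
    for i, qi in enumerate(q):
        # Scaled dot-product logits against every key.
        logits = [sum(a * b for a, b in zip(qi, kj)) / math.sqrt(d) for kj in k]
        if mask is not None:
            # Additive mask: -inf entries zero out the corresponding weight.
            logits = [l + m for l, m in zip(logits, mask[i])]
        mx = max(logits)
        exps = [math.exp(l - mx) for l in logits]  # exp(-inf) == 0.0
        z = sum(exps)
        weights = [e / z for e in exps]
        out.append(
            [sum(w * vj[t] for w, vj in zip(weights, v)) for t in range(len(v[0]))]
        )
    return out


q = k = v = [[1.0, 0.0], [0.0, 1.0]]
causal = [[0.0, float("-inf")], [0.0, 0.0]]
out = attention_scores(q, k, v, mask=causal)
assert out[0] == [1.0, 0.0]  # position 0 attends only to itself
```

This mirrors why the module converts boolean masks to additive `-inf` masks before the softmax: a `-inf` logit becomes an exact zero weight.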
+class ResidualAttentionBlock(nn.Module):
+    def __init__(
+        self,
+        d_model: int,
+        n_head: int,
+        mlp_ratio: float = 4.0,
+        ls_init_value: Optional[float] = None,
+        act_layer: Callable = nn.GELU,
+        norm_layer: Callable = LayerNorm,
+        is_cross_attention: bool = False,
+        batch_first: bool = True,
+    ):
+        super().__init__()
+
+        self.ln_1 = norm_layer(d_model)
+        self.attn = nn.MultiheadAttention(d_model, n_head, batch_first=batch_first)
+        self.ls_1 = (
+            LayerScale(d_model, ls_init_value)
+            if ls_init_value is not None
+            else nn.Identity()
+        )
+        if is_cross_attention:
+            self.ln_1_kv = norm_layer(d_model)
+
+        self.ln_2 = norm_layer(d_model)
+        mlp_width = int(d_model * mlp_ratio)
+        self.mlp = nn.Sequential(
+            OrderedDict(
+                [
+                    ("c_fc", nn.Linear(d_model, mlp_width)),
+                    ("gelu", act_layer()),
+                    ("c_proj", nn.Linear(mlp_width, d_model)),
+                ]
+            )
+        )
+        self.ls_2 = (
+            LayerScale(d_model, ls_init_value)
+            if ls_init_value is not None
+            else nn.Identity()
+        )
+
+    def get_weight_dtype(self) -> torch.dtype:
+        if hasattr(self.mlp.c_fc, "int8_original_dtype"):
+            return self.mlp.c_fc.int8_original_dtype
+        return self.mlp.c_fc.weight.dtype
+
+    def attention(
+        self,
+        q_x: torch.Tensor,
+        k_x: Optional[torch.Tensor] = None,
+        v_x: Optional[torch.Tensor] = None,
+        attn_mask: Optional[torch.Tensor] = None,
+        key_padding_mask: Optional[torch.Tensor] = None,
+    ):
+        k_x = k_x if k_x is not None else q_x
+        v_x = v_x if v_x is not None else q_x
+
+        attn_mask = attn_mask.to(q_x.dtype) if attn_mask is not None else None
+        return self.attn(
+            q_x,
+            k_x,
+            v_x,
+            need_weights=False,
+            attn_mask=attn_mask,
+            key_padding_mask=key_padding_mask,
+        )[0]
+
+    def forward(
+        self,
+        q_x: torch.Tensor,
+        k_x: Optional[torch.Tensor] = None,
+        v_x: Optional[torch.Tensor] = None,
+        attn_mask: Optional[torch.Tensor] = None,
+        key_padding_mask: Optional[torch.Tensor] = None,
+    ):
+        k_x = (
+            self.ln_1_kv(k_x) if hasattr(self, "ln_1_kv") and k_x is not None else None
+        )
+        v_x = (
+            self.ln_1_kv(v_x) if hasattr(self, "ln_1_kv") and v_x is not None else None
+        )
+        x = q_x + self.ls_1(
+            self.attention(
+                q_x=self.ln_1(q_x),
+                k_x=k_x,
+                v_x=v_x,
+                attn_mask=attn_mask,
+                key_padding_mask=key_padding_mask,
+            )
+        )
+        x = x + self.ls_2(self.mlp(self.ln_2(x)))
+        return x
+
+
+class CustomResidualAttentionBlock(nn.Module):
+    def __init__(
+        self,
+        d_model: int,
+        n_head: int,
+        mlp_ratio: float = 4.0,
+        ls_init_value: Optional[float] = None,
+        act_layer: Type[nn.Module] = nn.GELU,
+        norm_layer: Type[nn.Module] = LayerNorm,
+        qk_norm: bool = False,
+        scale_cosine_attn: bool = False,
+        scale_heads: bool = False,
+        scale_attn_inner: bool = False,
+        scale_attn: bool = False,
+        scale_fc: bool = False,
+        batch_first: bool = True,
+    ):
+        super().__init__()
+        assert batch_first, "batch_first must be True for CustomResidualAttentionBlock"
+
+        self.ln_1 = norm_layer(d_model)
+        self.attn = Attention(
+            d_model,
+            n_head,
+            qk_norm=qk_norm,
+            scaled_cosine=scale_cosine_attn,
+            scale_heads=scale_heads,
+            inner_norm=scale_attn_inner,
+            norm_layer=norm_layer,
+        )
+        self.ln_attn = norm_layer(d_model) if scale_attn else nn.Identity()
+        self.ls_1 = (
+            LayerScale(d_model, ls_init_value)
+            if ls_init_value is not None
+            else nn.Identity()
+        )
+
+        self.ln_2 = norm_layer(d_model)
+        mlp_width = int(d_model * mlp_ratio)
+        self.mlp = nn.Sequential(
+            OrderedDict(
+                [
+                    ("c_fc", nn.Linear(d_model, mlp_width)),
+                    ("gelu", act_layer()),
+                    ("ln", norm_layer(mlp_width) if scale_fc else nn.Identity()),
+                    ("c_proj", nn.Linear(mlp_width, d_model)),
+                ]
+            )
+        )
+        self.ls_2 = (
+            LayerScale(d_model, ls_init_value)
+            if ls_init_value is not None
+            else nn.Identity()
+        )
+
+    def get_weight_dtype(self) -> torch.dtype:
+        if hasattr(self.mlp.c_fc, "int8_original_dtype"):
+            return self.mlp.c_fc.int8_original_dtype
+        return self.mlp.c_fc.weight.dtype
+
+    def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
+        x = x + self.ls_1(self.ln_attn(self.attn(self.ln_1(x), attn_mask=attn_mask)))
+        x = x + self.ls_2(self.mlp(self.ln_2(x)))
+        return x
+
+
+class Transformer(nn.Module):
+    def __init__(
+        self,
+        width: int,
+        layers: int,
+        heads: int,
+        mlp_ratio: float = 4.0,
+        ls_init_value: Optional[float] = None,
+        act_layer: Type[nn.Module] = nn.GELU,
+        norm_layer: Type[nn.Module] = LayerNorm,
+        batch_first: bool = True,
+        block_type: Optional[str] = None,
+        qk_norm: bool = False,
+        scaled_cosine_attn: bool = False,
+        scale_heads: bool = False,
+        scale_attn_inner: bool = False,
+        scale_attn: bool = False,
+        scale_fc: bool = False,
+    ):
+        super().__init__()
+        self.width = width
+        self.layers = layers
+        self.batch_first = batch_first
+        self.grad_checkpointing = False
+
+        if block_type is None:
+            if any(
+                [
+                    qk_norm,
+                    scaled_cosine_attn,
+                    scale_heads,
+                    scale_attn_inner,
+                    scale_attn,
+                    scale_fc,
+                ]
+            ):
+                block_type = "custom"
+            else:
+                block_type = "default"
+
+        if block_type == "custom":
+            self.resblocks = nn.ModuleList(
+                [
+                    CustomResidualAttentionBlock(
+                        width,
+                        heads,
+                        mlp_ratio,
+                        ls_init_value=ls_init_value,
+                        act_layer=act_layer,
+                        norm_layer=norm_layer,
+                        qk_norm=qk_norm,
+                        scale_cosine_attn=scaled_cosine_attn,
+                        scale_heads=scale_heads,
+                        scale_attn_inner=scale_attn_inner,
+                        scale_attn=scale_attn,
+                        scale_fc=scale_fc,
+                        batch_first=batch_first,
+                    )
+                    for _ in range(layers)
+                ]
+            )
+        else:
+            self.resblocks = nn.ModuleList(
+                [
+                    ResidualAttentionBlock(
+                        width,
+                        heads,
+                        mlp_ratio,
+                        ls_init_value=ls_init_value,
+                        act_layer=act_layer,
+                        norm_layer=norm_layer,
+                        batch_first=batch_first,
+                    )
+                    for _ in range(layers)
+                ]
+            )
+
+    def get_cast_dtype(self) -> torch.dtype:
+        return self.resblocks[0].get_weight_dtype()
+
+    def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
+        if not self.batch_first:
+            x = x.transpose(0, 1).contiguous()
+
+        for r in self.resblocks:
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                # Pass attn_mask by keyword so both block types' forward
+                # signatures are satisfied (custom blocks take (x, attn_mask)).
+                x = checkpoint(r, x, attn_mask=attn_mask, use_reentrant=False)
+            else:
+                x = r(x, attn_mask=attn_mask)
+
+        if not self.batch_first:
+            x = x.transpose(0, 1)
+        return x
+
+
+def _expand_token(token, batch_size: int):
+    return token.view(1, 1, -1).expand(batch_size, -1, -1)
+
+
+def text_global_pool(
+    x: torch.Tensor,
+    text: Optional[torch.Tensor] = None,
+    pool_type: str = "argmax",
+    eos_token_id: Optional[int] = None,
+) -> torch.Tensor:
+    if pool_type == "first":
+        pooled = x[:, 0]
+    elif pool_type == "last":
+        pooled = x[:, -1]
+    elif pool_type == "argmax":
+        assert text is not None
+        pooled = x[torch.arange(x.shape[0], device=x.device), text.argmax(dim=-1)]
+    elif pool_type == "eos":
+        assert text is not None
+        assert eos_token_id is not None
+        idx = (text == eos_token_id).int().argmax(dim=-1)
+        pooled = x[torch.arange(x.shape[0], device=x.device), idx]
+    else:
+        pooled = x
+
+    return pooled
+
+
+class TextTransformer(nn.Module):
+    output_tokens: torch.jit.Final[bool]
+
+    def __init__(
+        self,
+        context_length: int = 77,
+        vocab_size: int = 49408,
+        width: int = 512,
+        heads: int = 8,
+        layers: int = 12,
+        mlp_ratio: float = 4.0,
+        ls_init_value: Optional[float] = None,
+        output_dim: Optional[int] = 512,
+        embed_cls: bool = False,
+        no_causal_mask: bool = False,
+        use_pad_mask: bool = False,
+        correct_cls_mask: bool = False,
+        pad_id: int = 0,
+        eos_id: int = 2,
+        pool_type: str = "argmax",
+        proj_type: str = "linear",
+        proj_bias: bool = False,
+        act_layer: Type[nn.Module] = nn.GELU,
+        norm_layer: Type[nn.Module] = LayerNorm,
+        output_tokens: bool = False,
+        block_type: Optional[str] = None,
+        qk_norm: bool = False,
+        scaled_cosine_attn: bool = False,
+        scale_heads: bool = False,
+        scale_attn_inner: bool = False,
+        scale_attn: bool = False,
+        scale_fc: bool = False,
+    ):
+        super().__init__()
+        assert pool_type in ("first", "last", "argmax", "eos", "none")
+        self.output_tokens = output_tokens
+        self.num_pos = self.context_length = context_length
+        self.vocab_size = vocab_size
+        self.width = width
+        self.output_dim = output_dim
+        self.heads = heads
+        self.pad_id = pad_id
+        self.eos_id = eos_id
+        self.pool_type = pool_type
+        self.use_pad_mask = use_pad_mask and no_causal_mask
+        self.correct_cls_mask = correct_cls_mask
+
+        self.token_embedding = nn.Embedding(vocab_size, width)
+        if embed_cls:
+            self.cls_emb = nn.Parameter(torch.empty(width))
+            self.num_pos += 1
+        else:
+            self.cls_emb = None
+        self.positional_embedding = nn.Parameter(torch.empty(self.num_pos, width))
+        self.transformer = Transformer(
+            width=width,
+            layers=layers,
+            heads=heads,
+            mlp_ratio=mlp_ratio,
+            ls_init_value=ls_init_value,
+            act_layer=act_layer,
+            norm_layer=norm_layer,
+            block_type=block_type,
+            qk_norm=qk_norm,
+            scaled_cosine_attn=scaled_cosine_attn,
+            scale_heads=scale_heads,
+            scale_attn_inner=scale_attn_inner,
+            scale_attn=scale_attn,
+            scale_fc=scale_fc,
+        )
+        self.ln_final = norm_layer(width)
+
+        if no_causal_mask:
+            self.attn_mask = None
+        else:
+            self.register_buffer(
+                "attn_mask", self.build_causal_mask(), persistent=False
+            )
+
+        if proj_type == "none" or not output_dim:
+            self.text_projection = None
+        else:
+            if proj_bias:
+                self.text_projection = nn.Linear(width, output_dim)
+            else:
+                self.text_projection = nn.Parameter(torch.empty(width, output_dim))
+
+        self.init_parameters()
+
+    def init_parameters(self):
+        nn.init.normal_(self.token_embedding.weight, std=0.02)
+        nn.init.normal_(self.positional_embedding, std=0.01)
+        if self.cls_emb is not None:
+            nn.init.normal_(self.cls_emb, std=0.01)
+
+        proj_std = (self.transformer.width**-0.5) * (
+            (2 * self.transformer.layers) ** -0.5
+        )
+        attn_std = self.transformer.width**-0.5
+        fc_std = (2 * self.transformer.width) ** -0.5
+        for block in self.transformer.resblocks:
+            nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
+            nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
+            nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
+            nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
+
+        if self.text_projection is not None:
+            if isinstance(self.text_projection, nn.Linear):
+                nn.init.normal_(
+                    self.text_projection.weight, std=self.transformer.width**-0.5
+                )
+                if self.text_projection.bias is not None:
+                    nn.init.zeros_(self.text_projection.bias)
+            else:
+                nn.init.normal_(self.text_projection, std=self.transformer.width**-0.5)
+
+    def build_causal_mask(self):
+        mask = torch.empty(self.num_pos, self.num_pos)
+        mask.fill_(float("-inf"))
+        mask.triu_(1)
+        return mask
+
+    def _build_additive_mask(self, text, seq_len, dtype):
+        valid = text != self.pad_id
+        if self.cls_emb is not None:
+            cls_valid = valid.new_ones(valid.size(0), 1)
+            valid = torch.cat(
+                [valid, cls_valid] if self.correct_cls_mask else [cls_valid, valid], 1
+            )
+        key_mask = valid.unsqueeze(1).expand(-1, seq_len, -1)
+        additive = torch.zeros_like(key_mask, dtype=dtype)
+        additive.masked_fill_(~key_mask, float("-inf"))
+        additive = additive.repeat_interleave(self.heads, 0)
+        return additive
+
+    def _embeds(self, text):
+        cast_dtype = self.transformer.get_cast_dtype()
+        B, seq_len = text.shape
+        x = self.token_embedding(text).to(cast_dtype)
+        if self.cls_emb is not None:
+            x = torch.cat([x, _expand_token(self.cls_emb, x.size(0))], 1)
+            seq_len += 1
+        attn_mask = self.attn_mask
+        if self.use_pad_mask or self.cls_emb is not None:
+            add_mask = self._build_additive_mask(text, seq_len, x.dtype)
+            if attn_mask is not None:
+                attn_mask = attn_mask[:seq_len, :seq_len].unsqueeze(0) + add_mask
+            else:
+                attn_mask = add_mask
+        x = x + self.positional_embedding[:seq_len].to(cast_dtype)
+        return x, attn_mask
+
+    def forward(self, text):
+        x, attn_mask = self._embeds(text)
+        x = self.transformer(x, attn_mask=attn_mask)
+        if self.cls_emb is not None:
+            pooled = text_global_pool(x, pool_type="last")
+            pooled = self.ln_final(pooled)
+            tokens = x[:, :-1]
+        else:
+            x = self.ln_final(x)
+            pooled = text_global_pool(
+                x,
+                text,
+                pool_type=self.pool_type,
+                eos_token_id=getattr(self, "eos_id", None),
+            )
+            tokens = x
+        if self.text_projection is not None:
+            if isinstance(self.text_projection, nn.Linear):
+                pooled = self.text_projection(pooled)
+            else:
+                pooled = pooled @ self.text_projection
+        if self.output_tokens:
+            return pooled, tokens
+        return pooled
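`build_causal_mask` fills the strict upper triangle (column index greater than row index) with `-inf`, so position i can only attend to positions up to and including i. The same additive mask in plain Python (torch-free sketch; our own helper name):

```python
def causal_mask(num_pos: int):
    # mask[i][j] = -inf when j > i (future positions), else 0.0
    return [
        [float("-inf") if j > i else 0.0 for j in range(num_pos)]
        for i in range(num_pos)
    ]


mask = causal_mask(4)
assert mask[0][1] == float("-inf")               # cannot attend to the future
assert mask[2][0] == 0.0                         # past positions stay visible
assert all(mask[i][i] == 0.0 for i in range(4))  # self-attention is allowed
```

Because the mask is additive, it is simply summed with the attention logits before the softmax, which is also why `_embeds` can combine it with the padding mask by addition.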
raon_vision_encoder/utils.py ADDED
@@ -0,0 +1,16 @@
+# Originally from OpenCLIP (https://github.com/mlfoundations/open_clip)
+
+import collections.abc
+from itertools import repeat
+
+
+def _ntuple(n):
+    def parse(x):
+        if isinstance(x, collections.abc.Iterable):
+            return x
+        return tuple(repeat(x, n))
+
+    return parse
+
+
+to_2tuple = _ntuple(2)
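`to_2tuple` broadcasts a scalar into a pair while passing iterables through unchanged, which is how size-like arguments can accept either `16` or `(16, 16)`. A standalone copy showing the behavior:

```python
import collections.abc
from itertools import repeat


def _ntuple(n):
    def parse(x):
        # Iterables (tuples, lists) pass through untouched;
        # scalars are repeated n times into a tuple.
        if isinstance(x, collections.abc.Iterable):
            return x
        return tuple(repeat(x, n))

    return parse


to_2tuple = _ntuple(2)

print(to_2tuple(16))       # scalar -> (16, 16)
print(to_2tuple((8, 32)))  # already a pair -> returned unchanged
```

One caveat: strings are iterable, so `to_2tuple("ab")` returns the string as-is rather than a pair; callers are expected to pass numbers or number sequences.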