Upload VerMind model
- chat_template.jinja +74 -0
- config.json +231 -0
- configuration_vermind.py +79 -0
- configuration_vermind_v.py +32 -0
- generation_config.json +6 -0
- model.safetensors +3 -0
- modeling_vermind.py +318 -0
- modeling_vermind_v.py +200 -0
- special_tokens_map.json +30 -0
- tokenizer.json +0 -0
- tokenizer_config.json +43 -0
chat_template.jinja
ADDED
@@ -0,0 +1,74 @@
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' -%}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else -%}
+        {{- '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if message.content is string %}
+        {%- set content = message.content %}
+    {%- else %}
+        {%- set content = '' %}
+    {%- endif %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
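
This is a ChatML-style template with tool-calling support and an optional thinking switch. A minimal rendering sketch in Python, assuming a local copy of this upload at ./vermind (a placeholder path); extra keyword arguments passed to apply_chat_template are exposed to the template, which is how enable_thinking reaches it:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("./vermind")
    messages = [{"role": "user", "content": "What is 2 + 2?"}]
    prompt = tok.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,  # appends '<|im_start|>assistant\n'
        enable_thinking=False,       # template then emits an empty <think>...</think> block
    )
    print(prompt)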
config.json
ADDED
@@ -0,0 +1,231 @@
+{
+  "architectures": [
+    "VerMindVLM"
+  ],
+  "aux_loss_alpha": 0.01,
+  "bos_token_id": 1,
+  "dropout": 0.0,
+  "dtype": "float32",
+  "eos_token_id": 2,
+  "flash_attn": true,
+  "hidden_act": "silu",
+  "hidden_size": 768,
+  "image_ids": [
+    34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+    34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+    34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+    34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+    34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+    34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+    34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+    34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+    34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+    34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+    34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+    34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+    34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+    34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34
+  ],
+  "image_special_token": "<image>",
+  "inference_rope_scaling": false,
+  "intermediate_size": 2048,
+  "max_position_embeddings": 32768,
+  "model_type": "vermind-v",
+  "n_routed_experts": 4,
+  "n_shared_experts": 1,
+  "norm_topk_prob": true,
+  "num_attention_heads": 8,
+  "num_experts_per_tok": 2,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "scoring_func": "softmax",
+  "seq_aux": true,
+  "transformers_version": "4.57.6",
+  "use_moe": false,
+  "vocab_size": 6400
+}
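
Two entries matter most for loading: "model_type": "vermind-v" routes to the custom VLMConfig defined below, and "image_ids" is 196 copies of token id 34, one per SigLIP patch token. Since the config classes live in this repo rather than in transformers, a sketch of reading the file directly through the config class (package name and path are hypothetical):

    # assumes the repo's .py files are importable as a package, here called vermind
    from vermind.configuration_vermind_v import VLMConfig

    cfg = VLMConfig.from_pretrained("./vermind")
    assert cfg.model_type == "vermind-v" and len(cfg.image_ids) == 196
    print(cfg.hidden_size // cfg.num_attention_heads)  # head_dim = 768 // 8 = 96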
configuration_vermind.py
ADDED
@@ -0,0 +1,79 @@
+# coding=utf-8
+"""
+Configuration file for VerMind model - Standalone Version
+"""
+
+from transformers import PretrainedConfig, AutoConfig
+
+
+class VerMindConfig(PretrainedConfig):
+    """Configuration class for VerMind model"""
+    model_type = "vermind"
+
+    def __init__(
+        self,
+        dropout: float = 0.0,
+        bos_token_id: int = 1,
+        eos_token_id: int = 2,
+        hidden_act: str = 'silu',
+        hidden_size: int = 768,
+        intermediate_size: int = None,
+        max_position_embeddings: int = 32768,
+        num_attention_heads: int = 8,
+        num_hidden_layers: int = 16,
+        num_key_value_heads: int = 2,
+        vocab_size: int = 6400,
+        rms_norm_eps: float = 1e-05,
+        rope_theta: float = 1000000.0,
+        inference_rope_scaling: bool = False,
+        flash_attn: bool = True,
+        use_moe: bool = False,
+        num_experts_per_tok: int = 2,
+        n_routed_experts: int = 4,
+        n_shared_experts: int = 1,
+        scoring_func: str = 'softmax',
+        aux_loss_alpha: float = 0.01,
+        seq_aux: bool = True,
+        norm_topk_prob: bool = True,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.dropout = dropout
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.hidden_act = hidden_act
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.max_position_embeddings = max_position_embeddings
+        self.num_attention_heads = num_attention_heads
+        self.num_hidden_layers = num_hidden_layers
+        self.num_key_value_heads = num_key_value_heads
+        self.vocab_size = vocab_size
+        self.rms_norm_eps = rms_norm_eps
+        self.rope_theta = rope_theta
+        self.inference_rope_scaling = inference_rope_scaling
+
+        self.rope_scaling = {
+            "beta_fast": 32,
+            "beta_slow": 1,
+            "factor": 16,
+            "original_max_position_embeddings": 2048,
+            "attention_factor": 1.0,
+            "type": "yarn"
+        } if self.inference_rope_scaling else None
+        self.flash_attn = flash_attn
+
+        self.use_moe = use_moe
+        self.num_experts_per_tok = num_experts_per_tok
+        self.n_routed_experts = n_routed_experts
+        self.n_shared_experts = n_shared_experts
+        self.scoring_func = scoring_func
+        self.aux_loss_alpha = aux_loss_alpha
+        self.seq_aux = seq_aux
+        self.norm_topk_prob = norm_topk_prob
+
+
+# Register the config class
+AutoConfig.register("vermind", VerMindConfig)
+
+__all__ = ["VerMindConfig"]
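
inference_rope_scaling is the only switch for long-context behavior: when true, a fixed YaRN recipe is installed (factor 16 over a 2048-token original window, and 2048 * 16 = 32768 matches max_position_embeddings); when false, rope_scaling stays None, which is what the uploaded config.json records. A quick sketch, assuming VerMindConfig is importable as above:

    cfg = VerMindConfig()
    assert cfg.rope_scaling is None                       # plain RoPE
    cfg_long = VerMindConfig(inference_rope_scaling=True)
    print(cfg_long.rope_scaling["type"], cfg_long.rope_scaling["factor"])  # yarn 16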
configuration_vermind_v.py
ADDED
@@ -0,0 +1,32 @@
+# coding=utf-8
+"""
+Configuration file for VerMind-V model - Standalone Version
+"""
+
+from typing import List
+from transformers import AutoConfig
+
+from .configuration_vermind import VerMindConfig
+
+
+class VLMConfig(VerMindConfig):
+    """Configuration class for VerMind-V (Vision-Language) model"""
+    model_type = "vermind-v"
+
+    def __init__(
+        self,
+        image_special_token: str = '<image>',
+        image_ids: List = None,
+        **kwargs,
+    ):
+        if image_ids is None:
+            image_ids = [34] * 196  # SigLIP 14x14 = 196 tokens, no pooling
+        self.image_special_token = image_special_token
+        self.image_ids = image_ids
+        super().__init__(**kwargs)
+
+
+# Register the config class
+AutoConfig.register("vermind-v", VLMConfig)
+
+__all__ = ["VLMConfig"]
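
The 196-token default follows from the vision tower: SigLIP-Base at 224 px with 16 px patches produces a 14 x 14 = 196 patch grid, and no pooling is applied, so the text side reserves one placeholder id per patch embedding. A quick sketch, assuming VLMConfig is importable:

    cfg = VLMConfig()
    print(cfg.image_special_token)                 # <image>
    print(len(cfg.image_ids), set(cfg.image_ids))  # 196 {34}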
generation_config.json
ADDED
@@ -0,0 +1,6 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "transformers_version": "4.57.6"
+}
model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c9a6c8ce7b30883286b8cda1a1f5c638f5cd970d5f780ae9ac15e309dfd06a5
+size 812097376
modeling_vermind.py
ADDED
@@ -0,0 +1,318 @@
+# coding=utf-8
+"""
+Model file for VerMind model - Standalone Version
+Contains complete implementation without external dependencies
+"""
+
+import math
+from typing import Optional, Tuple, List, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PreTrainedModel, GenerationMixin, AutoModelForCausalLM
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import CausalLMOutputWithPast
+
+from .configuration_vermind import VerMindConfig
+
+
+# ==================== Base Module Functions ====================
+
+def precompute_freqs_cis(dim: int, end: int = int(32 * 1024), rope_base: float = 1e6,
+                         rope_scaling: Optional[dict] = None):
+    """Precompute rotary position embedding frequencies (with optional YaRN scaling)"""
+    freqs, attn_factor = 1.0 / (rope_base ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)), 1.0
+    if rope_scaling is not None:
+        orig_max, factor, beta_fast, beta_slow, attn_factor = (
+            rope_scaling.get("original_max_position_embeddings", 2048),
+            rope_scaling.get("factor", 16),
+            rope_scaling.get("beta_fast", 32.0),
+            rope_scaling.get("beta_slow", 1.0),
+            rope_scaling.get("attention_factor", 1.0)
+        )
+        if end / orig_max > 1.0:
+            inv_dim = lambda b: (dim * math.log(orig_max / (b * 2 * math.pi))) / (2 * math.log(rope_base))
+            low, high = max(math.floor(inv_dim(beta_fast)), 0), min(math.ceil(inv_dim(beta_slow)), dim // 2 - 1)
+            ramp = torch.clamp((torch.arange(dim // 2, device=freqs.device).float() - low) / max(high - low, 0.001), 0, 1)
+            freqs = freqs * (1 - ramp + ramp / factor)
+
+    t = torch.arange(end, device=freqs.device)
+    freqs = torch.outer(t, freqs).float()
+    freqs_cos = torch.cat([torch.cos(freqs), torch.cos(freqs)], dim=-1) * attn_factor
+    freqs_sin = torch.cat([torch.sin(freqs), torch.sin(freqs)], dim=-1) * attn_factor
+    return freqs_cos, freqs_sin
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Apply rotary position embeddings to queries and keys"""
+    def rotate_half(x):
+        return torch.cat((-x[..., x.shape[-1] // 2:], x[..., : x.shape[-1] // 2]), dim=-1)
+
+    if position_ids is not None:
+        if position_ids.dim() == 1:
+            pos_ids = position_ids
+            cos_selected = cos[pos_ids]
+            sin_selected = sin[pos_ids]
+            cos_selected = cos_selected.unsqueeze(0).unsqueeze(2)
+            sin_selected = sin_selected.unsqueeze(0).unsqueeze(2)
+        else:
+            cos_selected = cos[position_ids]
+            sin_selected = sin[position_ids]
+            cos_selected = cos_selected.unsqueeze(2)
+            sin_selected = sin_selected.unsqueeze(2)
+
+        q_embed = (q * cos_selected) + (rotate_half(q) * sin_selected)
+        k_embed = (k * cos_selected) + (rotate_half(k) * sin_selected)
+    else:
+        seq_len = q.shape[1]
+        cos_s = cos[:seq_len]
+        sin_s = sin[:seq_len]
+        cos_s = cos_s.unsqueeze(0).unsqueeze(2)
+        sin_s = sin_s.unsqueeze(0).unsqueeze(2)
+        q_embed = (q * cos_s) + (rotate_half(q) * sin_s)
+        k_embed = (k * cos_s) + (rotate_half(k) * sin_s)
+
+    return q_embed, k_embed
+
+
+def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """Repeat key/value heads for GQA"""
+    bs, slen, num_key_value_heads, head_dim = x.shape
+    if n_rep == 1:
+        return x
+    return x[:, :, :, None, :].expand(bs, slen, num_key_value_heads, n_rep, head_dim).reshape(
+        bs, slen, num_key_value_heads * n_rep, head_dim
+    )
+
+
+# ==================== Module Classes ====================
+
+class RMSNorm(nn.Module):
+    """Root Mean Square Layer Normalization"""
+    def __init__(self, dim: int, eps: float = 1e-5):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+
+    def _norm(self, x):
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+
+    def forward(self, x):
+        return self.weight * self._norm(x.float()).type_as(x)
+
+
+class FeedForward(nn.Module):
+    """SwiGLU Feed-Forward Network"""
+    def __init__(self, config: VerMindConfig):
+        super().__init__()
+        if config.intermediate_size is None:
+            intermediate_size = int(config.hidden_size * 8 / 3)
+            config.intermediate_size = 64 * ((intermediate_size + 64 - 1) // 64)  # round up to a multiple of 64
+        self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
+        self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
+        self.dropout = nn.Dropout(config.dropout)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        return self.dropout(self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)))
+
+
+class Attention(nn.Module):
+    """Grouped Query Attention with RoPE"""
+    def __init__(self, args: VerMindConfig):
+        super().__init__()
+        self.num_key_value_heads = args.num_attention_heads if args.num_key_value_heads is None else args.num_key_value_heads
+        assert args.num_attention_heads % self.num_key_value_heads == 0
+        self.n_local_heads = args.num_attention_heads
+        self.n_local_kv_heads = self.num_key_value_heads
+        self.n_rep = self.n_local_heads // self.n_local_kv_heads
+        self.head_dim = args.hidden_size // args.num_attention_heads
+        self.q_proj = nn.Linear(args.hidden_size, args.num_attention_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(args.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(args.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(args.num_attention_heads * self.head_dim, args.hidden_size, bias=False)
+        self.attn_dropout = nn.Dropout(args.dropout)
+        self.resid_dropout = nn.Dropout(args.dropout)
+        self.dropout = args.dropout
+        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention') and args.flash_attn
+
+    def forward(self, x, position_embeddings, past_key_value=None, use_cache=False,
+                attention_mask=None, position_ids=None, cu_seqlens=None):
+        bsz, seq_len, _ = x.shape
+        xq, xk, xv = self.q_proj(x), self.k_proj(x), self.v_proj(x)
+        xq = xq.view(bsz, seq_len, self.n_local_heads, self.head_dim)
+        xk = xk.view(bsz, seq_len, self.n_local_kv_heads, self.head_dim)
+        xv = xv.view(bsz, seq_len, self.n_local_kv_heads, self.head_dim)
+
+        cos, sin = position_embeddings
+        xq, xk = apply_rotary_pos_emb(xq, xk, cos, sin, position_ids=position_ids)
+
+        if past_key_value is not None:
+            xk = torch.cat([past_key_value[0], xk], dim=1)
+            xv = torch.cat([past_key_value[1], xv], dim=1)
+        past_kv = (xk, xv) if use_cache else None
+
+        xq, xk, xv = xq.transpose(1, 2), repeat_kv(xk, self.n_rep).transpose(1, 2), repeat_kv(xv, self.n_rep).transpose(1, 2)
+
+        is_2d_mask = attention_mask is not None and attention_mask.dim() == 3
+        use_flash = self.flash and (seq_len > 1) and (past_key_value is None)
+
+        if use_flash and (attention_mask is None or (not is_2d_mask and torch.all(attention_mask == 1))):
+            output = F.scaled_dot_product_attention(
+                xq, xk, xv,
+                dropout_p=self.dropout if self.training else 0.0,
+                is_causal=True
+            )
+        else:
+            scores = (xq @ xk.transpose(-2, -1)) / math.sqrt(self.head_dim)
+            if not is_2d_mask:
+                scores[:, :, :, -seq_len:] += torch.triu(torch.full((seq_len, seq_len), float("-inf"), device=scores.device), diagonal=1)
+            if attention_mask is not None:
+                if is_2d_mask:
+                    attention_mask = attention_mask[:, 0, :] if attention_mask.dim() == 3 else attention_mask
+                extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+                extended_attention_mask = (1.0 - extended_attention_mask.float()) * -1e9
+                scores = scores + extended_attention_mask
+            scores = F.softmax(scores.float(), dim=-1).type_as(xq)
+            scores = self.attn_dropout(scores)
+            output = scores @ xv
+
+        output = output.transpose(1, 2).reshape(bsz, seq_len, -1)
+        output = self.resid_dropout(self.o_proj(output))
+        return output, past_kv
+
+
+# ==================== Main Model Classes ====================
+
+class VerMindBlock(nn.Module):
+    """Transformer Decoder Block"""
+    def __init__(self, layer_id: int, config: VerMindConfig):
+        super().__init__()
+        self.num_attention_heads = config.num_attention_heads
+        self.hidden_size = config.hidden_size
+        self.head_dim = config.hidden_size // config.num_attention_heads
+        self.self_attn = Attention(config)
+        self.layer_id = layer_id
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.mlp = FeedForward(config)
+
+    def forward(self, hidden_states, position_embeddings, past_key_value=None, use_cache=False,
+                attention_mask=None, position_ids=None, cu_seqlens=None):
+        residual = hidden_states
+        hidden_states, present_key_value = self.self_attn(
+            self.input_layernorm(hidden_states),
+            position_embeddings,
+            past_key_value,
+            use_cache,
+            attention_mask,
+            position_ids=position_ids,
+            cu_seqlens=cu_seqlens
+        )
+        hidden_states += residual
+        hidden_states = hidden_states + self.mlp(self.post_attention_layernorm(hidden_states))
+        return hidden_states, present_key_value
+
+
+class VerMindModel(nn.Module):
+    """VerMind Model (Transformer backbone)"""
+    def __init__(self, config: VerMindConfig):
+        super().__init__()
+        self.config = config
+        self.vocab_size = config.vocab_size
+        self.num_hidden_layers = config.num_hidden_layers
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.dropout)
+        self.layers = nn.ModuleList([VerMindBlock(l, config) for l in range(self.num_hidden_layers)])
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        freqs_cos, freqs_sin = precompute_freqs_cis(
+            dim=config.hidden_size // config.num_attention_heads,
+            end=config.max_position_embeddings,
+            rope_base=config.rope_theta,
+            rope_scaling=config.rope_scaling
+        )
+        self.register_buffer("freqs_cos", freqs_cos, persistent=False)
+        self.register_buffer("freqs_sin", freqs_sin, persistent=False)
+
+    def forward(self, input_ids=None, attention_mask=None, past_key_values=None,
+                use_cache=False, position_ids=None, cu_seqlens=None, **kwargs):
+        # HF generate may hand over a Cache object; this implementation only
+        # supports plain (key, value) tuples, so anything else is reset.
+        if past_key_values is not None and hasattr(past_key_values, 'layers'):
+            past_key_values = None
+        past_key_values = past_key_values or [None] * len(self.layers)
+        start_pos = past_key_values[0][0].shape[1] if past_key_values[0] is not None else 0
+
+        hidden_states = self.dropout(self.embed_tokens(input_ids))
+        position_embeddings = (self.freqs_cos, self.freqs_sin)
+
+        presents = []
+        for layer_idx, (layer, past_key_value) in enumerate(zip(self.layers, past_key_values)):
+            hidden_states, present = layer(
+                hidden_states,
+                position_embeddings,
+                past_key_value=past_key_value,
+                use_cache=use_cache,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                cu_seqlens=cu_seqlens
+            )
+            presents.append(present)
+
+        hidden_states = self.norm(hidden_states)
+        aux_loss = 0  # no MoE layers in this checkpoint (use_moe is false), so no auxiliary loss
+        return hidden_states, presents, aux_loss
+
+
+class VerMindForCausalLM(PreTrainedModel, GenerationMixin):
+    """VerMind Causal Language Model"""
+    config_class = VerMindConfig
+
+    def __init__(self, config: VerMindConfig = None):
+        self.config = config or VerMindConfig()
+        super().__init__(self.config)
+        self.model = VerMindModel(self.config)
+        self.lm_head = nn.Linear(self.config.hidden_size, self.config.vocab_size, bias=False)
+        self.model.embed_tokens.weight = self.lm_head.weight  # tie input and output embeddings
+
+    def forward(self, input_ids=None, attention_mask=None, labels=None,
+                past_key_values=None, use_cache=False, logits_to_keep=0,
+                position_ids=None, cu_seqlens=None, **args):
+        hidden_states, past_key_values, aux_loss = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            position_ids=position_ids,
+            cu_seqlens=cu_seqlens,
+            **args
+        )
+
+        is_varlen = cu_seqlens is not None
+        if is_varlen:
+            logits = self.lm_head(hidden_states)
+        else:
+            slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+            logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            if is_varlen:
+                shift_logits = logits[:-1, :].contiguous()
+                shift_labels = labels[1:].contiguous()
+                loss = F.cross_entropy(shift_logits, shift_labels, ignore_index=-100)
+            else:
+                shift_logits = logits[..., :-1, :].contiguous()
+                shift_labels = labels[..., 1:].contiguous()
+                loss = F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1), ignore_index=-100)
+
+        output = CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=past_key_values, hidden_states=hidden_states)
+        output.aux_loss = aux_loss
+        return output
+
+
+# Register the model class
+AutoModelForCausalLM.register(VerMindForCausalLM.config_class, VerMindForCausalLM)
+
+__all__ = ["VerMindForCausalLM", "VerMindModel", "VerMindBlock", "Attention", "FeedForward", "RMSNorm"]
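
A minimal text-only forward pass, assuming the repo's modules are importable as a package and ./vermind is a local copy of this upload (both placeholders). Note the checkpoint's architectures entry is VerMindVLM, so loading the bare causal LM should work for the text stack but will warn about unexpected vision weights:

    import torch
    from transformers import AutoTokenizer

    from vermind.modeling_vermind import VerMindForCausalLM  # hypothetical package name

    tok = AutoTokenizer.from_pretrained("./vermind")
    model = VerMindForCausalLM.from_pretrained("./vermind").eval()

    messages = [{"role": "user", "content": "Hello!"}]
    input_ids = tok.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
    with torch.no_grad():
        logits = model(input_ids=input_ids).logits      # (1, seq_len, 6400)
    print(tok.decode([logits[0, -1].argmax().item()]))  # greedy next token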
modeling_vermind_v.py
ADDED
@@ -0,0 +1,200 @@
+# coding=utf-8
+"""
+Model file for VerMind-V (VLM) model - Standalone Version
+Contains complete VLM implementation without external dependencies
+"""
+
+import os
+import warnings
+from typing import Optional, Tuple, List, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import AutoModelForCausalLM
+from transformers.modeling_outputs import CausalLMOutputWithPast
+
+from .modeling_vermind import VerMindForCausalLM, VerMindModel, RMSNorm, precompute_freqs_cis, apply_rotary_pos_emb, repeat_kv
+from .configuration_vermind_v import VLMConfig
+
+warnings.filterwarnings('ignore')
+
+
+class VisionProj(nn.Module):
+    """Vision Projection Layer - Projects vision features to language model space"""
+    def __init__(self, ve_hidden_size=768, hidden_size=512):
+        super().__init__()
+        self.ve_hidden_size = ve_hidden_size
+        self.hidden_size = hidden_size
+        intermediate_size = min(ve_hidden_size, hidden_size)
+        self.proj = nn.Sequential(
+            nn.LayerNorm(ve_hidden_size),
+            nn.Linear(ve_hidden_size, intermediate_size),
+            nn.GELU(),
+            nn.Linear(intermediate_size, hidden_size)
+        )
+
+    def forward(self, image_encoders):
+        return self.proj(image_encoders)
+
+
+class VerMindVLM(VerMindForCausalLM):
+    """VerMind Vision-Language Model"""
+    config_class = VLMConfig
+
+    def __init__(self, params: VLMConfig = None, vision_model_path="google/siglip-base-patch16-224"):
+        # Initialize using parent init but with our own model structure
+        self.params = params or VLMConfig()
+        # Call nn.Module.__init__ directly (skipping VerMindForCausalLM.__init__)
+        # so the text backbone is not built twice
+        nn.Module.__init__(self)
+        self.config = self.params
+
+        # Build the model components
+        self.model = VerMindVLMModel(self.params)
+        self.lm_head = nn.Linear(self.params.hidden_size, self.params.vocab_size, bias=False)
+        self.model.embed_tokens.weight = self.lm_head.weight
+
+        # Vision components
+        self.vision_encoder, self.processor = self.__class__.get_vision_model(vision_model_path)
+        self.vision_proj = VisionProj(ve_hidden_size=768, hidden_size=self.params.hidden_size)
+
+    @staticmethod
+    def get_vision_model(model_path: str):
+        """Load vision encoder (SigLIP)"""
+        from transformers import logging as hf_logging
+        from transformers import SiglipVisionModel, SiglipProcessor
+        hf_logging.set_verbosity_error()
+
+        if not os.path.exists(model_path) and "/" not in model_path:
+            return None, None
+
+        print(f"[VerMind-V] Loading Vision Encoder: {model_path}...")
+        try:
+            vision_model = SiglipVisionModel.from_pretrained(model_path)
+            processor = SiglipProcessor.from_pretrained(model_path)
+        except Exception as e:
+            print(f"Error loading SigLIP Vision: {e}")
+            return None, None
+
+        # The vision tower stays frozen
+        for param in vision_model.parameters():
+            param.requires_grad = False
+        return vision_model.eval(), processor
+
+    @staticmethod
+    def image2tensor(image, processor):
+        """Convert PIL image to tensor"""
+        if image.mode in ['RGBA', 'LA']:
+            image = image.convert('RGB')
+        inputs = processor(images=image, return_tensors="pt")['pixel_values']
+        return inputs
+
+    @staticmethod
+    def get_image_embeddings(image_tensors, vision_model):
+        """Extract image features from vision encoder"""
+        outputs = vision_model(pixel_values=image_tensors)
+        return outputs.last_hidden_state
+
+    def count_vision_proj(self, tokens, h, vision_tensors=None, seqlen=512):
+        """Insert vision projections into hidden states at image token positions"""
+        def find_indices(tokens, image_ids):
+            image_ids_tensor = torch.tensor(image_ids).to(tokens.device)
+            len_image_ids = len(image_ids)
+            if len_image_ids > tokens.size(1):
+                return None
+            tokens_view = tokens.unfold(1, len_image_ids, 1)
+            matches = (tokens_view == image_ids_tensor).all(dim=2)
+            return {
+                batch_idx: [(idx.item(), idx.item() + len_image_ids - 1) for idx in
+                            matches[batch_idx].nonzero(as_tuple=True)[0]]
+                for batch_idx in range(tokens.size(0)) if matches[batch_idx].any()
+            } or None
+
+        image_indices = find_indices(tokens, self.params.image_ids)
+
+        if vision_tensors is not None and image_indices:
+            vision_proj = self.vision_proj(vision_tensors)
+            if len(vision_proj.shape) == 3:
+                vision_proj = vision_proj.unsqueeze(0)
+
+            new_h = []
+            for i in range(h.size(0)):
+                if i in image_indices:
+                    h_i = h[i]
+                    img_idx = 0
+                    for start_idx, end_idx in image_indices[i]:
+                        if vision_proj.dim() == 4:
+                            current_vision_embeds = vision_proj[0, i]
+                        else:
+                            current_vision_embeds = vision_proj[i]
+
+                        if img_idx < 1:  # only the first image span per sample is replaced
+                            h_i = torch.cat((h_i[:start_idx], current_vision_embeds, h_i[end_idx + 1:]), dim=0)[:seqlen]
+                            img_idx += 1
+                    new_h.append(h_i)
+                else:
+                    new_h.append(h[i])
+            return torch.stack(new_h, dim=0)
+        return h
+
+    def forward(self, input_ids=None, attention_mask=None, labels=None,
+                past_key_values=None, use_cache=False, logits_to_keep=0,
+                pixel_values=None, **args):
+        batch_size, seq_length = input_ids.shape
+        if hasattr(past_key_values, 'layers'):
+            past_key_values = None
+        past_key_values = past_key_values or [None] * len(self.model.layers)
+        start_pos = past_key_values[0][0].shape[1] if past_key_values[0] is not None else 0
+
+        hidden_states = self.model.dropout(self.model.embed_tokens(input_ids))
+
+        if pixel_values is not None and start_pos == 0:
+            if len(pixel_values.shape) == 5:
+                pixel_values = pixel_values[:, 0, :, :, :]
+            vision_tensors = VerMindVLM.get_image_embeddings(pixel_values, self.vision_encoder)
+            hidden_states = self.count_vision_proj(
+                tokens=input_ids,
+                h=hidden_states,
+                vision_tensors=vision_tensors,
+                seqlen=input_ids.shape[1]
+            )
+
+        position_embeddings = (
+            self.model.freqs_cos[start_pos:start_pos + seq_length],
+            self.model.freqs_sin[start_pos:start_pos + seq_length]
+        )
+
+        presents = []
+        for layer_idx, (layer, past_key_value) in enumerate(zip(self.model.layers, past_key_values)):
+            hidden_states, present = layer(
+                hidden_states,
+                position_embeddings,
+                past_key_value=past_key_value,
+                use_cache=use_cache,
+                attention_mask=attention_mask
+            )
+            presents.append(present)
+
+        hidden_states = self.model.norm(hidden_states)
+
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            loss = F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1), ignore_index=-100)
+
+        output = CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=presents, hidden_states=hidden_states)
+        return output
+
+
+class VerMindVLMModel(VerMindModel):
+    """VerMind-V Model (extends VerMindModel for VLM)"""
+    pass  # Inherits everything from VerMindModel
+
+
+# Register the model class
+AutoModelForCausalLM.register(VerMindVLM.config_class, VerMindVLM)
+
+__all__ = ["VerMindVLM", "VisionProj", "VerMindVLMModel"]
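
End to end: input_ids carrying the 196-id image span are embedded, count_vision_proj splices the projected SigLIP patch features over that span, and the decoder then runs as usual. A forward-pass sketch under the same importability assumptions as above (image path is a placeholder; tok as in the previous sketch):

    import torch
    from PIL import Image

    from vermind.modeling_vermind_v import VerMindVLM  # hypothetical package name

    model = VerMindVLM.from_pretrained("./vermind").eval()
    pixel_values = VerMindVLM.image2tensor(Image.open("example.jpg"), model.processor)  # (1, 3, 224, 224)

    image_span = torch.tensor([model.config.image_ids])  # (1, 196), all id 34
    text_ids = tok("\nDescribe the image.", return_tensors="pt").input_ids
    input_ids = torch.cat([image_span, text_ids], dim=1)

    with torch.no_grad():
        out = model(input_ids=input_ids, pixel_values=pixel_values)
    print(out.logits.shape)  # (1, 196 + text length, 6400)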
special_tokens_map.json
ADDED
@@ -0,0 +1,30 @@
+{
+  "bos_token": {
+    "content": "<|im_start|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
tokenizer_config.json
ADDED
@@ -0,0 +1,43 @@
+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<|im_start|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "unk_token": "<|endoftext|>"
+}
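
A quick consistency check across the tokenizer files (placeholder path): ids 0, 1, 2 are the only added special tokens and line up with the bos/eos ids recorded in generation_config.json:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("./vermind")
    print(tok.convert_ids_to_tokens([0, 1, 2]))  # ['<|endoftext|>', '<|im_start|>', '<|im_end|>']
    assert tok.bos_token_id == 1 and tok.eos_token_id == 2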