imbue2025 committed on
Commit
d3d5ea7
·
verified ·
1 Parent(s): a12b533

Create model_arch.py

Browse files
Files changed (1) hide show
  1. model_arch.py +303 -0
model_arch.py ADDED
@@ -0,0 +1,303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from dataclasses import dataclass
5
+ from typing import Optional, Tuple, List, Dict
6
+ import math
7
+
8
+
9
+ @dataclass
10
+ class EmbeddingConfig:
11
+ """Configuration for embedding models"""
12
+ vocab_size: int
13
+ hidden_size: int = 384
14
+ n_layer: int = 6
15
+ n_head: int = 6
16
+ n_kv_head: int = 2
17
+ intermediate_size: int = 1024
18
+ max_seq_len: int = 512
19
+ dropout: float = 0.1
20
+ rms_norm_eps: float = 1e-6
21
+ use_cache: bool = False
22
+ # Embedding-specific
23
+ embedding_dim: int = 384 # Output embedding dimension
24
+ pooling_method: str = "mean" # "mean", "cls", "attention"
25
+ normalize_embeddings: bool = True
26
+ use_temperature_scaling: bool = True
27
+ temperature: float = 0.05
28
+
29
+
30
+ class RoPE(nn.Module):
31
+ """Rotary Position Embedding"""
32
+ def __init__(self, dim: int, base: int = 10000):
33
+ super().__init__()
34
+ self.dim = dim
35
+ self.base = base
36
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float() / self.dim))
37
+ self.register_buffer("inv_freq", inv_freq)
38
+
39
+ def _rotate_half(self, vector):
40
+ vector1 = vector[..., :vector.shape[-1] // 2]
41
+ vector2 = vector[..., vector.shape[-1] // 2:]
42
+ return torch.cat((-vector2, vector1), dim=-1)
43
+
44
+ def forward(self, q, k, position_ids):
45
+ inv_freq = self.inv_freq.to(dtype=q.dtype)
46
+ freqs = (position_ids.unsqueeze(-1) * inv_freq.unsqueeze(0)).to(q.device)
47
+ emb = torch.cat([freqs, freqs], dim=-1)
48
+ cos = emb.cos().unsqueeze(1).to(q.dtype)
49
+ sin = emb.sin().unsqueeze(1).to(q.dtype)
50
+ q_rot = (q * cos) + (self._rotate_half(q) * sin)
51
+ k_rot = (k * cos) + (self._rotate_half(k) * sin)
52
+ return q_rot, k_rot
53
+
54
+
55
+ class RMSNorm(nn.Module):
56
+ """Root Mean Square Layer Normalization"""
57
+ def __init__(self, dim: int, eps: float = 1e-6):
58
+ super().__init__()
59
+ self.eps = eps
60
+ self.weight = nn.Parameter(torch.ones(dim))
61
+
62
+ def forward(self, x):
63
+ var = torch.mean(x ** 2, dim=-1, keepdim=True)
64
+ x_normed = x * torch.rsqrt(var + self.eps)
65
+ return self.weight * x_normed
66
+
67
+
68
+ class SwiGLU(nn.Module):
69
+ """Gated Linear Unit with SiLU activation"""
70
+ def __init__(self, dim, hidden_dim):
71
+ super().__init__()
72
+ self.gate_up_proj = nn.Linear(dim, 2 * hidden_dim, bias=False)
73
+ self.down_proj = nn.Linear(hidden_dim, dim, bias=False)
74
+
75
+ def forward(self, x):
76
+ fused_output = self.gate_up_proj(x)
77
+ gate_output, up_output = fused_output.chunk(2, dim=-1)
78
+ return self.down_proj(F.silu(gate_output) * up_output)
79
+
80
+
81
+ class GroupedQueryAttention(nn.Module):
82
+ """Grouped Query Attention mechanism"""
83
+ def __init__(self, config: EmbeddingConfig):
84
+ super().__init__()
85
+ self.num_heads = config.n_head
86
+ self.num_kv_heads = config.n_kv_head
87
+ # Validate head configuration
88
+ if self.num_heads <= 0:
89
+ raise ValueError(f"n_head must be > 0, got {self.num_heads}")
90
+ if self.num_kv_heads <= 0:
91
+ raise ValueError(f"n_kv_head must be > 0, got {self.num_kv_heads}")
92
+ if self.num_heads % self.num_kv_heads != 0:
93
+ raise ValueError(
94
+ f"n_head ({self.num_heads}) must be divisible by n_kv_head ({self.num_kv_heads})"
95
+ )
96
+ if config.hidden_size % self.num_heads != 0:
97
+ raise ValueError(
98
+ f"hidden_size ({config.hidden_size}) must be divisible by n_head ({self.num_heads}). "
99
+ "Choose hidden_size that is multiple of n_head or set n_head accordingly."
100
+ )
101
+
102
+ self.num_head_groups = self.num_heads // self.num_kv_heads
103
+ self.head_dim = config.hidden_size // self.num_heads
104
+ self.dropout = config.dropout
105
+
106
+ self.q_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
107
+ self.k_proj = nn.Linear(config.hidden_size, self.num_kv_heads * self.head_dim, bias=False)
108
+ self.v_proj = nn.Linear(config.hidden_size, self.num_kv_heads * self.head_dim, bias=False)
109
+ self.o_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
110
+
111
+ self.rotary_emb = RoPE(self.head_dim)
112
+ self.q_norm = RMSNorm(self.head_dim)
113
+ self.k_norm = RMSNorm(self.head_dim)
114
+
115
+ def forward(self, x, position_ids, attention_mask=None):
116
+ B, L, D = x.shape
117
+ q = self.q_proj(x).reshape(B, L, self.num_heads, self.head_dim)
118
+ k = self.k_proj(x).reshape(B, L, self.num_kv_heads, self.head_dim)
119
+ v = self.v_proj(x).reshape(B, L, self.num_kv_heads, self.head_dim)
120
+
121
+ q, k = self.q_norm(q), self.k_norm(k)
122
+ q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
123
+
124
+ q, k = self.rotary_emb(q, k, position_ids)
125
+
126
+ # Expand KV heads for grouped attention
127
+ if self.num_head_groups > 1:
128
+ k = k.unsqueeze(2).expand(-1, -1, self.num_head_groups, -1, -1).reshape(B, self.num_heads, -1, self.head_dim)
129
+ v = v.unsqueeze(2).expand(-1, -1, self.num_head_groups, -1, -1).reshape(B, self.num_heads, -1, self.head_dim)
130
+
131
+ # Compute attention scores
132
+ scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
133
+
134
+ if attention_mask is not None:
135
+ scores = scores.masked_fill(attention_mask[:, None, None, :] == 0, float('-inf'))
136
+
137
+ attn_weights = F.softmax(scores, dim=-1)
138
+ attn_weights = F.dropout(attn_weights, p=self.dropout, training=self.training)
139
+
140
+ out = torch.matmul(attn_weights, v)
141
+ out = out.transpose(1, 2).contiguous().reshape(B, L, D)
142
+ out = self.o_proj(out)
143
+
144
+ return out
145
+
146
+
147
+ class EmbeddingTransformerLayer(nn.Module):
148
+ """Single Transformer layer for embedding model"""
149
+ def __init__(self, config: EmbeddingConfig):
150
+ super().__init__()
151
+ self.attention = GroupedQueryAttention(config)
152
+ self.mlp = SwiGLU(config.hidden_size, config.intermediate_size)
153
+ self.norm1 = RMSNorm(config.hidden_size, config.rms_norm_eps)
154
+ self.norm2 = RMSNorm(config.hidden_size, config.rms_norm_eps)
155
+ self.dropout = nn.Dropout(config.dropout)
156
+
157
+ def forward(self, x, position_ids, attention_mask=None):
158
+ # Pre-norm architecture
159
+ normed_x = self.norm1(x)
160
+ attn_out = self.attention(normed_x, position_ids, attention_mask)
161
+ x = x + self.dropout(attn_out)
162
+
163
+ normed_x = self.norm2(x)
164
+ mlp_out = self.mlp(normed_x)
165
+ x = x + self.dropout(mlp_out)
166
+
167
+ return x
168
+
169
+
170
+ class EmbeddingEncoder(nn.Module):
171
+ """Transformer-based encoder for generating embeddings"""
172
+ def __init__(self, config: EmbeddingConfig):
173
+ super().__init__()
174
+ self.config = config
175
+ self.token_embedding = nn.Embedding(config.vocab_size, config.hidden_size)
176
+ self.position_ids = torch.arange(config.max_seq_len, dtype=torch.long).unsqueeze(0)
177
+
178
+ self.layers = nn.ModuleList([
179
+ EmbeddingTransformerLayer(config) for _ in range(config.n_layer)
180
+ ])
181
+
182
+ self.norm = RMSNorm(config.hidden_size, config.rms_norm_eps)
183
+ self.embedding_proj = None
184
+ if config.embedding_dim != config.hidden_size:
185
+ self.embedding_proj = nn.Linear(config.hidden_size, config.embedding_dim, bias=False)
186
+
187
+ def forward(self, input_ids, attention_mask=None):
188
+ """
189
+ Args:
190
+ input_ids: (batch_size, seq_len)
191
+ attention_mask: (batch_size, seq_len) - 1 for valid tokens, 0 for padding
192
+
193
+ Returns:
194
+ embeddings: (batch_size, embedding_dim)
195
+ hidden_states: (batch_size, seq_len, hidden_size)
196
+ """
197
+ B, L = input_ids.shape
198
+ device = input_ids.device
199
+
200
+ # Token embedding
201
+ # Sanity check: ensure token ids are within embedding range to avoid CUDA OOB
202
+ if input_ids.numel() > 0:
203
+ max_id = int(input_ids.max().item())
204
+ min_id = int(input_ids.min().item())
205
+ vocab_size = self.token_embedding.num_embeddings
206
+ if min_id < 0 or max_id >= vocab_size:
207
+ raise ValueError(
208
+ f"Input token id out of range: found ids in [{min_id}, {max_id}] but "
209
+ f"embedding vocab_size={vocab_size}. Ensure tokenizer and model vocab sizes match."
210
+ )
211
+ x = self.token_embedding(input_ids)
212
+
213
+ # Position IDs - ensure buffer is long enough
214
+ if L > self.position_ids.size(1):
215
+ # Extend position IDs if needed
216
+ new_position_ids = torch.arange(L, dtype=torch.long, device=device).unsqueeze(0)
217
+ position_ids = new_position_ids
218
+ else:
219
+ position_ids = self.position_ids[:, :L].to(device)
220
+
221
+ # Create attention mask if not provided
222
+ if attention_mask is None:
223
+ attention_mask = torch.ones_like(input_ids)
224
+
225
+ # Transformer layers
226
+ for layer in self.layers:
227
+ x = layer(x, position_ids, attention_mask)
228
+
229
+ # Final normalization
230
+ hidden_states = self.norm(x)
231
+
232
+ # Pooling
233
+ if self.config.pooling_method == "mean":
234
+ # Mean pooling with masking
235
+ mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_states.shape).float()
236
+ sum_embeddings = (hidden_states * mask_expanded).sum(1)
237
+ sum_mask = mask_expanded.sum(1).clamp(min=1e-9)
238
+ embeddings = sum_embeddings / sum_mask
239
+ elif self.config.pooling_method == "cls":
240
+ # Use CLS token (first token)
241
+ embeddings = hidden_states[:, 0, :]
242
+ elif self.config.pooling_method == "attention":
243
+ # Attention-weighted pooling
244
+ attn_weights = F.softmax(
245
+ torch.ones(1, L, device=device) * attention_mask.float().unsqueeze(0),
246
+ dim=-1
247
+ )
248
+ embeddings = torch.matmul(attn_weights, hidden_states).squeeze(1)
249
+ else:
250
+ raise ValueError(f"Unknown pooling method: {self.config.pooling_method}")
251
+
252
+ # Projection to embedding dimension
253
+ if self.embedding_proj is not None:
254
+ embeddings = self.embedding_proj(embeddings)
255
+
256
+ # Normalize embeddings
257
+ if self.config.normalize_embeddings:
258
+ embeddings = F.normalize(embeddings, p=2, dim=1)
259
+
260
+ return embeddings, hidden_states
261
+
262
+
263
+ class DualEmbeddingModel(nn.Module):
264
+ """Dual-encoder architecture for symmetric similarity learning"""
265
+ def __init__(self, config: EmbeddingConfig):
266
+ super().__init__()
267
+ self.config = config
268
+ self.encoder = EmbeddingEncoder(config)
269
+ if config.use_temperature_scaling:
270
+ self.logit_scale = nn.Parameter(torch.ones([]) * math.log(1 / config.temperature))
271
+ else:
272
+ self.logit_scale = None
273
+
274
+ def forward(self, input_ids_1, input_ids_2=None, attention_mask_1=None, attention_mask_2=None):
275
+ """
276
+ Args:
277
+ input_ids_1: (batch_size, seq_len)
278
+ input_ids_2: (batch_size, seq_len) - if None, returns only embeddings for input_ids_1
279
+ attention_mask_1: (batch_size, seq_len)
280
+ attention_mask_2: (batch_size, seq_len)
281
+
282
+ Returns:
283
+ embeddings_1: (batch_size, embedding_dim)
284
+ embeddings_2: (batch_size, embedding_dim) or None
285
+ """
286
+ embeddings_1, _ = self.encoder(input_ids_1, attention_mask_1)
287
+
288
+ if input_ids_2 is not None:
289
+ embeddings_2, _ = self.encoder(input_ids_2, attention_mask_2)
290
+ return embeddings_1, embeddings_2
291
+
292
+ return embeddings_1, None
293
+
294
+ def compute_similarity(self, embeddings_1, embeddings_2):
295
+ """Compute cosine similarity between embeddings"""
296
+ # embeddings should already be normalized if normalize_embeddings=True
297
+ similarity = torch.matmul(embeddings_1, embeddings_2.t())
298
+
299
+ if self.logit_scale is not None:
300
+ logit_scale = self.logit_scale.exp()
301
+ similarity = similarity * logit_scale
302
+
303
+ return similarity