```python
import torch
import torch.nn as nn
from transformers import AutoTokenizer
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file
import json


# ----------------------
# MoR Model Components
# ----------------------
class ExpertChoiceRouter(nn.Module):
    """Expert Choice Routing: Experts select top-k tokens"""
    def __init__(self, dim, num_experts, k=2):
        super().__init__()
        self.num_experts = num_experts
        self.k = k
        self.gate = nn.Linear(dim, num_experts, bias=False)

    def forward(self, x):
        # x: (batch, seq_len, dim)
        scores = self.gate(x)  # (batch, seq_len, num_experts)
        expert_weights, expert_indices = torch.topk(scores, self.k, dim=-1)
        return expert_weights.softmax(dim=-1), expert_indices

class Quantizer4Bit(nn.Module):
    """4-bit quantization utilities (levels stored in an int8 tensor, range [-8, 7])"""
    def __init__(self):
        super().__init__()

    @staticmethod
    def quantize(tensor):
        """Quantize a float tensor to 4-bit integer levels plus a scale"""
        scale = tensor.abs().max() / 7.5
        scale = torch.clamp(scale, min=1e-8)
        quantized = torch.clamp(torch.round(tensor / scale), -8, 7)
        return quantized.to(torch.int8), scale

    @staticmethod
    def dequantize(quantized, scale):
        """Dequantize 4-bit integer levels back to float"""
        return quantized.float() * scale

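# Illustrative round trip (hypothetical usage, not part of the model code):
#   t = torch.randn(2, 3)
#   q, s = Quantizer4Bit.quantize(t)        # int8 levels in [-8, 7] plus a float scale
#   t_hat = Quantizer4Bit.dequantize(q, s)  # approximate reconstruction of t
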
class QuantizedRecursiveTransformerBlock(nn.Module):
    """Recursive Transformer Block with Quantization"""
    def __init__(self, dim, num_heads, ffn_expansion=4):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads

        # Attention projections
        self.q_proj = nn.Linear(dim, dim)
        self.k_proj = nn.Linear(dim, dim)
        self.v_proj = nn.Linear(dim, dim)
        self.attn_out = nn.Linear(dim, dim)

        # FFN layers
        self.ffn = nn.Sequential(
            nn.Linear(dim, ffn_expansion * dim),
            nn.GELU(),
            nn.Linear(ffn_expansion * dim, dim)
        )

        # Normalization
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)

    def forward(self, x):
        # x: (batch, seq_len, dim)
        residual = x
        x = self.norm1(x)

        # Projections
        q = self.q_proj(x)
        k = self.k_proj(x)
        v = self.v_proj(x)

        # Quantize K and V, then immediately dequantize for computation
        # (a simulated 4-bit round trip: keys/values lose precision here)
        k_quant, k_scale = Quantizer4Bit.quantize(k)
        v_quant, v_scale = Quantizer4Bit.quantize(v)
        k = Quantizer4Bit.dequantize(k_quant, k_scale)
        v = Quantizer4Bit.dequantize(v_quant, v_scale)

        # Multi-head attention (note: no causal mask is applied)
        B, T, _ = q.shape
        q = q.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
        k = k.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
        v = v.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)

        attn = (q @ k.transpose(-2, -1)) * (self.head_dim ** -0.5)
        attn = attn.softmax(dim=-1)
        attn_out = (attn @ v).transpose(1, 2).contiguous().view(B, T, self.dim)
        attn_out = self.attn_out(attn_out)

        # Residual connection
        x = residual + attn_out

        # Pre-norm FFN with residual
        x = x + self.ffn(self.norm2(x))
        return x

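# Shape check (hypothetical numbers, for orientation only): the block is
# shape-preserving, e.g.:
#   block = QuantizedRecursiveTransformerBlock(dim=64, num_heads=4)
#   block(torch.randn(2, 16, 64)).shape  # -> torch.Size([2, 16, 64])
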
class RecursionDepthRouter(nn.Module):
    """Lightweight Router for Dynamic Recursion Depth"""
    def __init__(self, dim, max_depth=4):
        super().__init__()
        self.max_depth = max_depth
        self.router = nn.Sequential(
            nn.Linear(dim, 32),
            nn.ReLU(),
            nn.Linear(32, max_depth)
        )

    def forward(self, x):
        # x: (batch, seq_len, dim), mean-pooled over seq_len
        router_logits = self.router(x.mean(dim=1))  # (batch, max_depth)
        return router_logits.softmax(dim=-1)

class QuantizedMoRModel(nn.Module):
    """Main MoR Architecture"""
    def __init__(self, vocab_size, dim, num_layers, num_heads, max_recursion,
                 num_experts, max_position_embeddings):
        super().__init__()
        self.dim = dim
        self.max_recursion = max_recursion
        self.num_experts = num_experts
        self.max_position_embeddings = max_position_embeddings

        # Embedding layers
        self.embedding = nn.Embedding(vocab_size, dim)
        self.pos_embed = nn.Embedding(max_position_embeddings, dim)

        # Initial unique layers
        self.init_layers = nn.ModuleList([
            QuantizedRecursiveTransformerBlock(dim, num_heads)
            for _ in range(2)
        ])

        # Middle-cycle shared layers
        self.cycle_depth = 3
        self.recursive_blocks = nn.ModuleList([
            QuantizedRecursiveTransformerBlock(dim, num_heads)
            for _ in range(self.cycle_depth)
        ])

        # Recursion routers
        self.recursion_routers = nn.ModuleList([
            RecursionDepthRouter(dim, max_depth=max_recursion)
            for _ in range(num_layers - 4)
        ])

        # Expert choice routing
        self.expert_routers = nn.ModuleList([
            ExpertChoiceRouter(dim, num_experts)
            for _ in range(max_recursion)
        ])

        # Final unique layers
        self.final_layers = nn.ModuleList([
            QuantizedRecursiveTransformerBlock(dim, num_heads)
            for _ in range(2)
        ])

        # Output head
        self.ln_f = nn.LayerNorm(dim)
        self.head = nn.Linear(dim, vocab_size, bias=False)

    def forward(self, x):
        # Token + positional embeddings
        pos = torch.arange(0, x.shape[1], device=x.device)
        x = self.embedding(x) + self.pos_embed(pos)

        # Initial unique layers
        for layer in self.init_layers:
            x = layer(x)

        # Middle-cycle with recursion
        all_x = [x]
        batch_size, seq_len, _ = x.shape

        for router in self.recursion_routers:
            # Get recursion depth probabilities
            depth_probs = router(x)

            # Sample a recursion depth (note: the sampled depth is not used
            # below; the inner loop always runs for max_recursion steps)
            depth = torch.multinomial(depth_probs, 1).squeeze()

            # Process through recursive blocks
            for d in range(self.max_recursion):
                # Expert routing
                expert_weights, expert_indices = self.expert_routers[d](x)

                # Scatter top-k weights into a dense (batch, seq, num_experts) matrix
                full_weights = torch.zeros((batch_size, seq_len, self.num_experts),
                                           device=x.device)
                full_weights.scatter_(2, expert_indices, expert_weights)

                # Process each expert
                expert_outputs = []
                for expert_idx in range(self.num_experts):
                    # Tokens routed to this expert
                    expert_mask = full_weights[:, :, expert_idx] > 0

                    if expert_mask.any():
                        # Zero out tokens not routed to this expert
                        expert_x = torch.zeros_like(x)
                        expert_x[expert_mask] = x[expert_mask]

                        # Process through the shared block for this depth
                        out = self.recursive_blocks[d % self.cycle_depth](expert_x)
                        expert_outputs.append(out * full_weights[:, :, expert_idx].unsqueeze(-1))
                    else:
                        expert_outputs.append(torch.zeros_like(x))

                # Combine expert outputs
                x = sum(expert_outputs)

            all_x.append(x)

        # Average the states collected after each router pass
        x = torch.stack(all_x).mean(dim=0)

        # Final unique layers
        for layer in self.final_layers:
            x = layer(x)

        # Output
        x = self.ln_f(x)
        logits = self.head(x)
        return logits

    def generate(self, input_ids, max_length=100, temperature=0.8, top_k=50,
                 eos_token_id=None):
        """Simple text generation function"""
        generated = input_ids.clone()

        with torch.no_grad():
            for _ in range(max_length):
                # Truncate context to the positional-embedding limit
                inputs = generated[:, -self.max_position_embeddings:] \
                    if generated.shape[1] > self.max_position_embeddings \
                    else generated

                # Forward pass; keep only the last position's logits
                logits = self(inputs)[:, -1, :]

                # Apply temperature
                logits = logits / temperature

                # Top-k filtering
                if top_k > 0:
                    top_values, _ = torch.topk(logits, top_k)
                    min_value = top_values[:, -1]
                    logits[logits < min_value.unsqueeze(-1)] = -float('Inf')

                # Sample next token
                probs = torch.softmax(logits, dim=-1)
                next_token = torch.multinomial(probs, 1)

                # Append to sequence
                generated = torch.cat([generated, next_token], dim=-1)

                # Stop when the EOS token is sampled (if one was provided)
                if eos_token_id is not None and next_token.item() == eos_token_id:
                    break

        return generated

# ----------------------
# Load Model from Hugging Face Hub
# ----------------------
def load_model_from_hub(repo_id="liminerity/MoR-v1"):
    # 1. Download config
    config_path = hf_hub_download(repo_id=repo_id, filename="config.json")
    with open(config_path, "r") as f:
        config = json.load(f)

    print("Model Config:", config)

    # 2. Initialize model from the config
    model = QuantizedMoRModel(
        vocab_size=config["vocab_size"],
        dim=config["dim"],
        num_layers=config["num_layers"],
        num_heads=config["num_heads"],
        max_recursion=config["max_recursion"],
        num_experts=config["num_experts"],
        max_position_embeddings=config["max_position_embeddings"]
    )

    # 3. Download and load weights
    weights_path = hf_hub_download(repo_id=repo_id, filename="model.safetensors")
    weights = load_file(weights_path)
    model.load_state_dict(weights)

    # 4. Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(repo_id)

    return model, tokenizer

# ----------------------
# Run Inference
# ----------------------
def run_inference(model, tokenizer, prompt, max_length=100):
    # Encode prompt
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
    input_ids = inputs["input_ids"]

    # Move to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device).eval()
    input_ids = input_ids.to(device)

    # Generate text
    output_ids = model.generate(input_ids, max_length=max_length,
                                eos_token_id=tokenizer.eos_token_id)

    # Decode and return
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# ----------------------
# Main Execution
# ----------------------
if __name__ == "__main__":
    # Load model and tokenizer
    print("Loading model from Hugging Face Hub...")
    model, tokenizer = load_model_from_hub()

    # Run inference
    prompt = "The future of artificial intelligence"
    print(f"\nPrompt: {prompt}")

    generated_text = run_inference(model, tokenizer, prompt, max_length=100)
    print("\nGenerated Text:")
    print(generated_text)
```
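
To sanity-check the architecture without downloading the released weights, the sketch below builds a randomly initialized model with deliberately tiny, made-up hyperparameters (the real values live in the repo's `config.json` and will differ) and runs a single forward pass. It assumes the class definitions from the block above have already been executed:

```python
import torch

# Hypothetical toy configuration; NOT the released MoR-v1 hyperparameters.
model = QuantizedMoRModel(
    vocab_size=1000,
    dim=64,
    num_layers=8,               # must exceed 4: num_layers - 4 recursion routers are built
    num_heads=4,
    max_recursion=3,
    num_experts=4,
    max_position_embeddings=128,
)
model.eval()

input_ids = torch.randint(0, 1000, (1, 16))  # (batch=1, seq_len=16)
with torch.no_grad():
    logits = model(input_ids)
print(logits.shape)  # torch.Size([1, 16, 1000])
```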