Xuezha commited on
Commit
aacfb7d
·
verified ·
1 Parent(s): aff41a7

Create modeling.py

Browse files
Files changed (1) hide show
  1. modeling.py +178 -0
modeling.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from transformers import PreTrainedModel, PretrainedConfig
4
+ from transformers.modeling_outputs import CausalLMOutputWithPast
5
+
6
class MaskedSelfAttentionLayer(nn.Module):
    """Thin wrapper around ``nn.MultiheadAttention`` that returns only the
    attended values, discarding the attention-weight tensor.

    Inputs are expected in the (seq, batch, embed) layout — the default for
    ``nn.MultiheadAttention`` (``batch_first=False``).
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)

    def forward(self, q, k, v, attn_mask=None):
        """Run multi-head attention; ``attn_mask`` is forwarded unchanged.

        NOTE(review): float masks are *added* to the scores while bool masks
        block positions where True — callers must pass the right kind.
        """
        attended, _unused_weights = self.multihead_attn(q, k, v, attn_mask=attn_mask)
        return attended
14
+
15
class FcLayer(nn.Module):
    """A single fully-connected (affine) projection: ``x @ W.T + b``."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Project the last dimension from ``input_dim`` to ``output_dim``."""
        projected = self.fc(x)
        return projected
22
+
23
class SwishGLU(nn.Module):
    """Gated linear unit: ``sigmoid(fc1(x)) * fc2(x)``.

    NOTE(review): despite the name, this is a plain sigmoid-gated GLU rather
    than a Swish/SiLU gate (Swish would be ``z * sigmoid(z)``). Behavior is
    preserved as-is because saved weights depend on it.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, input_dim)  # gate branch
        self.fc2 = nn.Linear(input_dim, input_dim)  # value branch

    def forward(self, x):
        gate = torch.sigmoid(self.fc1(x))
        value = self.fc2(x)
        return gate * value
31
+
32
class SpecialLayerF(nn.Module):
    """Fuses two attention outputs by elementwise product, then multiplies a
    linear up-projection of the fused tensor with a gated projection of it.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.proj_up = nn.Linear(input_dim, input_dim)
        self.proj_gate = SwishGLU(input_dim)

    def forward(self, o2, o3):
        # Elementwise interaction of the two attention streams.
        fused = o2 * o3
        return self.proj_up(fused) * self.proj_gate(fused)
43
+
44
class RMSNorm(nn.Module):
    """Root-mean-square layer normalization (Zhang & Sennrich, 2019).

    Normalizes each feature vector by the RMS of its components along the
    last dimension, then applies a learned per-dimension gain.

    NOTE(review): the original implementation divided by the full L2 norm
    (``x.norm(2, dim=-1)``), which differs from true RMS normalization by a
    factor of sqrt(embed_dim) — output magnitude shrank as the embedding
    width grew. Fixed to divide by sqrt(mean(x**2) + eps).

    Args:
        embed_dim: size of the normalized (last) dimension.
        eps: numerical-stability constant added inside the square root.
    """

    def __init__(self, embed_dim, eps=1e-8):
        super(RMSNorm, self).__init__()
        self.embed_dim = embed_dim
        self.eps = eps
        # Learned per-dimension gain, initialized to identity.
        self.scale = nn.Parameter(torch.ones(embed_dim))

    def forward(self, x):
        # rms = sqrt(mean(x_i ** 2)) over the feature dimension.
        mean_sq = x.pow(2).mean(dim=-1, keepdim=True)
        x_normed = x * torch.rsqrt(mean_sq + self.eps)
        return self.scale * x_normed
55
+
56
class MLP(nn.Module):
    """Gated feed-forward block: ``down(act(gate(x)) * up(x))``.

    NOTE(review): ``act`` is itself a parameterized SwishGLU (two extra
    linear layers), so the gate path is doubly gated. Preserved as-is:
    replacing it with a plain activation would change both the state dict
    and the computation.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.up_proj = nn.Linear(input_dim, hidden_dim)
        self.gate_proj = nn.Linear(input_dim, hidden_dim)
        self.act = SwishGLU(hidden_dim)
        self.down_proj = nn.Linear(hidden_dim, input_dim)

    def forward(self, x):
        gated = self.act(self.gate_proj(x))
        scaled = gated * self.up_proj(x)
        return self.down_proj(scaled)
66
+
67
class RecombinationTransformerLayer(nn.Module):
    """One transformer block built from three chained self-attention passes.

    Data flow in ``forward`` (all with the same mask):
      1. o1 = attention over projections of the input x.
      2. o2 = attention with the *same queries* as pass 1 but keys/values
         derived from o1.
      3. o3 = attention with queries derived from o1 but the *same
         keys/values* as pass 1.
    o2 and o3 are then recombined by ``SpecialLayerF`` and gated by o1,
    followed by two residual + RMSNorm stages (post-norm) around the fusion
    and the MLP.
    """

    def __init__(self, embed_dim, num_heads):
        super(RecombinationTransformerLayer, self).__init__()
        # Kept so forward() can expand a per-batch mask to per-head shape.
        self.num_heads = num_heads

        # First self-attention pass: q/k/v all projected from the input.
        self.self_attention_1 = MaskedSelfAttentionLayer(embed_dim, num_heads)
        self.fc_q = FcLayer(embed_dim, embed_dim)
        self.fc_k = FcLayer(embed_dim, embed_dim)
        self.fc_v = FcLayer(embed_dim, embed_dim)

        # Second pass reuses q1; fc_kb/fc_vb project k/v from o1.
        # fc_qc is declared here but feeds the *third* pass below.
        self.self_attention_2 = MaskedSelfAttentionLayer(embed_dim, num_heads)
        self.fc_qc = FcLayer(embed_dim, embed_dim)
        self.fc_kb = FcLayer(embed_dim, embed_dim)
        self.fc_vb = FcLayer(embed_dim, embed_dim)

        # Third pass: queries from o1, keys/values reused from pass 1.
        self.self_attention_3 = MaskedSelfAttentionLayer(embed_dim, num_heads)

        # Recombines o2 and o3 (elementwise product + gated projections).
        self.special_layer_f = SpecialLayerF(embed_dim)

        # Feed-forward block with the conventional 4x hidden expansion.
        self.mlp = MLP(embed_dim, embed_dim * 4)
        self.rms_norm1 = RMSNorm(embed_dim)
        self.rms_norm2 = RMSNorm(embed_dim)

    def forward(self, x, attn_mask=None):
        """Apply the block to ``x``.

        Args:
            x: (batch, seq, embed) input — batch-first; it is transposed to
               the (seq, batch, embed) layout nn.MultiheadAttention expects.
            attn_mask: optional (batch, seq, seq) mask, expanded per head.
                NOTE(review): nn.MultiheadAttention *adds* float masks to
                the scores and only blocks positions for bool masks (True =
                blocked) — a 0/1 float mask would not mask anything; confirm
                callers pass a bool or -inf mask.

        Returns:
            (batch, seq, embed) tensor, same shape as the input.
        """
        batch_size, seq_length, _ = x.size()

        if attn_mask is not None:
            # Expand (batch, seq, seq) -> (batch * num_heads, seq, seq),
            # the 3-D shape nn.MultiheadAttention accepts.
            attn_mask = attn_mask.unsqueeze(1).repeat(1, self.num_heads, 1, 1).view(batch_size * self.num_heads, seq_length, seq_length)

        # Pass 1: standard self-attention over the input projections.
        # transpose(0, 1) converts batch-first -> seq-first and back.
        q1 = self.fc_q(x).transpose(0, 1)
        k1 = self.fc_k(x).transpose(0, 1)
        v1 = self.fc_v(x).transpose(0, 1)
        o1 = self.self_attention_1(q1, k1, v1, attn_mask=attn_mask).transpose(0, 1)

        # Pass 2: same queries as pass 1; keys/values projected from o1.
        q2 = q1
        k2 = self.fc_kb(o1).transpose(0, 1)
        v2 = self.fc_vb(o1).transpose(0, 1)
        o2 = self.self_attention_2(q2, k2, v2, attn_mask=attn_mask).transpose(0, 1)

        # Pass 3: queries projected from o1; keys/values reused from pass 1.
        q3 = self.fc_qc(o1).transpose(0, 1)
        k3 = k1
        v3 = v1
        o3 = self.self_attention_3(q3, k3, v3, attn_mask=attn_mask).transpose(0, 1)

        # Recombine o2 and o3, gated elementwise by o1.
        output_f = self.special_layer_f(o2, o3) * o1

        # Residual + post-norm around the attention/fusion stage.
        x = x + output_f
        x = self.rms_norm1(x)

        # Feed-forward stage.
        mlp_output = self.mlp(x)

        # Residual + post-norm around the MLP.
        x = x + mlp_output
        x = self.rms_norm2(x)

        return x
135
+
136
class RecombinationTransformerConfig(PretrainedConfig):
    """Hugging Face configuration for the RecombinationTransformer.

    Args:
        embed_dim: model width (token-embedding / hidden size).
        num_heads: attention heads per layer.
        num_layers: number of stacked RecombinationTransformerLayer blocks.
        vocab_size: token vocabulary size (default 151643 — presumably
            matching the intended tokenizer; confirm against it).
    """

    model_type = "RecombinationTransformer"

    def __init__(self, embed_dim=1024, num_heads=8, num_layers=6, vocab_size=151643, **kwargs):
        super().__init__(**kwargs)
        # Architecture hyper-parameters stored on the config object.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.vocab_size = vocab_size
144
+
145
class RecombinationTransformerForCausalLM(PreTrainedModel):
    """Causal language model: embedding -> N recombination layers ->
    final RMSNorm -> untied linear LM head.

    Fixes vs. the original:
      * The causal mask was a 0/1 lower-triangular *float* tensor.
        ``nn.MultiheadAttention`` adds float masks to the attention scores,
        so future positions were never actually blocked. It is now a
        boolean mask with True above the diagonal (True = may not attend).
      * The padding ``attention_mask`` argument was accepted but silently
        ignored; padded key positions are now folded into the same mask.
    """

    config_class = RecombinationTransformerConfig

    def __init__(self, config):
        super().__init__(config)
        self.embed_tokens = nn.Embedding(config.vocab_size, config.embed_dim)
        self.layers = nn.ModuleList([
            RecombinationTransformerLayer(config.embed_dim, config.num_heads)
            for _ in range(config.num_layers)
        ])
        self.final_rms_norm = RMSNorm(config.embed_dim)
        # Untied output projection (no weight sharing with embed_tokens).
        self.lm_head = nn.Linear(config.embed_dim, config.vocab_size, bias=False)

    def forward(self, input_ids, attention_mask=None, past_key_values=None):
        """Compute next-token logits for a batch of token ids.

        Args:
            input_ids: (batch, seq) token ids.
            attention_mask: optional (batch, seq) 0/1 padding mask
                (1 = real token, 0 = padding). A row of all-padding keys up
                to a query position would fully mask that row and produce
                NaNs — callers should left-strip fully-padded sequences.
            past_key_values: accepted for HF-API compatibility; no KV cache
                is implemented, so it is echoed back unchanged.

        Returns:
            CausalLMOutputWithPast with ``logits`` of shape
            (batch, seq, vocab_size).
        """
        batch_size, seq_length = input_ids.size()
        device = input_ids.device

        # Boolean causal mask: True strictly above the diagonal = blocked.
        causal_mask = torch.triu(
            torch.ones((seq_length, seq_length), dtype=torch.bool, device=device),
            diagonal=1,
        ).unsqueeze(0).expand(batch_size, -1, -1)

        if attention_mask is not None:
            # Also block attending *to* padded key positions.
            key_padding = (attention_mask == 0).unsqueeze(1).expand(-1, seq_length, -1)
            causal_mask = causal_mask | key_padding

        # Embedding lookup: (batch, seq) -> (batch, seq, embed_dim).
        x = self.embed_tokens(input_ids)

        # Stacked recombination layers share the same mask.
        for layer in self.layers:
            x = layer(x, attn_mask=causal_mask)

        # Final normalization before the LM head.
        x = self.final_rms_norm(x)

        logits = self.lm_head(x)

        return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values)