Instructions to use MK0727/lambda-160m with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use MK0727/lambda-160m with Transformers:
# Use a pipeline as a high-level helper from transformers import pipeline pipe = pipeline("text-generation", model="MK0727/lambda-160m", trust_remote_code=True)# Load model directly from transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained("MK0727/lambda-160m", trust_remote_code=True, dtype="auto") - Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use MK0727/lambda-160m with vLLM:
Install from pip and serve model
# Install vLLM from pip: pip install vllm # Start the vLLM server: vllm serve "MK0727/lambda-160m" # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:8000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "MK0727/lambda-160m", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'Use Docker
docker model run hf.co/MK0727/lambda-160m
- SGLang
How to use MK0727/lambda-160m with SGLang:
Install from pip and serve model
# Install SGLang from pip: pip install sglang # Start the SGLang server: python3 -m sglang.launch_server \ --model-path "MK0727/lambda-160m" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "MK0727/lambda-160m", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'Use Docker images
docker run --gpus all \ --shm-size 32g \ -p 30000:30000 \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=<secret>" \ --ipc=host \ lmsysorg/sglang:latest \ python3 -m sglang.launch_server \ --model-path "MK0727/lambda-160m" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "MK0727/lambda-160m", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }' - Docker Model Runner
How to use MK0727/lambda-160m with Docker Model Runner:
docker model run hf.co/MK0727/lambda-160m
File size: 5,504 Bytes
134df9b | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 | import torch
import torch.nn as nn
import torch.nn.functional as F
from src.pretraining.kv_cache import LayerKeyValueCache
class Attention(nn.Module):
def __init__(self, d_model: int = 2, num_heads: int = 1) -> None:
super().__init__()
# ---------------------------------------------------------
# Split the model dimension into multiple heads so the same
# attention module can be reused in a more general structure.
# ---------------------------------------------------------
if d_model % num_heads != 0:
raise ValueError("d_model must be divisible by num_heads")
self.d_model = d_model
self.num_heads = num_heads
self.head_dim = d_model // num_heads
# ---------------------------------------------------------
# Project inputs into query, key, and value spaces and merge
# the heads back into the model dimension after attention.
# ---------------------------------------------------------
self.W_q = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
self.W_k = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
self.W_v = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
self.W_o = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
def _split_heads(self, x: torch.Tensor) -> torch.Tensor:
# ---------------------------------------------------------
# Rearrange the last dimension into head count and head size
# so attention can be computed independently per head.
# ---------------------------------------------------------
batch_size, seq_len, _ = x.size()
reshaped = x.view(batch_size, seq_len, self.num_heads, self.head_dim)
return reshaped.transpose(1, 2)
def _merge_heads(self, x: torch.Tensor) -> torch.Tensor:
# ---------------------------------------------------------
# Restore the tensor to the original model dimension after
# per-head attention has been combined.
# ---------------------------------------------------------
batch_size, _, seq_len, _ = x.size()
transposed = x.transpose(1, 2).contiguous()
return transposed.view(batch_size, seq_len, self.d_model)
def forward(
self,
encoding_for_q: torch.Tensor,
encoding_for_k: torch.Tensor,
encoding_for_v: torch.Tensor,
is_causal: bool = False,
) -> torch.Tensor:
# ---------------------------------------------------------
# Create the projected queries, keys, and values for each
# attention head from the incoming hidden states.
# ---------------------------------------------------------
q = self._split_heads(self.W_q(encoding_for_q))
k = self._split_heads(self.W_k(encoding_for_k))
v = self._split_heads(self.W_v(encoding_for_v))
# ---------------------------------------------------------
# Use PyTorch's fused scaled dot-product attention so large
# score and softmax tensors do not need to be materialized.
# ---------------------------------------------------------
attention_scores = F.scaled_dot_product_attention(
q,
k,
v,
is_causal=is_causal,
)
# ---------------------------------------------------------
# Merge the attended heads and project the result back into
# the model dimension for the next layer.
# ---------------------------------------------------------
merged_scores = self._merge_heads(attention_scores)
return self.W_o(merged_scores)
def forward_with_cache(
self,
encoding_for_q: torch.Tensor,
encoding_for_k: torch.Tensor,
encoding_for_v: torch.Tensor,
past_key_value: LayerKeyValueCache | None,
is_causal: bool = False,
) -> tuple[torch.Tensor, LayerKeyValueCache]:
# ---------------------------------------------------------
# Project the current tokens and append previous keys and
# values so generation can avoid recomputing old states.
# ---------------------------------------------------------
q = self._split_heads(self.W_q(encoding_for_q))
current_k = self._split_heads(self.W_k(encoding_for_k))
current_v = self._split_heads(self.W_v(encoding_for_v))
k = current_k
v = current_v
if past_key_value is not None:
past_k, past_v = past_key_value
k = torch.cat((past_k, current_k), dim=2)
v = torch.cat((past_v, current_v), dim=2)
# ---------------------------------------------------------
# Attend the current query positions over cached and current
# keys with the fused scaled dot-product implementation.
# ---------------------------------------------------------
attention_scores = F.scaled_dot_product_attention(
q,
k,
v,
is_causal=is_causal,
)
# ---------------------------------------------------------
# Return both the attention result and the updated cache for
# this layer so the caller can feed the next token directly.
# ---------------------------------------------------------
merged_scores = self._merge_heads(attention_scores)
return self.W_o(merged_scores), (k, v)
|