Instructions to use MK0727/lambda-160m with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use MK0727/lambda-160m with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="MK0727/lambda-160m", trust_remote_code=True)
```
```python
# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("MK0727/lambda-160m", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use MK0727/lambda-160m with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "MK0727/lambda-160m"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "MK0727/lambda-160m",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker
docker model run hf.co/MK0727/lambda-160m
- SGLang
How to use MK0727/lambda-160m with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "MK0727/lambda-160m" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "MK0727/lambda-160m",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "MK0727/lambda-160m" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "MK0727/lambda-160m",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
- Docker Model Runner
How to use MK0727/lambda-160m with Docker Model Runner:
docker model run hf.co/MK0727/lambda-160m
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| from src.pretraining.kv_cache import LayerKeyValueCache | |
class Attention(nn.Module):
    """Multi-head scaled dot-product attention with an optional key/value cache.

    Queries, keys, and values are produced by bias-free linear projections,
    split into ``num_heads`` heads of size ``d_model // num_heads``, attended
    with ``F.scaled_dot_product_attention``, merged back, and passed through
    an output projection.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model: int = 2, num_heads: int = 1) -> None:
        super().__init__()

        # ---------------------------------------------------------
        # Split the model dimension into multiple heads so the same
        # attention module can be reused in a more general structure.
        # ---------------------------------------------------------
        if d_model % num_heads != 0:
            raise ValueError("d_model must be divisible by num_heads")

        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        # ---------------------------------------------------------
        # Project inputs into query, key, and value spaces and merge
        # the heads back into the model dimension after attention.
        # ---------------------------------------------------------
        self.W_q = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_k = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_v = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_o = nn.Linear(in_features=d_model, out_features=d_model, bias=False)

    def _split_heads(self, x: torch.Tensor) -> torch.Tensor:
        """Reshape ``(batch, seq, d_model)`` into ``(batch, heads, seq, head_dim)``."""
        batch_size, seq_len, _ = x.size()
        reshaped = x.view(batch_size, seq_len, self.num_heads, self.head_dim)
        return reshaped.transpose(1, 2)

    def _merge_heads(self, x: torch.Tensor) -> torch.Tensor:
        """Reshape ``(batch, heads, seq, head_dim)`` back into ``(batch, seq, d_model)``."""
        batch_size, _, seq_len, _ = x.size()
        # transpose() returns a strided view; view() needs contiguous memory.
        transposed = x.transpose(1, 2).contiguous()
        return transposed.view(batch_size, seq_len, self.d_model)

    def forward(
        self,
        encoding_for_q: torch.Tensor,
        encoding_for_k: torch.Tensor,
        encoding_for_v: torch.Tensor,
        is_causal: bool = False,
    ) -> torch.Tensor:
        """Attend ``encoding_for_q`` over ``encoding_for_k`` / ``encoding_for_v``.

        Args:
            encoding_for_q: Hidden states projected into queries,
                shaped ``(batch, q_len, d_model)``.
            encoding_for_k: Hidden states projected into keys,
                shaped ``(batch, k_len, d_model)``.
            encoding_for_v: Hidden states projected into values,
                shaped ``(batch, k_len, d_model)``.
            is_causal: Forwarded to ``F.scaled_dot_product_attention`` to
                request causal masking.

        Returns:
            The attended and output-projected tensor, shaped
            ``(batch, q_len, d_model)``.
        """
        # ---------------------------------------------------------
        # Create the projected queries, keys, and values for each
        # attention head from the incoming hidden states.
        # ---------------------------------------------------------
        q = self._split_heads(self.W_q(encoding_for_q))
        k = self._split_heads(self.W_k(encoding_for_k))
        v = self._split_heads(self.W_v(encoding_for_v))

        # ---------------------------------------------------------
        # Use PyTorch's fused scaled dot-product attention so large
        # score and softmax tensors do not need to be materialized.
        # ---------------------------------------------------------
        attended = F.scaled_dot_product_attention(q, k, v, is_causal=is_causal)

        # ---------------------------------------------------------
        # Merge the attended heads and project the result back into
        # the model dimension for the next layer.
        # ---------------------------------------------------------
        return self.W_o(self._merge_heads(attended))

    def forward_with_cache(
        self,
        encoding_for_q: torch.Tensor,
        encoding_for_k: torch.Tensor,
        encoding_for_v: torch.Tensor,
        past_key_value: "LayerKeyValueCache | None",
        is_causal: bool = False,
    ) -> "tuple[torch.Tensor, LayerKeyValueCache]":
        """Attend the current tokens over cached plus current keys and values.

        Args:
            encoding_for_q: Hidden states of the current tokens used for
                queries, shaped ``(batch, q_len, d_model)``.
            encoding_for_k: Hidden states of the current tokens used for keys.
            encoding_for_v: Hidden states of the current tokens used for values.
            past_key_value: ``(past_k, past_v)`` from earlier calls, each
                shaped ``(batch, heads, past_len, head_dim)``, or ``None``
                when there is no cache yet.
            is_causal: When true, each current query may only attend to the
                cached positions and to current positions up to its own.

        Returns:
            A pair ``(output, (k, v))`` where ``output`` is shaped
            ``(batch, q_len, d_model)`` and ``(k, v)`` are the keys and values
            including the current tokens, to be passed back in on the next call.
        """
        # ---------------------------------------------------------
        # Project the current tokens and append previous keys and
        # values so generation can avoid recomputing old states.
        # ---------------------------------------------------------
        q = self._split_heads(self.W_q(encoding_for_q))
        k = self._split_heads(self.W_k(encoding_for_k))
        v = self._split_heads(self.W_v(encoding_for_v))

        attn_mask = None
        if past_key_value is not None:
            past_k, past_v = past_key_value
            k = torch.cat((past_k, k), dim=2)
            v = torch.cat((past_v, v), dim=2)

            if is_causal:
                # With cached keys the query/key lengths differ, and
                # `is_causal=True` would then align the causal mask to the
                # upper-left corner: query i would see only keys 0..i and
                # miss most of the cache. Query i actually sits at absolute
                # position past_len + i, so build the lower-right aligned
                # mask explicitly (True = may attend).
                past_len = past_k.size(2)
                attn_mask = torch.ones(
                    q.size(2), k.size(2), dtype=torch.bool, device=q.device
                ).tril(diagonal=past_len)

        # ---------------------------------------------------------
        # Attend the current query positions over cached and current
        # keys with the fused scaled dot-product implementation.
        # ---------------------------------------------------------
        if attn_mask is not None:
            # attn_mask and is_causal are mutually exclusive in SDPA.
            attended = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
        else:
            attended = F.scaled_dot_product_attention(q, k, v, is_causal=is_causal)

        # ---------------------------------------------------------
        # Return both the attention result and the updated cache for
        # this layer so the caller can feed the next token directly.
        # ---------------------------------------------------------
        return self.W_o(self._merge_heads(attended)), (k, v)