Instructions to use MK0727/lambda-160m with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use MK0727/lambda-160m with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="MK0727/lambda-160m", trust_remote_code=True)
```
```python
# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("MK0727/lambda-160m", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use MK0727/lambda-160m with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "MK0727/lambda-160m"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "MK0727/lambda-160m",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker
docker model run hf.co/MK0727/lambda-160m
- SGLang
How to use MK0727/lambda-160m with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "MK0727/lambda-160m" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "MK0727/lambda-160m",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "MK0727/lambda-160m" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "MK0727/lambda-160m",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
- Docker Model Runner
How to use MK0727/lambda-160m with Docker Model Runner:
docker model run hf.co/MK0727/lambda-160m
| import torch | |
| import torch.nn as nn | |
| from transformers import PreTrainedModel | |
| from transformers.generation import GenerationMixin | |
| from transformers.modeling_outputs import CausalLMOutputWithPast | |
| from .configuration_myllm import MyLLMConfig | |
| from .kv_cache import KeyValueCache | |
| from .position_encoding import PositionEncoding | |
| from .self_attention import Attention | |
| from .transformer import DecoderOnlyTransformer | |
# ---------------------------------------------------------
# Hold direct references to the nested remote-code modules so
# that local AutoModel loading copies every file the relative
# imports need.
# ---------------------------------------------------------
REMOTE_CODE_DEPENDENCIES = (
    Attention,
    PositionEncoding,
)
class MyLLMForCausalLM(PreTrainedModel, GenerationMixin):
    """Transformers-compatible causal-LM wrapper around ``DecoderOnlyTransformer``.

    The wrapper builds the underlying PyTorch transformer from a
    ``MyLLMConfig`` and adapts it to the argument names and output types
    expected by the ``AutoModelForCausalLM`` and ``generate`` APIs.
    """

    config_class = MyLLMConfig
    main_input_name = "input_ids"
    # Declares the LM head weight as tied to the token embedding weight.
    _tied_weights_keys = {"transformer.fc_layer.weight": "transformer.we.weight"}

    def __init__(self, config: MyLLMConfig) -> None:
        """Build the wrapped transformer from ``config`` and run HF post-init."""
        super().__init__(config)

        # ---------------------------------------------------------
        # Reuse the existing PyTorch Transformer implementation and
        # keep the HF wrapper responsible only for AutoModel APIs.
        # ---------------------------------------------------------
        self.transformer = DecoderOnlyTransformer(
            num_tokens=config.vocab_size,
            d_model=config.d_model,
            max_len=config.max_len,
            num_layers=config.num_layers,
            num_heads=config.num_heads,
            d_ff=config.d_ff,
            learning_rate=config.learning_rate,
            pad_token_id=config.pad_token_id,
        )
        self.post_init()

    def get_input_embeddings(self) -> nn.Embedding:
        """Return the token embedding module (``transformer.we``)."""
        return self.transformer.we

    def set_input_embeddings(self, value: nn.Embedding) -> None:
        """Replace the token embedding module and re-tie the LM head to it."""
        self.transformer.we = value
        # Point the output projection at the new embedding weight so the
        # two stay tied after the replacement.
        self.transformer.fc_layer.weight = value.weight

    def get_output_embeddings(self) -> nn.Linear:
        """Return the LM head module (``transformer.fc_layer``)."""
        return self.transformer.fc_layer

    def set_output_embeddings(self, value: nn.Linear) -> None:
        """Replace the LM head module (``transformer.fc_layer``)."""
        self.transformer.fc_layer = value

    def _supports_default_dynamic_cache(self) -> bool:
        """Report that Transformers' default ``DynamicCache`` must not be used.

        Always returns ``False`` so that generation keeps passing the
        cache object produced by ``forward`` instead of allocating its own.
        """
        return False

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.Tensor,
        past_key_values: KeyValueCache | None = None,
        **kwargs: object,
    ) -> dict[str, torch.Tensor | KeyValueCache | bool | None]:
        """Build the ``forward`` keyword arguments for one generation step.

        Args:
            input_ids: All token ids generated so far.
            past_key_values: Cache returned by the previous step, or
                ``None`` on the first step.
            **kwargs: Accepted for API compatibility and discarded.

        Returns:
            A dict with ``input_ids`` (only the last position once a cache
            exists, otherwise the full sequence), the unchanged
            ``past_key_values``, and ``use_cache=True``.
        """
        del kwargs
        # ---------------------------------------------------------
        # Feed only the newest token after the cache is populated so
        # generate can reuse the existing incremental forward path.
        # ---------------------------------------------------------
        model_input_ids = input_ids[:, -1:] if past_key_values is not None else input_ids
        return {
            "input_ids": model_input_ids,
            "past_key_values": past_key_values,
            "use_cache": True,
        }

    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        past_key_values: KeyValueCache | None = None,
        use_cache: bool | None = None,
        return_dict: bool | None = None,
        **kwargs: object,
    ) -> CausalLMOutputWithPast | tuple[torch.Tensor, ...]:
        """Run the wrapped transformer and optionally compute the LM loss.

        Args:
            input_ids: Token ids to score. Required.
            attention_mask: Accepted for API compatibility and discarded.
            labels: Optional target ids aligned with ``input_ids``. They are
                shifted by one position inside this method. Positions equal
                to ``-100`` or to ``config.pad_token_id`` do not contribute
                to the loss.
            past_key_values: Cache from a previous call, if any.
            use_cache: When truthy, or when ``past_key_values`` is given,
                the cached forward path is used and a cache is returned.
            return_dict: Pass ``False`` to receive a tuple instead of a
                ``CausalLMOutputWithPast``.
            **kwargs: Accepted for API compatibility and discarded.

        Returns:
            ``CausalLMOutputWithPast`` with ``loss``, ``logits`` and
            ``past_key_values``; or, when ``return_dict is False``, the
            tuple ``(logits,)`` / ``(loss, logits)`` (the cache is not part
            of the tuple form).

        Raises:
            ValueError: If ``input_ids`` is ``None``.
        """
        del attention_mask, kwargs
        if input_ids is None:
            raise ValueError("input_ids is required")

        should_use_cache = bool(use_cache)
        if past_key_values is not None or should_use_cache:
            logits, next_key_values = self.transformer.forward_with_cache(
                token_ids=input_ids,
                past_key_values=past_key_values,
            )
        else:
            logits = self.transformer(token_ids=input_ids)
            next_key_values = None

        # ---------------------------------------------------------
        # Follow causal LM convention for labels supplied by HF
        # Trainer and examples: predict token n+1 from position n.
        # ---------------------------------------------------------
        loss = None
        if labels is not None:
            shift_logits = logits[:, :-1, :].contiguous()
            shift_labels = labels[:, 1:].contiguous()

            # Fold pad-token targets into -100 and ignore -100, so both
            # pad-masked and -100-masked label tensors are handled. Using
            # pad_token_id directly as ignore_index fails on -100 labels
            # (out-of-range target) and on a config without a pad token.
            # masked_fill is out-of-place, so the caller's labels tensor
            # is never modified.
            pad_token_id = self.config.pad_token_id
            if pad_token_id is not None:
                shift_labels = shift_labels.masked_fill(
                    shift_labels == pad_token_id, -100
                )

            loss = nn.functional.cross_entropy(
                # Flatten by the real logits width rather than the config
                # value so a mismatch cannot silently mis-reshape.
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
                ignore_index=-100,
            )

        # ---------------------------------------------------------
        # Return either the standard modeling output or a tuple for
        # callers that explicitly disable dictionary-style outputs.
        # ---------------------------------------------------------
        if return_dict is False:
            output = (logits,)
            return (loss, *output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=next_key_values,
        )