Text Generation
Transformers
PyTorch
English
taonet_mini_t2
taonet
taotern
ssm
state-space-model
dplr
custom_code
experimental
Instructions to use TaoTern/TaoNet-mini-T2 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use TaoTern/TaoNet-mini-T2 with Transformers:
# Use a pipeline as a high-level helper from transformers import pipeline pipe = pipeline("text-generation", model="TaoTern/TaoNet-mini-T2", trust_remote_code=True)# Load model directly from transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained("TaoTern/TaoNet-mini-T2", trust_remote_code=True, dtype="auto") - Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use TaoTern/TaoNet-mini-T2 with vLLM:
Install from pip and serve model
# Install vLLM from pip: pip install vllm # Start the vLLM server: vllm serve "TaoTern/TaoNet-mini-T2" # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:8000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "TaoTern/TaoNet-mini-T2", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'Use Docker
docker model run hf.co/TaoTern/TaoNet-mini-T2
- SGLang
How to use TaoTern/TaoNet-mini-T2 with SGLang:
Install from pip and serve model
# Install SGLang from pip: pip install sglang # Start the SGLang server: python3 -m sglang.launch_server \ --model-path "TaoTern/TaoNet-mini-T2" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "TaoTern/TaoNet-mini-T2", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'Use Docker images
docker run --gpus all \ --shm-size 32g \ -p 30000:30000 \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=<secret>" \ --ipc=host \ lmsysorg/sglang:latest \ python3 -m sglang.launch_server \ --model-path "TaoTern/TaoNet-mini-T2" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "TaoTern/TaoNet-mini-T2", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }' - Docker Model Runner
How to use TaoTern/TaoNet-mini-T2 with Docker Model Runner:
docker model run hf.co/TaoTern/TaoNet-mini-T2
| """Gamma SSM Block with residual connections and normalization.""" | |
| import torch | |
| import torch.nn as nn | |
| from typing import Optional, Tuple | |
| from .ssm_gamma import SSMGamma | |
| from .normalization import LayerNorm | |
| class GammaSingleBlock(nn.Module): | |
| """ | |
| Single Gamma SSM Block with residual connection and layer normalization. | |
| Performs: y = Block(LayerNorm(x)) + x (if prenorm=True) | |
| or y = LayerNorm(Block(x) + x) (if prenorm=False) | |
| Args: | |
| d_model: Model dimension | |
| hidden_dim: Hidden dimension for the SSM state | |
| delta_t: Time discretization step (default: 0.1) | |
| kernel_length: Convolution kernel length for future use (default: 4) | |
| A_type: Type of A matrix initialization (default: "tridiagonal") | |
| prenorm: Use prenorm (LayerNorm -> Block) vs postnorm (Block -> LayerNorm) (default: True) | |
| residual_scale: Scaling factor for residual connection (default: 1.0) | |
| dropout: Dropout rate after block (default: 0.0) | |
| Shape: | |
| - Input: (batch, seq_len, d_model) | |
| - Output: (batch, seq_len, d_model) | |
| """ | |
| def __init__( | |
| self, | |
| d_model: int, | |
| hidden_dim: int, | |
| delta_t: float = 0.1, | |
| kernel_length: int = 4, | |
| A_type: str = "tridiagonal", | |
| prenorm: bool = True, | |
| residual_scale: float = 1.0, | |
| dropout: float = 0.0, | |
| ): | |
| super().__init__() | |
| self.d_model = d_model | |
| self.prenorm = prenorm | |
| self.dropout_p = dropout | |
| self.residual_scale = residual_scale | |
| # Normalization | |
| self.norm = LayerNorm(d_model) | |
| # SSM block | |
| self.ssm = SSMGamma( | |
| state_dim=d_model, | |
| hidden_dim=hidden_dim, | |
| delta_t=delta_t, | |
| kernel_length=kernel_length, | |
| A_type=A_type, | |
| ) | |
| # Dropout | |
| if dropout > 0: | |
| self.dropout = nn.Dropout(dropout) | |
| else: | |
| self.dropout = None | |
| def forward( | |
| self, | |
| x: torch.Tensor, | |
| state: Optional[torch.Tensor] = None, | |
| mask: Optional[torch.Tensor] = None, | |
| return_state: bool = True, | |
| ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: | |
| """ | |
| Forward pass through block. | |
| Args: | |
| x: Input tensor (batch, seq_len, d_model) | |
| state: Optional initial hidden state (batch, hidden_dim) | |
| mask: Optional mask (batch, seq_len) for valid positions | |
| Returns: | |
| output: (batch, seq_len, d_model) | |
| final_state: Final hidden state from SSM (batch, hidden_dim) | |
| """ | |
| if self.prenorm: | |
| # Apply norm before SSM | |
| x_norm = self.norm(x) | |
| ssm_out, final_state = self.ssm(x_norm, mask=mask, state=state) | |
| else: | |
| # Apply SSM first, then norm | |
| ssm_out, final_state = self.ssm(x, mask=mask, state=state) | |
| ssm_out = self.norm(ssm_out) | |
| # Apply dropout if present | |
| if self.dropout is not None: | |
| ssm_out = self.dropout(ssm_out) | |
| # Residual connection with optional scaling | |
| output = x * self.residual_scale + ssm_out | |
| # Apply final norm if postnorm | |
| if not self.prenorm: | |
| output = self.norm(output) | |
| if not return_state: | |
| final_state = None | |
| return output, final_state | |
| def step(self, u: torch.Tensor, h: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: | |
| """ | |
| Single step inference through block (RNN style). | |
| Args: | |
| u: Input tensor (batch, d_model) - single timestep | |
| h: Hidden state (batch, hidden_dim) | |
| Returns: | |
| output: (batch, d_model) - block output | |
| h_new: (batch, hidden_dim) - new hidden state | |
| """ | |
| if self.prenorm: | |
| # Apply norm before SSM | |
| u_norm = self.norm(u) | |
| ssm_out, h_new = self.ssm.step(u_norm, h) | |
| else: | |
| # Apply SSM first, then norm | |
| ssm_out, h_new = self.ssm.step(u, h) | |
| ssm_out = self.norm(ssm_out) | |
| # Apply dropout if present | |
| if self.dropout is not None: | |
| ssm_out = self.dropout(ssm_out) | |
| # Residual connection with optional scaling | |
| output = u * self.residual_scale + ssm_out | |
| return output, h_new | |
| def allocate_inference_cache( | |
| self, | |
| batch_size: int, | |
| seq_len: int, | |
| device: torch.device, | |
| dtype: torch.dtype, | |
| ): | |
| """Allocate cache for efficient inference.""" | |
| return self.ssm.allocate_inference_cache(batch_size, seq_len, device, dtype) | |
| def allocate_deployment_cache( | |
| self, | |
| batch_size: int, | |
| seq_len: int, | |
| device: torch.device, | |
| dtype: torch.dtype, | |
| ): | |
| return self.allocate_inference_cache(batch_size, seq_len, device, dtype) | |
| def allocate_balanced_deployment_cache( | |
| self, | |
| batch_size: int, | |
| seq_len: int, | |
| device: torch.device, | |
| dtype: torch.dtype, | |
| ): | |
| return self.allocate_inference_cache(batch_size, seq_len, device, dtype) | |