Text Generation
Transformers
PyTorch
English
taonet_mini_t2
taonet
taotern
ssm
state-space-model
dplr
custom_code
experimental
Instructions to use TaoTern/TaoNet-mini-T2 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use TaoTern/TaoNet-mini-T2 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="TaoTern/TaoNet-mini-T2", trust_remote_code=True)
```
```python
# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("TaoTern/TaoNet-mini-T2", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use TaoTern/TaoNet-mini-T2 with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "TaoTern/TaoNet-mini-T2"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "TaoTern/TaoNet-mini-T2",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker
docker model run hf.co/TaoTern/TaoNet-mini-T2
- SGLang
How to use TaoTern/TaoNet-mini-T2 with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "TaoTern/TaoNet-mini-T2" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "TaoTern/TaoNet-mini-T2",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "TaoTern/TaoNet-mini-T2" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "TaoTern/TaoNet-mini-T2",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
- Docker Model Runner
How to use TaoTern/TaoNet-mini-T2 with Docker Model Runner:
docker model run hf.co/TaoTern/TaoNet-mini-T2
| """Benchmark SSMGamma and GammaSingleBlock performance. | |
| Compares CUDA-optimized vs PyTorch implementations. | |
| Measures throughput, latency, and memory usage. | |
| """ | |
| import torch | |
| import torch.nn as nn | |
| import time | |
| import argparse | |
| from pathlib import Path | |
| from typing import Dict, Tuple | |
| from gamma_space_model import SSMGamma, GammaSingleBlock, HAS_CUDA_OPS | |
def benchmark_forward_pass(
    model: nn.Module,
    batch_size: int,
    seq_len: int,
    d_model: int,
    num_iterations: int = 100,
    warmup_iterations: int = 20,
    device: str = "cpu",
    dtype: torch.dtype = torch.float32,
) -> Dict[str, float]:
    """
    Benchmark forward pass latency.

    The model is switched to eval mode and called under ``torch.no_grad()``
    on a random input of shape ``(batch_size, seq_len, d_model)``.

    Args:
        model: Model to benchmark
        batch_size: Batch size
        seq_len: Sequence length
        d_model: Model dimension (state_dim for SSMGamma)
        num_iterations: Number of timed iterations (must be >= 1)
        warmup_iterations: Number of untimed warmup iterations
        device: Device to run on; a string such as "cpu", "cuda" or
            "cuda:0", or a ``torch.device``
        dtype: Data type of the dummy input

    Returns:
        Dictionary with latency statistics (ms) and
        ``throughput_samples_per_sec`` (batch elements per second).

    Raises:
        ValueError: If ``num_iterations`` is smaller than 1.
    """
    if num_iterations < 1:
        raise ValueError(f"num_iterations must be >= 1, got {num_iterations}")

    # Normalise through torch.device so that "cuda:0" (or a torch.device
    # object) is still recognised as CUDA and gets synchronised; a plain
    # string comparison against "cuda" would silently skip the sync.
    target = torch.device(device)
    on_cuda = target.type == "cuda"

    def _sync() -> None:
        # CUDA kernels run asynchronously; wait for them so wall-clock
        # timestamps bracket the actual work.
        if on_cuda:
            torch.cuda.synchronize(target)

    model.eval()

    # Create dummy input
    x = torch.randn(batch_size, seq_len, d_model, dtype=dtype, device=target)

    print(f" Warming up for {warmup_iterations} iterations...")
    with torch.no_grad():
        for _ in range(warmup_iterations):
            _ = model(x)
    _sync()

    print(f" Benchmarking {num_iterations} iterations...")
    # Forward timing
    samples_ms = []
    with torch.no_grad():
        for _ in range(num_iterations):
            _sync()
            start = time.perf_counter()
            _ = model(x)
            _sync()
            end = time.perf_counter()
            samples_ms.append((end - start) * 1000)  # Convert to ms

    times = torch.tensor(samples_ms)
    mean_ms = times.mean().item()
    # The sample standard deviation is undefined (NaN) for a single
    # measurement; report zero spread instead.
    std_ms = times.std().item() if num_iterations > 1 else 0.0
    # Guard against a zero mean on timers too coarse to resolve the call.
    throughput = 1000.0 / mean_ms * batch_size if mean_ms > 0 else float("inf")

    return {
        "mean_latency_ms": mean_ms,
        "median_latency_ms": times.median().item(),
        "min_latency_ms": times.min().item(),
        "max_latency_ms": times.max().item(),
        "std_latency_ms": std_ms,
        "throughput_samples_per_sec": throughput,
    }
def benchmark_memory(
    model: nn.Module,
    batch_size: int,
    seq_len: int,
    d_model: int,
    device: str = "cuda",
) -> Dict[str, float]:
    """
    Benchmark GPU memory usage of a single forward pass.

    The model is switched to eval mode and run once under
    ``torch.no_grad()`` on a random default-dtype input of shape
    ``(batch_size, seq_len, d_model)``. Peak statistics are reset just
    before the input is allocated.

    Args:
        model: Model to benchmark
        batch_size: Batch size
        seq_len: Sequence length
        d_model: Model dimension
        device: Device to measure; a string such as "cuda" or "cuda:0",
            or a ``torch.device``. Any non-CUDA device yields all zeros.

    Returns:
        Dictionary with ``gpu_memory_allocated_mb``,
        ``gpu_memory_reserved_mb`` and ``gpu_memory_max_allocated_mb``
        (values in MiB). The same three keys are present on every path.
    """
    # Normalise so that "cuda:1" or a torch.device is treated as CUDA too.
    target = torch.device(device)
    if target.type != "cuda":
        # Return the full key set so callers can index any statistic
        # without special-casing CPU runs.
        return {
            "gpu_memory_allocated_mb": 0.0,
            "gpu_memory_reserved_mb": 0.0,
            "gpu_memory_max_allocated_mb": 0.0,
        }

    model.eval()

    # Clear cache
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats(target)

    # Create input on the requested device rather than a hard-coded "cuda"
    x = torch.randn(batch_size, seq_len, d_model, device=target)

    # Forward pass
    with torch.no_grad():
        _ = model(x)
    torch.cuda.synchronize(target)

    bytes_per_mb = 1024 ** 2
    return {
        "gpu_memory_allocated_mb": torch.cuda.memory_allocated(target) / bytes_per_mb,
        "gpu_memory_reserved_mb": torch.cuda.memory_reserved(target) / bytes_per_mb,
        "gpu_memory_max_allocated_mb": torch.cuda.max_memory_allocated(target) / bytes_per_mb,
    }
def run_benchmarks(args):
    """Run comprehensive benchmarks.

    For each built-in configuration, an ``SSMGamma`` and a
    ``GammaSingleBlock`` are constructed, timed with
    ``benchmark_forward_pass`` and, on CUDA, measured with
    ``benchmark_memory``.

    Args:
        args: Namespace-like object providing ``device`` ("cpu" or "cuda")
            and ``iterations``. ``warmup`` is used when present and
            defaults to 20 otherwise.

    Returns:
        Dict mapping each configuration name to
        ``{"ssm_gamma": <latency stats>, "gamma_block": <latency stats>}``.
    """
    print("\n" + "=" * 80)
    print("SSM GAMMA BENCHMARK SUITE")
    print("=" * 80)

    device = "cuda" if torch.cuda.is_available() and args.device == "cuda" else "cpu"
    if device != args.device:
        # Make the fallback visible instead of silently benchmarking the CPU.
        print(f"\nRequested device '{args.device}' is unavailable; falling back to {device}.")
    print(f"\nDevice: {device}")
    print(f"CUDA optimizations available: {HAS_CUDA_OPS}")

    # Honour the --warmup option; tolerate namespaces that lack it.
    warmup = getattr(args, "warmup", 20)

    def _bench(title, model, batch_size, seq_len, d_model):
        # Time one model, print its summary lines, and return the latency stats.
        print(f"\n {title} benchmark:")
        stats = benchmark_forward_pass(
            model, batch_size, seq_len, d_model,
            num_iterations=args.iterations,
            warmup_iterations=warmup,
            device=device,
        )
        print(f" Latency: {stats['mean_latency_ms']:.3f} ± {stats['std_latency_ms']:.3f} ms")
        # The helper reports batch elements per second; each element holds
        # seq_len tokens, so scale to make the "tokens/sec" label accurate.
        tokens_per_sec = stats['throughput_samples_per_sec'] * seq_len
        print(f" Throughput: {tokens_per_sec:.0f} tokens/sec")
        if device == "cuda":
            mem = benchmark_memory(model, batch_size, seq_len, d_model)
            print(f" GPU Memory: {mem['gpu_memory_max_allocated_mb']:.1f} MB")
        return stats

    # Test configurations: (batch_size, seq_len, d_model, display name)
    configs = [
        (4, 128, 64, "Small (seq=128)"),
        (8, 512, 128, "Medium (seq=512)"),
        (16, 2048, 256, "Large (seq=2048)"),
    ]

    results = {}
    for batch_size, seq_len, d_model, config_name in configs:
        print(f"\n{'-' * 80}")
        print(f"Configuration: {config_name}")
        print(f" Batch size: {batch_size}")
        print(f" Sequence length: {seq_len}")
        print(f" Model dimension: {d_model}")
        print(f" Total tokens: {batch_size * seq_len:,}")

        # SSMGamma benchmark
        ssm = SSMGamma(state_dim=d_model, hidden_dim=d_model * 2).to(device)
        ssm_results = _bench("SSMGamma", ssm, batch_size, seq_len, d_model)
        # Drop the model before building the next one so its parameters do
        # not count towards the next model's memory measurement.
        del ssm

        # GammaSingleBlock benchmark
        block = GammaSingleBlock(d_model=d_model, hidden_dim=d_model * 2).to(device)
        block_results = _bench("GammaSingleBlock", block, batch_size, seq_len, d_model)
        del block

        # Store results
        results[config_name] = {
            "ssm_gamma": ssm_results,
            "gamma_block": block_results,
        }

    print(f"\n{'=' * 80}")
    print("Benchmark complete!")
    print(f"{'=' * 80}\n")
    return results
def main():
    """Parse the command-line options and launch the benchmark suite."""
    # Prefer the GPU by default when one is visible to PyTorch.
    default_device = "cuda" if torch.cuda.is_available() else "cpu"

    cli = argparse.ArgumentParser(description="Benchmark SSM Gamma blocks")
    cli.add_argument(
        "--device",
        type=str,
        choices=["cpu", "cuda"],
        default=default_device,
        help="Device to benchmark on",
    )

    # The remaining options are plain integers that differ only in
    # flag name, default value and help text.
    integer_options = (
        ("--iterations", 100, "Number of benchmark iterations"),
        ("--warmup", 20, "Number of warmup iterations"),
    )
    for flag, default_value, description in integer_options:
        cli.add_argument(flag, type=int, default=default_value, help=description)

    run_benchmarks(cli.parse_args())


if __name__ == "__main__":
    main()