Text Generation
Transformers
PyTorch
English
taonet_mini_t2
taonet
taotern
ssm
state-space-model
dplr
custom_code
experimental
Instructions to use TaoTern/TaoNet-mini-T2 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use TaoTern/TaoNet-mini-T2 with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline
pipe = pipeline("text-generation", model="TaoTern/TaoNet-mini-T2", trust_remote_code=True)

# Load model directly
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("TaoTern/TaoNet-mini-T2", trust_remote_code=True, dtype="auto")
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use TaoTern/TaoNet-mini-T2 with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "TaoTern/TaoNet-mini-T2"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{ "model": "TaoTern/TaoNet-mini-T2", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'
Use Docker
docker model run hf.co/TaoTern/TaoNet-mini-T2
- SGLang
How to use TaoTern/TaoNet-mini-T2 with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "TaoTern/TaoNet-mini-T2" \
  --host 0.0.0.0 \
  --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{ "model": "TaoTern/TaoNet-mini-T2", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "TaoTern/TaoNet-mini-T2" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{ "model": "TaoTern/TaoNet-mini-T2", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'
- Docker Model Runner
How to use TaoTern/TaoNet-mini-T2 with Docker Model Runner:
docker model run hf.co/TaoTern/TaoNet-mini-T2
| """Profile the DPLR convolutional frequency path. | |
| This is a small remote-friendly profiler for choosing TileLang/Triton kernel | |
| targets. It focuses on S4TernaryDPLRSSM rather than the older Gamma fallback | |
| because this is the SSM core used by the TaoNet comparison work. | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import sys | |
| import time | |
| from contextlib import nullcontext | |
| from pathlib import Path | |
| from typing import Any | |
| import torch | |
# Directory one level above the folder that contains this script.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Prepend it to sys.path (only once) -- presumably so the project-local
# `gamma_space_model` import below resolves when the script is run directly.
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))
| from gamma_space_model import S4TernaryDPLRSSM | |
# Accepted dtype spellings mapped to torch dtypes; every dtype is reachable
# through both a short and a long alias.
DTYPES = {
    alias: torch_dtype
    for aliases, torch_dtype in (
        (("fp32", "float32"), torch.float32),
        (("bf16", "bfloat16"), torch.bfloat16),
        (("fp16", "float16"), torch.float16),
    )
    for alias in aliases
}
def synchronize(device: torch.device) -> None:
    """Wait for pending work on ``device`` when it is a CUDA device.

    For any other device type this is a no-op.
    """
    if device.type != "cuda":
        return
    torch.cuda.synchronize(device)
| def memory_stats(device: torch.device) -> dict[str, float | None]: | |
| if device.type != "cuda": | |
| return {"peak_allocated_mb": None, "peak_reserved_mb": None} | |
| return { | |
| "peak_allocated_mb": torch.cuda.max_memory_allocated(device) / (1024**2), | |
| "peak_reserved_mb": torch.cuda.max_memory_reserved(device) / (1024**2), | |
| } | |
def run_timed(fn, *, device: torch.device, warmup: int, repeats: int) -> dict[str, float]:
    """Time repeated calls of ``fn`` and summarize the wall-clock latency.

    ``fn`` is first called ``warmup`` times untimed, then ``repeats`` times
    with the device synchronized before the clock starts and again before it
    stops.

    Args:
        fn: Zero-argument callable to measure.
        device: Device the work runs on; CUDA devices are synchronized around
            each timed call and have their peak-memory counters reset first.
        warmup: Number of untimed calls made before measuring.
        repeats: Number of timed calls; must be at least 1.

    Returns:
        A dict with ``"mean_ms"`` and ``"min_ms"`` over the timed calls.

    Raises:
        ValueError: If ``repeats`` is smaller than 1, since the mean and the
            minimum of zero samples are undefined.
    """
    if repeats < 1:
        raise ValueError(f"repeats must be at least 1, got {repeats}")

    on_cuda = device.type == "cuda"
    for _ in range(warmup):
        fn()
    synchronize(device)

    latencies: list[float] = []
    for _ in range(repeats):
        if on_cuda:
            # Reset per repeat so peak-memory counters read afterwards cover
            # only the most recent timed call.
            torch.cuda.reset_peak_memory_stats(device)
        synchronize(device)
        start = time.perf_counter()
        fn()
        synchronize(device)
        latencies.append(time.perf_counter() - start)

    return {
        "mean_ms": sum(latencies) / len(latencies) * 1000.0,
        "min_ms": min(latencies) * 1000.0,
    }
def profiler_table(
    prof: torch.profiler.profile,
    row_limit: int,
    sort_by: str = "cuda_time_total",
) -> list[dict[str, Any]]:
    """Convert the profiler's aggregated text table into JSON-friendly rows.

    Args:
        prof: Profiler whose ``key_averages().table(...)`` output is rendered.
        row_limit: Maximum number of rows requested from the table.
        sort_by: Column used to order the table. Defaults to
            ``"cuda_time_total"``; pass a CPU column such as
            ``"cpu_time_total"`` when no CUDA activity was recorded.

    Returns:
        One ``{"row": <text line>}`` dict per line of the rendered table, in
        the order the profiler printed them.
    """
    table_text = prof.key_averages().table(sort_by=sort_by, row_limit=row_limit)
    return [{"row": line} for line in table_text.splitlines()]
def main() -> int:
    """Parse CLI options, time the selected SSM path, and emit a JSON report.

    The script builds an ``S4TernaryDPLRSSM`` and a random input, times a
    forward-only pass and a forward+backward pass with ``run_timed``, and
    optionally records a ``torch.profiler`` table. The report is printed and,
    when ``--output`` is given, also written to that path.

    Returns:
        ``0`` on completion. Invalid options terminate through
        ``parser.error`` (argparse exits with status 2).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu")
    parser.add_argument("--dtype", choices=sorted(DTYPES), default="bf16")
    parser.add_argument("--batch-size", type=int, default=4)
    parser.add_argument("--seq-len", type=int, default=512)
    parser.add_argument("--d-model", type=int, default=64)
    parser.add_argument("--hidden-dim", type=int, default=256)
    parser.add_argument("--rank", type=int, default=1)
    parser.add_argument("--warmup", type=int, default=2)
    parser.add_argument("--repeats", type=int, default=5)
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--row-limit", type=int, default=20)
    parser.add_argument("--method", choices=["forward", "direct", "transfer"], default="forward")
    parser.add_argument("--output", type=Path, default=None)
    args = parser.parse_args()

    if args.repeats < 1:
        # The latency summary averages over the timed repeats; with none, the
        # run would only fail after the model and input were already built.
        parser.error("--repeats must be at least 1")

    device = torch.device(args.device)
    dtype = DTYPES[args.dtype]
    model = S4TernaryDPLRSSM(
        state_dim=args.d_model,
        hidden_dim=args.hidden_dim,
        rank=args.rank,
        kernel_mode="conv",
        kernel_threshold=1,
    ).to(device=device)
    model.train()
    # Input laid out as (batch, sequence, d_model).
    x = torch.randn(args.batch_size, args.seq_len, args.d_model, device=device, dtype=dtype)
    # Autocast is only used for reduced-precision dtypes on CUDA.
    autocast_enabled = device.type == "cuda" and dtype in {torch.float16, torch.bfloat16}

    def autocast_context():
        """Return an autocast context when enabled, otherwise a no-op one."""
        if not autocast_enabled:
            return nullcontext()
        return torch.autocast(device_type=device.type, dtype=dtype, enabled=True)

    def apply_model() -> torch.Tensor:
        """Run the path selected by ``--method`` on ``x``."""
        if args.method == "forward":
            y, _ = model(x, return_state=False)
            return y
        # Half-precision inputs are promoted to float32 for the FFT path.
        fft_dtype = torch.float32 if x.dtype in {torch.float16, torch.bfloat16} else x.dtype
        # Power of two strictly greater than 2 * seq_len - 1 (and at least 2).
        fft_len = 1 << max(1, (2 * args.seq_len - 1).bit_length())
        with torch.autocast(device_type=device.type, enabled=False):
            # (batch, seq, d_model) -> (batch, d_model, seq) before the FFT
            # over the last axis.
            u_channels = x.transpose(1, 2).to(dtype=fft_dtype)
            u_f = torch.fft.rfft(u_channels, n=fft_len)
            if args.method == "direct":
                y_f = model._apply_frequency_response(
                    u_f=u_f,
                    seq_len=args.seq_len,
                    fft_len=fft_len,
                    dtype=fft_dtype,
                    device=device,
                )
            else:
                transfer = model._compute_frequency_response(
                    seq_len=args.seq_len,
                    fft_len=fft_len,
                    dtype=fft_dtype,
                    device=device,
                    use_cache=False,
                )
                # NOTE(review): the subscripts imply transfer is indexed as
                # (frequency, out, in) -- confirm against the model.
                y_f = torch.einsum("foi,bif->bof", transfer, u_f)
            # Back to the time domain, keeping only the first seq_len samples.
            y = torch.fft.irfft(y_f, n=fft_len)[..., : args.seq_len]
        return y.transpose(1, 2).to(dtype=x.dtype)

    def forward_only() -> None:
        """Forward pass without gradients; ``.item()`` forces the result."""
        with torch.no_grad():
            with autocast_context():
                y = apply_model()
                y.sum().item()

    def forward_backward() -> None:
        """Forward pass plus backward through a mean-squared-output loss."""
        model.zero_grad(set_to_none=True)
        with autocast_context():
            y = apply_model()
            loss = y.square().mean()
        loss.backward()

    forward_stats = run_timed(
        forward_only,
        device=device,
        warmup=args.warmup,
        repeats=args.repeats,
    )
    forward_backward_stats = run_timed(
        forward_backward,
        device=device,
        warmup=args.warmup,
        repeats=args.repeats,
    )
    tokens = args.batch_size * args.seq_len
    report: dict[str, Any] = {
        "config": vars(args) | {"device": str(device), "dtype": str(dtype).replace("torch.", "")},
        "forward": {
            **forward_stats,
            # The floor on the denominator guards against a zero mean latency.
            "tokens_per_s": tokens / max(forward_stats["mean_ms"] / 1000.0, 1e-12),
        },
        "forward_backward": {
            **forward_backward_stats,
            "tokens_per_s": tokens / max(forward_backward_stats["mean_ms"] / 1000.0, 1e-12),
            **memory_stats(device),
        },
        "frequency_grid_cache_entries": len(model._frequency_grid_cache),
    }
    if args.profile:
        activities = [torch.profiler.ProfilerActivity.CPU]
        if device.type == "cuda":
            activities.append(torch.profiler.ProfilerActivity.CUDA)
        with torch.profiler.profile(activities=activities, record_shapes=True) as prof:
            forward_backward()
        report["profiler_table"] = profiler_table(prof, args.row_limit)
    # default=str stringifies values json cannot encode natively, such as the
    # Path stored under --output.
    text = json.dumps(report, indent=2, sort_keys=True, default=str)
    print(text)
    if args.output is not None:
        args.output.parent.mkdir(parents=True, exist_ok=True)
        args.output.write_text(text, encoding="utf-8")
    return 0
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())