Text Generation
Transformers
PyTorch
English
taonet_mini_t2
taonet
taotern
ssm
state-space-model
dplr
custom_code
experimental
Instructions to use TaoTern/TaoNet-mini-T2 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use TaoTern/TaoNet-mini-T2 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="TaoTern/TaoNet-mini-T2", trust_remote_code=True)

# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("TaoTern/TaoNet-mini-T2", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use TaoTern/TaoNet-mini-T2 with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "TaoTern/TaoNet-mini-T2"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "TaoTern/TaoNet-mini-T2",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker
docker model run hf.co/TaoTern/TaoNet-mini-T2
- SGLang
How to use TaoTern/TaoNet-mini-T2 with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "TaoTern/TaoNet-mini-T2" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "TaoTern/TaoNet-mini-T2",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "TaoTern/TaoNet-mini-T2" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "TaoTern/TaoNet-mini-T2",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
- Docker Model Runner
How to use TaoTern/TaoNet-mini-T2 with Docker Model Runner:
docker model run hf.co/TaoTern/TaoNet-mini-T2
| """Probe residual activation scale for a saved TaoTrain checkpoint.""" | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import re | |
| import sys | |
| from pathlib import Path | |
| from typing import Any | |
| import torch | |
| REPO_ROOT = Path(__file__).resolve().parents[2] | |
| SRC_ROOT = REPO_ROOT / "src" | |
| if str(SRC_ROOT) not in sys.path: | |
| sys.path.insert(0, str(SRC_ROOT)) | |
| from taoTrain.checkpointing.checkpoint import CheckpointManager | |
| from taoTrain.config import ModelConfig | |
| from taoTrain.models import get_model | |
def load_sentencepiece(path: Path):
    """Load the SentencePiece model stored at *path* and return its processor.

    The ``sentencepiece`` package is imported inside the function, so it is
    only needed when this helper is actually called.
    """
    from sentencepiece import SentencePieceProcessor

    tokenizer = SentencePieceProcessor()
    tokenizer.load(str(path))
    return tokenizer
def load_tokens(args: argparse.Namespace) -> tuple[torch.Tensor, int]:
    """Tokenize a JSONL corpus into one flat tensor of token ids.

    Reads ``args.data_path`` line by line, encodes the string found under
    ``args.text_field`` of each JSON object with the SentencePiece model at
    ``args.tokenizer_path``, and appends the tokenizer's EOS id after each
    record when that id is non-negative. Reading stops once at least
    ``args.max_tokens`` ids have been collected; the result is then cut to
    exactly ``args.max_tokens`` ids at most.

    Skipped silently: blank lines, lines that are not valid JSON, JSON values
    that are not objects, and records whose text field is missing, empty, or
    not a string.

    Args:
        args: Parsed CLI namespace providing ``tokenizer_path``, ``data_path``,
            ``text_field``, ``max_tokens`` and ``seq_len``.

    Returns:
        ``(tokens, vocab_size)`` where ``tokens`` is a 1-D ``torch.long``
        tensor and ``vocab_size`` is the tokenizer's vocabulary size.

    Raises:
        ValueError: If fewer than ``args.seq_len + 2`` tokens are available
            after truncation to ``args.max_tokens``.
    """
    tokenizer = load_sentencepiece(Path(args.tokenizer_path))
    # The EOS id is a property of the tokenizer, not of the record: look it up once.
    eos_id = tokenizer.eos_id()

    tokens: list[int] = []
    with Path(args.data_path).open("r", encoding="utf-8", errors="replace") as handle:
        for line in handle:
            if len(tokens) >= args.max_tokens:
                break
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            # A valid JSON line may still be a list/string/number, which has no
            # ``.get``; treat it like any other unusable record.
            if not isinstance(record, dict):
                continue
            text = record.get(args.text_field)
            if not isinstance(text, str) or not text:
                continue
            tokens.extend(tokenizer.encode(text, out_type=int))
            if eos_id >= 0:
                tokens.append(eos_id)

    # Truncate BEFORE validating so the check reflects what is actually
    # returned (the last record may overshoot ``max_tokens``).
    tokens = tokens[: args.max_tokens]
    if len(tokens) < args.seq_len + 2:
        raise ValueError(f"Need at least {args.seq_len + 2} tokens, got {len(tokens)}")
    return torch.tensor(tokens, dtype=torch.long), int(tokenizer.vocab_size())
def sample_batch(
    tokens: torch.Tensor,
    *,
    batch_size: int,
    seq_len: int,
    device: torch.device,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Cut ``batch_size`` evenly spaced windows out of a flat token tensor.

    Each window holds ``seq_len + 1`` consecutive tokens. The first
    ``seq_len`` tokens of a window form the input row and the last
    ``seq_len`` tokens (shifted by one) form the label row.

    Args:
        tokens: Flat tensor of token ids.
        batch_size: Number of windows to extract.
        seq_len: Length of each returned input/label row.
        device: Device the stacked batch is moved to.

    Returns:
        ``(input_ids, labels)``, both contiguous with shape
        ``(batch_size, seq_len)``.
    """
    window = seq_len + 1
    last_start = tokens.numel() - window
    # Evenly spaced start offsets; ``.long()`` truncates fractional positions.
    offsets = torch.linspace(0, last_start - 1, steps=batch_size).long().tolist()

    windows = []
    for offset in offsets:
        windows.append(tokens[offset : offset + window])

    stacked = torch.stack(windows, dim=0).to(device=device)
    inputs = stacked[:, :-1].contiguous()
    targets = stacked[:, 1:].contiguous()
    return inputs, targets
def tensor_stats(value: torch.Tensor) -> dict[str, float | int]:
    """Summarize the magnitude of a tensor, ignoring non-finite entries.

    The tensor is detached and converted to float32 before measuring.

    Returns:
        A dict with:
        ``numel``   - total number of elements,
        ``finite``  - number of finite elements,
        ``rms``     - root-mean-square over the finite elements,
        ``max_abs`` - largest absolute value among the finite elements.
        When there is no finite element at all, ``rms`` and ``max_abs`` are
        both ``inf``.
    """
    as_float = value.detach().float()
    mask = torch.isfinite(as_float)
    n_finite = int(mask.sum().item())

    # Defaults describe the "nothing finite to measure" case.
    rms = max_abs = float("inf")
    if n_finite > 0:
        kept = as_float[mask]
        rms = (kept * kept).mean().sqrt().item()
        max_abs = kept.abs().max().item()

    return {
        "numel": as_float.numel(),
        "finite": n_finite,
        "rms": rms,
        "max_abs": max_abs,
    }
def main() -> None:
    """Run one forward pass through a checkpoint and record per-layer output scale.

    Steps:
      1. Parse CLI arguments.
      2. Build one ``(input_ids, labels)`` batch from the JSONL corpus.
      3. Load the checkpoint, rebuild the model from ``checkpoint["config"]["model"]``
         and load ``checkpoint["model_state"]`` non-strictly.
      4. Attach forward hooks to every module whose name matches
         ``[model.]layers.<n>`` or ``[model.]blocks.<n>`` and collect
         ``tensor_stats`` of each hooked module's output.
      5. Write the loss, run settings and per-layer stats as JSON to
         ``--output`` and echo the same JSON to stdout.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoint", required=True)
    parser.add_argument("--tokenizer-path", required=True)
    parser.add_argument("--data-path", required=True)
    parser.add_argument("--text-field", default="text")
    parser.add_argument("--output", required=True)
    parser.add_argument("--batch-size", type=int, default=2)
    parser.add_argument("--seq-len", type=int, default=512)
    parser.add_argument("--max-tokens", type=int, default=200_000)
    parser.add_argument("--device", default="cuda")
    parser.add_argument("--dtype", choices=["float32", "bfloat16", "float16"], default="bfloat16")
    args = parser.parse_args()

    # Fall back to CPU whenever a non-CPU device is requested but CUDA is unavailable.
    device = torch.device(args.device if args.device == "cpu" or torch.cuda.is_available() else "cpu")
    dtype = {
        "float32": torch.float32,
        "bfloat16": torch.bfloat16,
        "float16": torch.float16,
    }[args.dtype]

    tokens, _ = load_tokens(args)
    input_ids, labels = sample_batch(tokens, batch_size=args.batch_size, seq_len=args.seq_len, device=device)
    attention_mask = torch.ones_like(input_ids)

    checkpoint_path = Path(args.checkpoint)
    checkpoint = CheckpointManager(checkpoint_path.parent).load(checkpoint_path, device=device)
    config_dict = checkpoint.get("config", {})
    model_config = ModelConfig(**config_dict.get("model", {}))
    model = get_model(model_config, device=device)

    # strict=False tolerates key mismatches; surface them so stats from a
    # partially initialized model are not mistaken for the checkpoint's.
    load_result = model.load_state_dict(checkpoint["model_state"], strict=False)
    missing_keys = list(getattr(load_result, "missing_keys", ()))
    unexpected_keys = list(getattr(load_result, "unexpected_keys", ()))
    if missing_keys:
        print(f"warning: {len(missing_keys)} missing keys in checkpoint, e.g. {missing_keys[:5]}", file=sys.stderr)
    if unexpected_keys:
        print(f"warning: {len(unexpected_keys)} unexpected keys in checkpoint, e.g. {unexpected_keys[:5]}", file=sys.stderr)
    model.eval()

    layer_stats: dict[str, dict[str, float | int]] = {}
    handles = []
    layer_pattern = re.compile(r"^(?:model\.)?(?:layers|blocks)\.\d+$")

    def make_hook(name: str):
        # Bind ``name`` per module so each hook writes to its own entry.
        def hook(_module, _inputs, output):
            # Modules returning a tuple are summarized by their first element.
            value = output[0] if isinstance(output, tuple) else output
            if torch.is_tensor(value):
                layer_stats[name] = tensor_stats(value)

        return hook

    for name, module in model.named_modules():
        if layer_pattern.match(name):
            handles.append(module.register_forward_hook(make_hook(name)))

    device_type = "cuda" if device.type == "cuda" else "cpu"
    # Autocast is only enabled on CUDA with a half-precision dtype; on CPU the
    # reported "dtype" is therefore the requested one, not an autocast dtype.
    autocast_enabled = device.type == "cuda" and dtype in {torch.float16, torch.bfloat16}
    try:
        with torch.no_grad(), torch.autocast(device_type=device_type, dtype=dtype, enabled=autocast_enabled):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    finally:
        # Detach the hooks even if the forward pass raises.
        for handle in handles:
            handle.remove()

    result: dict[str, Any] = {
        "checkpoint": str(checkpoint_path),
        "loss": float(outputs["loss"].detach().cpu()),
        "batch_size": args.batch_size,
        "seq_len": args.seq_len,
        "device": str(device),
        "dtype": str(dtype),
        "layers": layer_stats,
    }
    serialized = json.dumps(result, indent=2)
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(serialized, encoding="utf-8")
    print(serialized)


if __name__ == "__main__":
    main()