Text Generation
Transformers
PyTorch
English
taonet_mini_t2
taonet
taotern
ssm
state-space-model
dplr
custom_code
experimental
Instructions to use TaoTern/TaoNet-mini-T2 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use TaoTern/TaoNet-mini-T2 with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="TaoTern/TaoNet-mini-T2", trust_remote_code=True)

# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("TaoTern/TaoNet-mini-T2", trust_remote_code=True, dtype="auto")
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use TaoTern/TaoNet-mini-T2 with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "TaoTern/TaoNet-mini-T2"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{ "model": "TaoTern/TaoNet-mini-T2", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'
Use Docker
docker model run hf.co/TaoTern/TaoNet-mini-T2
- SGLang
How to use TaoTern/TaoNet-mini-T2 with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "TaoTern/TaoNet-mini-T2" \
  --host 0.0.0.0 \
  --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{ "model": "TaoTern/TaoNet-mini-T2", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
  --model-path "TaoTern/TaoNet-mini-T2" \
  --host 0.0.0.0 \
  --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{ "model": "TaoTern/TaoNet-mini-T2", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'
- Docker Model Runner
How to use TaoTern/TaoNet-mini-T2 with Docker Model Runner:
docker model run hf.co/TaoTern/TaoNet-mini-T2
| """Benchmarking suite for evaluating trained models.""" | |
| import time | |
| from pathlib import Path | |
| from typing import Optional, Dict | |
| import torch | |
| from torch.utils.data import DataLoader | |
| from taoTrain.core import BaseModel | |
| from taoTrain.config import TrainingConfig | |
| from taoTrain.data.loaders import get_dataloader | |
| from taoTrain.inference import Inferencer | |
class BenchmarkRunner:
    """Run perplexity, throughput and memory benchmarks on a trained model.

    On construction the model is moved to ``device`` and put in eval mode.
    ``dtype`` only affects :meth:`benchmark_perplexity`: ``torch.bfloat16``
    wraps the forward pass in bfloat16 autocast; any other value runs the
    forward pass without autocast.
    """

    def __init__(
        self,
        model: "BaseModel",
        device: torch.device,
        dtype: torch.dtype = torch.float32,
    ):
        """
        Initialize benchmark runner.

        Args:
            model: Trained model; it is moved to ``device`` and set to eval mode.
            device: Device on which all benchmark forward passes run.
            dtype: Precision selector for the perplexity benchmark
                (only ``torch.bfloat16`` enables autocast).
        """
        self.model = model.to(device)
        self.model.eval()
        self.device = device
        self.dtype = dtype

    @classmethod
    def load_from_checkpoint(
        cls,
        checkpoint_path: "str | Path",
        device: Optional[torch.device] = None,
        dtype: torch.dtype = torch.float32,
    ) -> "BenchmarkRunner":
        """
        Build a runner from a saved checkpoint.

        The checkpoint must contain ``"model_state_dict"``; the model config is
        read from ``checkpoint["config"]["model"]`` and falls back to
        ``ModelConfig()`` defaults when that entry is absent.

        Args:
            checkpoint_path: Path of the checkpoint file to load.
            device: Target device; defaults to CUDA when available, else CPU.
            dtype: Forwarded to the runner constructor.

        Returns:
            A runner wrapping the restored model.

        Raises:
            KeyError: If the checkpoint has no ``"model_state_dict"`` entry.
        """
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # SECURITY NOTE: torch.load unpickles the file. Only load checkpoints
        # from a trusted source (or restrict with ``weights_only=True`` where
        # the checkpoint layout allows it).
        checkpoint = torch.load(checkpoint_path, map_location=device)

        # Imported lazily, as in the original code, so that importing this
        # module does not pull in the model registry.
        from taoTrain.config import ModelConfig
        from taoTrain.models import get_model

        model_kwargs = checkpoint.get("config", {}).get("model", {})
        model = get_model(ModelConfig(**model_kwargs), device=device)
        model.load_state_dict(checkpoint["model_state_dict"])
        return cls(model, device, dtype)

    def _synchronize(self) -> None:
        """Wait for pending CUDA work on this runner's device (no-op off-GPU)."""
        if self.device.type == "cuda":
            torch.cuda.synchronize(self.device)

    def _dummy_batch(self, batch_size: int, seq_length: int) -> torch.Tensor:
        """Create random token ids of shape ``(batch_size, seq_length)`` on the device."""
        return torch.randint(
            0,
            self.model.config.vocab_size,
            (batch_size, seq_length),
            device=self.device,
        )

    def benchmark_perplexity(
        self,
        dataset: "DataLoader",
        num_batches: Optional[int] = None,
    ) -> float:
        """
        Compute perplexity on a dataset.

        Each batch's loss is weighted by its batch size (number of samples),
        so the result is ``exp`` of the per-sample average of the batch losses.
        Batches for which the model returns no ``"loss"`` are ignored.

        Args:
            dataset: Iterable of dict batches with ``"input_ids"`` and
                optional ``"attention_mask"`` / ``"labels"`` tensors.
            num_batches: Evaluate at most this many batches; ``None`` means
                the whole dataset and ``0`` means no batches at all.

        Returns:
            Perplexity (exp of average loss), or ``inf`` when no batch
            produced a loss.
        """
        use_amp = self.dtype == torch.bfloat16
        amp_device = "cuda" if self.device.type == "cuda" else "cpu"
        # An *enabled* autocast region with dtype=float32 is rejected by torch
        # with a warning, so request a disabled region instead of passing
        # float32 as the target dtype.
        amp_dtype = torch.bfloat16 if use_amp else None

        loss_sum = 0.0
        sample_count = 0

        with torch.no_grad():
            for batch_idx, batch in enumerate(dataset):
                # ``is not None`` so that an explicit limit of 0 is honoured.
                if num_batches is not None and batch_idx >= num_batches:
                    break

                input_ids = batch["input_ids"].to(self.device)
                attention_mask = batch.get("attention_mask")
                if attention_mask is not None:
                    attention_mask = attention_mask.to(self.device)
                labels = batch.get("labels")
                if labels is not None:
                    labels = labels.to(self.device)

                with torch.autocast(
                    device_type=amp_device,
                    dtype=amp_dtype,
                    enabled=use_amp,
                ):
                    outputs = self.model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels,
                    )

                loss = outputs.get("loss")
                if loss is not None:
                    n_samples = input_ids.shape[0]
                    loss_sum += loss.item() * n_samples
                    sample_count += n_samples

        avg_loss = loss_sum / sample_count if sample_count > 0 else float("inf")
        # torch.exp saturates to inf on overflow instead of raising.
        return torch.exp(torch.tensor(avg_loss)).item()

    def benchmark_throughput(
        self,
        batch_size: int = 32,
        seq_length: int = 1024,
        num_iters: int = 10,
    ) -> Dict[str, float]:
        """
        Benchmark forward pass throughput on random token ids.

        Two untimed warm-up passes are run first; the timed section is
        bracketed by device synchronization so queued GPU work is included.

        Args:
            batch_size: Batch size of the dummy input.
            seq_length: Sequence length of the dummy input.
            num_iters: Number of timed forward passes (must be positive).

        Returns:
            Dict with ``throughput_tokens_per_sec``,
            ``throughput_samples_per_sec`` and ``avg_time_per_iter_ms``.

        Raises:
            ValueError: If ``num_iters`` is not positive.
        """
        if num_iters <= 0:
            raise ValueError(f"num_iters must be positive, got {num_iters}")

        dummy_input = self._dummy_batch(batch_size, seq_length)

        with torch.no_grad():
            # Warmup
            for _ in range(2):
                self.model(dummy_input)
            self._synchronize()

            # perf_counter is monotonic and high resolution, unlike time.time.
            start = time.perf_counter()
            for _ in range(num_iters):
                self.model(dummy_input)
            self._synchronize()
            # Clamp so a timer reading of zero cannot cause a division by zero.
            elapsed = max(time.perf_counter() - start, 1e-9)

        total_samples = batch_size * num_iters
        return {
            "throughput_tokens_per_sec": (total_samples * seq_length) / elapsed,
            "throughput_samples_per_sec": total_samples / elapsed,
            "avg_time_per_iter_ms": (elapsed / num_iters) * 1000,
        }

    def benchmark_memory(
        self,
        batch_size: int = 16,
        seq_length: int = 1024,
    ) -> Dict[str, float]:
        """
        Benchmark peak GPU memory usage of one forward pass.

        Args:
            batch_size: Batch size of the dummy input.
            seq_length: Sequence length of the dummy input.

        Returns:
            Dict with ``peak_memory_gb`` (GiB allocated at peak on the
            runner's device); ``0.0`` when the runner is not on a CUDA device.
        """
        # Key on the runner's own device: CUDA being available does not mean
        # this model lives on it.
        if self.device.type != "cuda":
            return {"peak_memory_gb": 0.0}

        torch.cuda.reset_peak_memory_stats(self.device)
        torch.cuda.synchronize(self.device)

        dummy_input = self._dummy_batch(batch_size, seq_length)
        with torch.no_grad():
            self.model(dummy_input)
        torch.cuda.synchronize(self.device)

        peak_memory = torch.cuda.max_memory_allocated(self.device) / (1024 ** 3)  # GB
        return {"peak_memory_gb": peak_memory}

    def run_all_benchmarks(
        self,
        dataset: Optional["DataLoader"] = None,
        batch_size: int = 32,
        seq_length: int = 1024,
        num_batches: Optional[int] = 10,
    ) -> Dict[str, float]:
        """
        Run all benchmarks.

        Args:
            dataset: DataLoader for the perplexity benchmark; the perplexity
                benchmark is skipped when this is ``None``.
            batch_size: Batch size for throughput benchmark
            seq_length: Sequence length for throughput benchmark
            num_batches: Batch limit for the perplexity benchmark.

        Returns:
            Dict with all benchmark results
        """
        results: Dict[str, float] = {}

        if dataset is not None:
            print("Running perplexity benchmark...")
            results["perplexity"] = self.benchmark_perplexity(
                dataset, num_batches=num_batches
            )

        print("Running throughput benchmark...")
        results.update(self.benchmark_throughput(batch_size, seq_length))

        print("Running memory benchmark...")
        results.update(self.benchmark_memory())

        return results