Text Generation
PyTorch
Transformers
English
language-model
graph-neural-network
sparse-attention
adaptive-depth
temporal-decay
mesh-attention
efficient-transformer
novel-architecture
causal-lm
research
preprint
mesh-transformer
dynamic-graph
early-exit
per-token-routing
Eval Results (legacy)
Instructions to use vigneshwar234/TemporalMesh-Transformer with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use vigneshwar234/TemporalMesh-Transformer with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="vigneshwar234/TemporalMesh-Transformer")

# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("vigneshwar234/TemporalMesh-Transformer", dtype="auto")
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use vigneshwar234/TemporalMesh-Transformer with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "vigneshwar234/TemporalMesh-Transformer"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "vigneshwar234/TemporalMesh-Transformer",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
Use Docker
docker model run hf.co/vigneshwar234/TemporalMesh-Transformer
- SGLang
How to use vigneshwar234/TemporalMesh-Transformer with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "vigneshwar234/TemporalMesh-Transformer" \
  --host 0.0.0.0 \
  --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "vigneshwar234/TemporalMesh-Transformer",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "vigneshwar234/TemporalMesh-Transformer" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "vigneshwar234/TemporalMesh-Transformer",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
- Docker Model Runner
How to use vigneshwar234/TemporalMesh-Transformer with Docker Model Runner:
docker model run hf.co/vigneshwar234/TemporalMesh-Transformer
| """ | |
| TemporalMesh Transformer — Inference Script | |
| Full greedy / top-p / top-k text generation with exit gate analysis. | |
| """ | |
| import torch | |
| import torch.nn.functional as F | |
| from tmt.model.config import TMTConfig | |
| from tmt.model.model import TMTModel | |
def load_model(checkpoint_path: str = None, config: TMTConfig = None) -> TMTModel:
    """Build a TMTModel, optionally restore its weights, and put it in eval mode.

    Args:
        checkpoint_path: Path to a checkpoint readable by ``torch.load``.
            When falsy (``None`` or ``""``) no weights are loaded and the
            model keeps its freshly constructed parameters.
        config: Model configuration. When ``None`` a default configuration
            (d_model=512, 8 heads, 12 layers, max_seq_len=256) is used.

    Returns:
        The constructed model, after ``model.eval()`` has been called.
    """
    if config is None:
        config = TMTConfig(
            vocab_size=50258, d_model=512, n_heads=8, n_layers=12,
            graph_k=8, exit_threshold=0.85, memory_anchors=16, max_seq_len=256,
        )
    model = TMTModel(config)
    if checkpoint_path:
        # SECURITY: torch.load unpickles the file, which can execute arbitrary
        # code. Only load checkpoints from a trusted source; if the checkpoint
        # holds nothing but tensors, prefer passing weights_only=True.
        ckpt = torch.load(checkpoint_path, map_location="cpu")
        # Accept both a training checkpoint that wraps the weights under
        # "model_state" and a bare state dict saved directly.
        if isinstance(ckpt, dict) and "model_state" in ckpt:
            state_dict = ckpt["model_state"]
        else:
            state_dict = ckpt
        model.load_state_dict(state_dict)
    model.eval()
    return model
def _filter_logits(logits: torch.Tensor, top_k: int, top_p: float) -> torch.Tensor:
    """Return a copy of 2-D ``logits`` with tokens outside top-k / top-p set to -inf."""
    if top_k > 0:
        # Clamp so a top_k larger than the vocabulary does not make topk raise.
        k = min(top_k, logits.size(-1))
        kth_best = torch.topk(logits, k).values[:, -1:]
        logits = logits.masked_fill(logits < kth_best, float("-inf"))
    if top_p < 1.0:
        sorted_logits, sorted_idx = torch.sort(logits, descending=True)
        sorted_probs = F.softmax(sorted_logits, dim=-1)
        # Probability mass strictly *before* each token in sorted order. A token
        # is dropped once the tokens ahead of it already cover more than top_p,
        # which keeps the smallest prefix whose mass reaches top_p.
        mass_before = torch.cumsum(sorted_probs, dim=-1) - sorted_probs
        drop = mass_before > top_p
        drop[:, 0] = False  # always keep the single most likely token
        sorted_logits = sorted_logits.masked_fill(drop, float("-inf"))
        # sorted_idx is a permutation per row, so this restores vocabulary order.
        logits = logits.scatter(1, sorted_idx, sorted_logits)
    return logits


def generate(
    model: TMTModel,
    input_ids: torch.Tensor,
    max_new_tokens: int = 64,
    temperature: float = 1.0,
    top_k: int = 50,
    top_p: float = 0.95,
    do_sample: bool = True,
) -> dict:
    """
    Generate tokens autoregressively. Returns generated ids + exit analysis.

    The model is called on the full sequence at every step (no KV cache) with
    autograd disabled. Generation stops after ``max_new_tokens`` steps or as
    soon as the sequence length reaches ``model.config.max_seq_len``.

    Args:
        model: Model whose forward output exposes ``logits``, ``exit_masks``
            and ``confidences``, and which has ``config.max_seq_len``.
        input_ids: 2-D tensor of prompt token ids (batch, sequence).
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Divisor applied to the logits before sampling. Ignored
            when ``do_sample`` is False, since argmax does not depend on it.
        top_k: Keep only the ``top_k`` highest logits when sampling
            (``<= 0`` disables; values above the vocabulary size are clamped).
        top_p: Nucleus threshold when sampling (``>= 1.0`` disables).
        do_sample: Sample from the filtered distribution when True, otherwise
            pick the argmax token (greedy).

    Returns:
        dict with keys ``generated_ids`` (prompt + new tokens), ``new_tokens``
        (new tokens only), ``exit_stats`` (one dict per step with per-layer
        ``exit_rates`` and ``avg_confidence``) and ``avg_compute_used`` (mean
        over steps of the mean per-layer exit rate, rounded to 3 decimals;
        0.0 when no step produced exit rates).
        NOTE(review): ``avg_compute_used`` is an average *exit* rate; confirm
        that this is the intended meaning of "compute used".

    Raises:
        ValueError: If ``do_sample`` is True and ``temperature`` is not > 0.
    """
    if do_sample and temperature <= 0:
        raise ValueError(
            f"temperature must be > 0 when sampling, got {temperature}"
        )

    device = next(model.parameters()).device
    input_ids = input_ids.to(device)
    generated = input_ids.clone()
    all_exit_stats = []

    # Inference only: without no_grad every step would retain an autograd graph.
    with torch.no_grad():
        for _ in range(max_new_tokens):
            output = model(generated)
            last_logits = output.logits[:, -1, :]  # (B, V)

            if do_sample:
                filtered = _filter_logits(last_logits / temperature, top_k, top_p)
                probs = F.softmax(filtered, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
            else:
                # Filtering never removes the best token and temperature only
                # rescales, so greedy decoding can use the raw logits.
                next_token = last_logits.argmax(dim=-1, keepdim=True)

            generated = torch.cat([generated, next_token], dim=1)

            # capture exit stats for this step
            all_exit_stats.append({
                "exit_rates": [m.float().mean().item() for m in output.exit_masks],
                "avg_confidence": [c.mean().item() for c in output.confidences],
            })

            # stop at max_seq_len
            if generated.shape[1] >= model.config.max_seq_len:
                break

    # Average the per-step mean exit rate; steps without exit rates are skipped
    # so neither an empty run nor an empty mask list divides by zero.
    step_means = [
        sum(step["exit_rates"]) / len(step["exit_rates"])
        for step in all_exit_stats
        if step["exit_rates"]
    ]
    avg_compute = sum(step_means) / len(step_means) if step_means else 0.0

    return {
        "generated_ids": generated,
        "new_tokens": generated[:, input_ids.shape[1]:],
        "exit_stats": all_exit_stats,
        "avg_compute_used": round(avg_compute, 3),
    }
def analyse_sequence(model: TMTModel, input_ids: torch.Tensor) -> None:
    """
    Run a single forward pass and print detailed exit gate analysis.

    The report shows the logits shape, the second dimension of the first
    ``graph_edges`` entry, the memory-state shape, and one row per
    (exit mask, confidence) pair with the number of set mask entries, that
    number as a fraction of the mask's total element count, and the mean
    confidence. Rates are normalised by each mask's own element count so they
    stay within 0-100% for any batch size.

    Args:
        model: Model whose forward output exposes ``logits``, ``graph_edges``,
            ``memory_state``, ``exit_masks`` and ``confidences``.
        input_ids: Token ids with the sequence length on dimension 1.
    """
    device = next(model.parameters()).device
    with torch.no_grad():
        output = model(input_ids.to(device))

    S = input_ids.shape[1]
    n_edges = output.graph_edges[0].shape[1]

    print(f"\n{'='*55}")
    print(f" TMT Sequence Analysis (seq_len={S})")
    print(f"{'='*55}")
    print(f" Logits shape: {output.logits.shape}")
    print(f" Graph edges: {n_edges} active edges")
    print(f" Memory state: {output.memory_state.shape}\n")
    print(f" {'Layer':<8} {'Tokens frozen':>14} {'Exit rate':>12} {'Avg conf':>10}")
    print(f" {'-'*46}")

    total_frozen = 0
    total_slots = 0
    for i, (mask, conf) in enumerate(zip(output.exit_masks, output.confidences)):
        n_frozen = mask.sum().item()
        n_slots = mask.numel()
        total_frozen += n_frozen
        total_slots += n_slots
        # mask.sum() counts over the whole tensor, so normalise by the whole
        # tensor too (dividing by S alone over-counts when batch size > 1).
        rate = n_frozen / n_slots if n_slots else 0.0
        avg_c = conf.mean().item()
        print(f" {i+1:<8} {n_frozen:>14} {rate:>11.1%} {avg_c:>10.3f}")

    print(f" {'-'*46}")
    # Guard against a model that returns no exit masks at all.
    total_fraction = total_frozen / total_slots if total_slots else 0.0
    print(f" Total compute fraction: {total_fraction:.1%} of max")
    print(f" Active graph edges: {n_edges}")
    print(f"{'='*55}\n")
if __name__ == "__main__":
    # Quick smoke demo: build a small randomly initialised model, analyse one
    # random prompt, then greedily generate a few tokens from it.
    print("Loading TMT-Small for quick demo...")
    demo_config = TMTConfig(
        vocab_size=50258,
        d_model=256,
        n_heads=4,
        n_layers=6,
        graph_k=4,
        exit_threshold=0.80,
        memory_anchors=8,
        max_seq_len=128,
    )
    demo_model = load_model(config=demo_config)

    param_count = sum(param.numel() for param in demo_model.parameters())
    print(f"Parameters: {param_count:,}")

    # One random prompt of 32 token ids drawn from [100, 50000).
    prompt_ids = torch.randint(100, 50000, (1, 32))
    analyse_sequence(demo_model, prompt_ids)

    demo_result = generate(
        demo_model,
        prompt_ids,
        max_new_tokens=16,
        do_sample=False,
    )
    new_token_count = demo_result["new_tokens"].shape[1]
    print(f"Generated {new_token_count} new tokens.")
    avg_compute_used = demo_result["avg_compute_used"]
    print(f"Avg compute used per step: {avg_compute_used:.1%}")