Text Generation
Transformers
ONNX
Safetensors
English
qwen2
dictation
cleanup
transcript
lora
mumble
conversational
text-generation-inference
Instructions to use adikuma/mumble-cleanup with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use adikuma/mumble-cleanup with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="adikuma/mumble-cleanup")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("adikuma/mumble-cleanup")
model = AutoModelForCausalLM.from_pretrained("adikuma/mumble-cleanup")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use adikuma/mumble-cleanup with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "adikuma/mumble-cleanup"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "adikuma/mumble-cleanup",
    "messages": [
      { "role": "user", "content": "What is the capital of France?" }
    ]
  }'
Use Docker
docker model run hf.co/adikuma/mumble-cleanup
- SGLang
How to use adikuma/mumble-cleanup with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "adikuma/mumble-cleanup" \
  --host 0.0.0.0 \
  --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "adikuma/mumble-cleanup",
    "messages": [
      { "role": "user", "content": "What is the capital of France?" }
    ]
  }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "adikuma/mumble-cleanup" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "adikuma/mumble-cleanup",
    "messages": [
      { "role": "user", "content": "What is the capital of France?" }
    ]
  }'
- Docker Model Runner
How to use adikuma/mumble-cleanup with Docker Model Runner:
docker model run hf.co/adikuma/mumble-cleanup
| # cpu latency for the exported onnx, mirroring privacy-filter/eval/latency.py. | |
| # RUN THIS LOCALLY on the target laptop. gpu timings are not informative | |
| # because deployment is cpu-only via ort. | |
| # | |
| # two benchmarks: | |
| # - fixed length sweep: synthesize prompts of N tokens, time N=16..512 | |
| # - realistic mix: use real test rows, time variable-length inputs | |
| import statistics | |
| import time | |
| from pathlib import Path | |
| from typing import Optional | |
| import numpy as np | |
| import onnxruntime as ort | |
| from transformers import AutoTokenizer | |
def _session(onnx_path: Path, intra_op_threads: int) -> ort.InferenceSession:
    """Create an onnxruntime inference session pinned to the CPU provider.

    Args:
        onnx_path: location of the exported onnx model file.
        intra_op_threads: number of threads onnxruntime may use inside a
            single operator.

    Returns:
        A session with all graph optimizations enabled and a single
        inter-op thread.
    """
    session_options = ort.SessionOptions()
    session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    # one inter-op thread; parallelism comes only from the intra-op pool.
    session_options.inter_op_num_threads = 1
    session_options.intra_op_num_threads = intra_op_threads
    model_file = str(onnx_path)
    return ort.InferenceSession(
        model_file,
        session_options,
        providers=["CPUExecutionProvider"],
    )
| def _percentiles(values: list[float]) -> dict: | |
| if not values: | |
| return {"p50_ms": 0.0, "p95_ms": 0.0, "p99_ms": 0.0, "mean_ms": 0.0} | |
| arr = np.asarray(values) | |
| return { | |
| "p50_ms": float(np.percentile(arr, 50)), | |
| "p95_ms": float(np.percentile(arr, 95)), | |
| "p99_ms": float(np.percentile(arr, 99)), | |
| "mean_ms": float(np.mean(arr)), | |
| } | |
def _one_call_ms(session: ort.InferenceSession, input_ids: np.ndarray) -> float:
    """Run one inference call and return how long it took in milliseconds.

    Args:
        session: the onnxruntime session to time.
        input_ids: token ids fed as the "input_ids" input; an all-ones array
            of the same shape and dtype is fed as "attention_mask".

    Returns:
        Elapsed wall-clock time of the `session.run` call, in milliseconds.
    """
    # the mask is built before the clock starts so it is not part of the timing.
    mask = np.ones_like(input_ids)
    started = time.perf_counter()
    session.run(None, {"input_ids": input_ids, "attention_mask": mask})
    elapsed_s = time.perf_counter() - started
    return elapsed_s * 1000.0
def benchmark_latency(
    onnx_path: Path,
    tokenizer,
    warmup: int = 50,
    measure: int = 500,
    intra_op_threads: int = 4,
    lengths: tuple = (16, 32, 64, 128, 256, 512),
) -> dict:
    """Fixed-length latency sweep over synthetic prompts.

    For each entry in `lengths`, builds one (1, length) int64 sequence,
    runs `warmup` untimed-for-reporting calls, then `measure` timed calls.

    Args:
        onnx_path: path to the exported onnx model.
        tokenizer: tokenizer providing `pad_token_id` and `encode`.
        warmup: calls per length whose timings are discarded.
        measure: calls per length whose timings are reported.
        intra_op_threads: intra-op thread count for the session.
        lengths: sequence lengths (in tokens) to benchmark.

    Returns:
        A dict mapping `str(length)` to the stats dict from `_percentiles`.
    """
    session = _session(onnx_path, intra_op_threads)
    pad = tokenizer.pad_token_id or 0
    # the filler token does not depend on the length, so tokenize once.
    # add_special_tokens=False keeps index 0 on the word itself rather than
    # on a bos/cls token for tokenizers that prepend one.
    filler = tokenizer.encode("the", add_special_tokens=False)[0]
    results = {}
    for length in lengths:
        # synthesize a sequence of `length` tokens (alternating a fixed
        # ascii token id and pad so the model has structure to attend to).
        ids = np.full((1, length), pad, dtype=np.int64)
        ids[0, ::2] = filler
        for _ in range(warmup):
            _one_call_ms(session, ids)
        samples = [_one_call_ms(session, ids) for _ in range(measure)]
        results[str(length)] = _percentiles(samples)
    return results
def benchmark_realistic(
    onnx_path: Path,
    tokenizer,
    texts: list[str],
    intra_op_threads: int = 4,
    warmup: int = 20,
) -> dict:
    """Latency over real, variable-length inputs.

    Each text is tokenized (truncated to 512 tokens) into a (1, n) int64
    array. The first `warmup` sequences are run once to warm the session,
    then every sequence (including those) is timed once.

    Args:
        onnx_path: path to the exported onnx model.
        tokenizer: tokenizer whose `encode` accepts `max_length` and
            `truncation`.
        texts: input strings to benchmark.
        intra_op_threads: intra-op thread count for the session.
        warmup: how many leading sequences to run before timing.

    Returns:
        The stats dict from `_percentiles`, plus "samples" (number of timed
        calls) and "token_length_p50" / "token_length_p95" (ints). The two
        length stats are 0 when `texts` is empty.
    """
    session = _session(onnx_path, intra_op_threads)
    sequences = [
        np.asarray(
            tokenizer.encode(t, max_length=512, truncation=True), dtype=np.int64
        ).reshape(1, -1)
        for t in texts
    ]
    # warmup uses the first n inputs; clamp so a negative value means "none"
    # instead of slicing from the end.
    for seq in sequences[: max(warmup, 0)]:
        _one_call_ms(session, seq)
    samples = [_one_call_ms(session, s) for s in sequences]
    lengths = [s.shape[1] for s in sequences]
    out = _percentiles(samples)
    out["samples"] = len(samples)
    if lengths:
        out["token_length_p50"] = int(statistics.median(lengths))
        out["token_length_p95"] = int(np.percentile(lengths, 95))
    else:
        # statistics.median raises on empty data; report zeros to stay
        # consistent with the all-zero latency stats for empty input.
        out["token_length_p50"] = 0
        out["token_length_p95"] = 0
    return out