Text Generation
Transformers
ONNX
Safetensors
English
qwen2
dictation
cleanup
transcript
lora
mumble
conversational
text-generation-inference
Instructions to use adikuma/mumble-cleanup with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use adikuma/mumble-cleanup with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="adikuma/mumble-cleanup")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```
```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("adikuma/mumble-cleanup")
model = AutoModelForCausalLM.from_pretrained("adikuma/mumble-cleanup")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use adikuma/mumble-cleanup with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "adikuma/mumble-cleanup"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "adikuma/mumble-cleanup",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker
docker model run hf.co/adikuma/mumble-cleanup
- SGLang
How to use adikuma/mumble-cleanup with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "adikuma/mumble-cleanup" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "adikuma/mumble-cleanup",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "adikuma/mumble-cleanup" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "adikuma/mumble-cleanup",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
- Docker Model Runner
How to use adikuma/mumble-cleanup with Docker Model Runner:
docker model run hf.co/adikuma/mumble-cleanup
| # turn eval.json + training_history.json + latency_benchmark.json into | |
| # charts and a markdown table. mirrors privacy-filter/eval/report.py. | |
| import json | |
| from pathlib import Path | |
| import matplotlib | |
| matplotlib.use("Agg") | |
| import matplotlib.pyplot as plt | |
def plot_learning_curves(history: list[dict], out_path: Path) -> None:
    """Plot training and validation loss against step and save the figure.

    Args:
        history: log entries, each a dict with a "step" key. Entries that
            have "loss" but no "eval_loss" feed the train curve; entries
            that have "eval_loss" feed the val curve.
            NOTE(review): presumably the contents of training_history.json
            -- confirm against the caller.
        out_path: destination image path handed to ``fig.savefig``.
    """
    train_points = [
        (entry["step"], entry["loss"])
        for entry in history
        if "loss" in entry and "eval_loss" not in entry
    ]
    val_points = [
        (entry["step"], entry["eval_loss"])
        for entry in history
        if "eval_loss" in entry
    ]

    fig, ax = plt.subplots(figsize=(8, 4))

    # train first, then val, so the line order (and default colors) is stable;
    # an empty series is simply not drawn
    series = (
        (train_points, "train", 1.5),
        (val_points, "val", 2.0),
    )
    for points, label, width in series:
        if not points:
            continue
        xs, ys = zip(*points)
        ax.plot(xs, ys, label=label, linewidth=width)

    ax.set_xlabel("step")
    ax.set_ylabel("loss")
    ax.set_title("learning curves")
    ax.legend()
    ax.grid(True, alpha=0.3)

    fig.tight_layout()
    fig.savefig(out_path, dpi=120)
    plt.close(fig)
def plot_metrics_comparison(reports: dict, out_path: Path) -> None:
    """Draw a grouped bar chart of quality metrics per model and save it.

    Args:
        reports: mapping of model name ("raw", "base", "fine_tuned") to that
            model's aggregate metrics dict, e.g.
            ``{"raw": agg, "base": agg, "fine_tuned": agg}``. A model that is
            absent (or mapped to None) is skipped instead of raising
            ``KeyError``; a metric that is absent or None is drawn as 0.0.
        out_path: destination image path handed to ``fig.savefig``.
    """
    rows = ["raw", "base", "fine_tuned"]
    metrics = ["disfluency_removal_rate", "punctuation_f1", "faithfulness_mean", "pass_rate"]
    labels = ["disfluency removed", "punct f1", "faithfulness", "pass rate"]

    fig, ax = plt.subplots(figsize=(9, 4.5))
    x = list(range(len(metrics)))
    bar_w = 0.27

    for i, row in enumerate(rows):
        agg = reports.get(row)
        if agg is None:
            # an eval run may lack one of the models; keep the remaining bars
            # in their usual slots rather than crashing the whole chart
            continue
        values = []
        for m in metrics:
            v = agg.get(m)
            values.append(0.0 if v is None else v)
        # slot i within each metric group, so every model keeps a fixed offset
        ax.bar([xi + i * bar_w for xi in x], values, width=bar_w, label=row)

    # tick under the middle (second) of the three bar slots
    ax.set_xticks([xi + bar_w for xi in x])
    ax.set_xticklabels(labels)
    ax.set_ylim(0, 1.05)
    ax.set_title("quality metrics: raw vs base vs fine-tuned")
    ax.legend()
    ax.grid(True, alpha=0.3, axis="y")

    fig.tight_layout()
    fig.savefig(out_path, dpi=120)
    plt.close(fig)
def plot_latency(sweep_fp32: dict, sweep_int8: dict | None, out_path: Path) -> None:
    """Plot p50 / p95 latency against input length and save the figure.

    Args:
        sweep_fp32: mapping of input length to a stats dict holding
            "p50_ms" and "p95_ms". Keys may be ints or numeric strings.
        sweep_int8: same shape as ``sweep_fp32``; None or empty skips the
            int8 curves.
        out_path: destination image path handed to ``fig.savefig``.
    """
    fig, ax = plt.subplots(figsize=(8, 4))

    for name, sweep in (("fp32", sweep_fp32), ("int8", sweep_int8)):
        if not sweep:
            continue
        # Sort numerically but read the stats from the (key, stats) pair
        # itself: re-deriving the key with str(int(k)) breaks for int keys
        # and for non-canonical strings such as "064".
        ordered = sorted(sweep.items(), key=lambda item: int(item[0]))
        lengths = [int(key) for key, _ in ordered]
        p50 = [stats["p50_ms"] for _, stats in ordered]
        p95 = [stats["p95_ms"] for _, stats in ordered]
        ax.plot(lengths, p50, marker="o", label=f"{name} p50")
        ax.plot(lengths, p95, marker="o", linestyle="--", label=f"{name} p95")

    ax.set_xlabel("input length (tokens)")
    ax.set_ylabel("latency (ms)")
    ax.set_title("cpu latency by input length")
    ax.legend()
    ax.grid(True, alpha=0.3)

    fig.tight_layout()
    fig.savefig(out_path, dpi=120)
    plt.close(fig)
| def write_report_markdown( | |
| run_dir: Path, | |
| eval_path: Path, | |
| history_path: Path | None, | |
| latency_path: Path | None, | |
| out_path: Path, | |
| ) -> None: | |
| eval_data = json.loads(eval_path.read_text()) | |
| lines: list[str] = [] | |
| lines.append("# cleanup model report") | |
| lines.append("") | |
| lines.append("## quality metrics") | |
| lines.append("") | |
| lines.append("| model | disfluency removal | punct f1 | faithfulness | length ratio | pass rate |") | |
| lines.append("|---|---:|---:|---:|---:|---:|") | |
| rows = ["raw", "base", "fine_tuned"] | |
| for row in rows: | |
| m = eval_data.get(row) | |
| if not m: | |
| continue | |
| d = m.get("disfluency_removal_rate") | |
| d_str = "n/a" if d is None else f"{d:.3f}" | |
| lines.append( | |
| f"| {row} | {d_str} | {m['punctuation_f1']:.3f} | " | |
| f"{m['faithfulness_mean']:.3f} | {m['length_ratio_mean']:.3f} | " | |
| f"{m['pass_rate']:.3f} |" | |
| ) | |
| lines.append("") | |
| if latency_path and latency_path.exists(): | |
| lat = json.loads(latency_path.read_text()) | |
| lines.append("## cpu latency") | |
| lines.append("") | |
| lines.append("| length | p50 ms | p95 ms | p99 ms | mean ms |") | |
| lines.append("|---:|---:|---:|---:|---:|") | |
| for length, stats in lat.get("results_by_length", {}).items(): | |
| lines.append( | |
| f"| {length} | {stats['p50_ms']:.1f} | {stats['p95_ms']:.1f} | " | |
| f"{stats['p99_ms']:.1f} | {stats['mean_ms']:.1f} |" | |
| ) | |
| if "realistic_mix" in lat and lat["realistic_mix"]: | |
| r = lat["realistic_mix"] | |
| lines.append("") | |
| lines.append( | |
| f"**realistic mix** ({r['samples']} real inputs, " | |
| f"p50 token length {r['token_length_p50']}): " | |
| f"p50={r['p50_ms']:.1f}ms p95={r['p95_ms']:.1f}ms p99={r['p99_ms']:.1f}ms" | |
| ) | |
| Path(out_path).write_text("\n".join(lines)) | |