Text Generation
Transformers
ONNX
Safetensors
English
qwen2
dictation
cleanup
transcript
lora
mumble
conversational
text-generation-inference
Instructions to use adikuma/mumble-cleanup with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use adikuma/mumble-cleanup with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="adikuma/mumble-cleanup")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("adikuma/mumble-cleanup")
model = AutoModelForCausalLM.from_pretrained("adikuma/mumble-cleanup")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use adikuma/mumble-cleanup with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "adikuma/mumble-cleanup"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "adikuma/mumble-cleanup",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker
docker model run hf.co/adikuma/mumble-cleanup
- SGLang
How to use adikuma/mumble-cleanup with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "adikuma/mumble-cleanup" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "adikuma/mumble-cleanup",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "adikuma/mumble-cleanup" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "adikuma/mumble-cleanup",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
- Docker Model Runner
How to use adikuma/mumble-cleanup with Docker Model Runner:
docker model run hf.co/adikuma/mumble-cleanup
| # explore the synthetic seed dataset and produce docs/data_report.md plus a | |
| # set of supporting charts under docs/data_images/. mirrors the privacy-filter | |
| # explore_data.py pattern. cpu only, no model needed. | |
| # | |
| # usage: uv run python scripts/explore_data.py | |
| import json | |
| import re | |
| from collections import Counter | |
| from pathlib import Path | |
| import matplotlib | |
| matplotlib.use("Agg") | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
# jsonl file read by load_rows(); one json object per non-blank line.
SEED_PATH = Path("data/seed/synthetic_pairs.jsonl")
# markdown report written by write_report().
OUT_DOC = Path("docs/data_report.md")
# directory the plot_* functions save their png charts into.
OUT_IMAGES = Path("docs/data_images")
# filler words and phrases counted as disfluencies by filler_count() and
# plot_top_fillers(); matched case-insensitively as whole words.
FILLERS = {"um", "uh", "er", "ah", "like", "you know", "i mean", "so", "well"}
def load_rows() -> list[dict]:
    # parse SEED_PATH as jsonl: one json object per line, blank lines skipped.
    # returns the decoded objects in file order.
    with SEED_PATH.open("r", encoding="utf-8") as handle:
        entries = (raw_line.strip() for raw_line in handle)
        return [json.loads(entry) for entry in entries if entry]
def word_count(s: str) -> int:
    # number of whitespace-separated tokens in s.
    tokens = s.split()
    return len(tokens)
def filler_count(text: str) -> int:
    # total occurrences of every entry in FILLERS inside text, matched
    # case-insensitively and only where the filler is not glued to a
    # neighbouring word character.
    padded = " " + text.lower() + " "
    total = 0
    for filler in FILLERS:
        hits = re.findall(rf"(?<!\w){re.escape(filler)}(?!\w)", padded)
        total += len(hits)
    return total
def content_words(text: str) -> set:
    # lowercase the text, blank out everything that is not a word character,
    # whitespace or an apostrophe, and return the distinct remaining tokens.
    lowered = text.lower()
    without_punct = re.sub(r"[^\w\s']", " ", lowered)
    tokens = without_punct.split()
    return set(tokens)
def faithfulness(raw: str, clean: str) -> float:
    # share of clean's content words that are also present in raw. this is
    # the by-construction faithfulness measure for a pair: a low score means
    # clean contains vocabulary raw never had, which would teach the model to
    # hallucinate. a clean side with no content words scores 1.0.
    clean_vocab = content_words(clean)
    if len(clean_vocab) == 0:
        return 1.0
    raw_vocab = content_words(raw)
    shared = clean_vocab.intersection(raw_vocab)
    return len(shared) / len(clean_vocab)
def plot_category_counts(rows: list[dict]):
    # bar chart of how many pairs each category has, tallest bar first, with
    # the count printed above every bar. saved to
    # OUT_IMAGES / "category_counts.png".
    counts = Counter(r["category"] for r in rows)
    order = sorted(counts, key=counts.get, reverse=True)
    values = [counts[c] for c in order]
    fig, ax = plt.subplots(figsize=(8, 4.2))
    bars = ax.bar(order, values, color="#5b8cb8")
    ax.set_title("pairs per category")
    ax.set_ylabel("count")
    # pin the tick positions before relabelling: set_xticklabels on its own
    # is discouraged by matplotlib and warns when the locator is not fixed.
    # a categorical bar chart places its bars at 0..n-1.
    ax.set_xticks(range(len(order)))
    ax.set_xticklabels(order, rotation=22, ha="right")
    for b, v in zip(bars, values):
        # centre the label over the bar, one unit above its top.
        ax.text(b.get_x() + b.get_width() / 2, v + 1, str(v), ha="center", fontsize=9)
    ax.grid(True, axis="y", alpha=0.3)
    fig.tight_layout()
    fig.savefig(OUT_IMAGES / "category_counts.png", dpi=130)
    plt.close(fig)
def plot_length_distribution(rows: list[dict]):
    # violin plot of raw-side word counts, one violin per category in
    # alphabetical order. saved to OUT_IMAGES / "length_distribution.png".
    lengths_by_cat: dict = {name: [] for name in sorted({row["category"] for row in rows})}
    for row in rows:
        lengths_by_cat[row["category"]].append(word_count(row["raw"]))
    cats = list(lengths_by_cat)
    data_raw = list(lengths_by_cat.values())

    fig, ax = plt.subplots(figsize=(9, 4.8))
    violins = ax.violinplot(data_raw, showmeans=False, showmedians=True)
    for body in violins["bodies"]:
        body.set_facecolor("#5b8cb8")
        body.set_alpha(0.65)

    # violinplot positions its bodies at 1..n, so the ticks go there too.
    tick_positions = range(1, len(cats) + 1)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(cats, rotation=22, ha="right")
    ax.set_ylabel("raw side word count")
    ax.set_title("input length distribution by category")
    ax.grid(True, axis="y", alpha=0.3)

    fig.tight_layout()
    out_file = OUT_IMAGES / "length_distribution.png"
    fig.savefig(out_file, dpi=130)
    plt.close(fig)
def plot_raw_vs_clean_length(rows: list[dict]):
    # scatter of raw word count (x) against clean word count (y) for every
    # pair, with a dashed y = x reference line. saved to
    # OUT_IMAGES / "raw_vs_clean_length.png".
    raw_lens = []
    clean_lens = []
    for row in rows:
        raw_lens.append(word_count(row["raw"]))
        clean_lens.append(word_count(row["clean"]))

    fig, ax = plt.subplots(figsize=(6.5, 6))
    ax.scatter(raw_lens, clean_lens, alpha=0.32, s=14, color="#5b8cb8")

    # the diagonal runs from the origin to the longest text on either side.
    longest = max(raw_lens + clean_lens)
    diagonal = [0, longest]
    ax.plot(diagonal, diagonal, "k--", alpha=0.4, linewidth=1)

    ax.set_xlabel("raw word count")
    ax.set_ylabel("clean word count")
    ax.set_title("raw vs clean length (clean below diagonal is expected)")
    ax.grid(True, alpha=0.3)

    fig.tight_layout()
    out_file = OUT_IMAGES / "raw_vs_clean_length.png"
    fig.savefig(out_file, dpi=130)
    plt.close(fig)
def plot_filler_intensity(rows: list[dict]):
    # bar chart of the mean filler_count() of the raw side per category
    # (categories in alphabetical order), with the mean printed above every
    # bar. saved to OUT_IMAGES / "filler_intensity.png".
    cats = sorted({r["category"] for r in rows})
    means = [
        np.mean([filler_count(r["raw"]) for r in rows if r["category"] == c]) for c in cats
    ]
    fig, ax = plt.subplots(figsize=(8, 4.2))
    bars = ax.bar(cats, means, color="#c08a55")
    ax.set_title("average filler count per raw input by category")
    ax.set_ylabel("avg fillers per pair")
    # pin the tick positions before relabelling: set_xticklabels on its own
    # is discouraged by matplotlib and warns when the locator is not fixed.
    # a categorical bar chart places its bars at 0..n-1.
    ax.set_xticks(range(len(cats)))
    ax.set_xticklabels(cats, rotation=22, ha="right")
    for b, v in zip(bars, means):
        # centre the label over the bar, slightly above its top.
        ax.text(b.get_x() + b.get_width() / 2, v + 0.05, f"{v:.1f}", ha="center", fontsize=9)
    ax.grid(True, axis="y", alpha=0.3)
    fig.tight_layout()
    fig.savefig(OUT_IMAGES / "filler_intensity.png", dpi=130)
    plt.close(fig)
def plot_top_fillers(rows: list[dict]):
    # horizontal bar chart of how often each FILLERS entry occurs across all
    # raw inputs, most frequent at the top. saved to
    # OUT_IMAGES / "top_fillers.png".
    matchers = {
        filler: re.compile(rf"(?<!\w){re.escape(filler)}(?!\w)") for filler in FILLERS
    }
    totals: Counter = Counter()
    for row in rows:
        padded = " " + row["raw"].lower() + " "
        for filler, matcher in matchers.items():
            totals[filler] += len(matcher.findall(padded))

    ranked = totals.most_common()
    labels, values = zip(*ranked)

    fig, ax = plt.subplots(figsize=(7, 4.2))
    # barh draws bottom-up, so reverse to put the biggest count on top.
    ax.barh(tuple(reversed(labels)), tuple(reversed(values)), color="#5b8cb8")
    ax.set_title("top fillers across all raw inputs")
    ax.set_xlabel("total occurrences in raw")
    ax.grid(True, axis="x", alpha=0.3)

    fig.tight_layout()
    out_file = OUT_IMAGES / "top_fillers.png"
    fig.savefig(out_file, dpi=130)
    plt.close(fig)
def plot_faithfulness(rows: list[dict]):
    # histogram of the per-pair faithfulness() score with a dashed red marker
    # at 0.95. saved to OUT_IMAGES / "faithfulness.png".
    scores = []
    for row in rows:
        scores.append(faithfulness(row["raw"], row["clean"]))

    fig, ax = plt.subplots(figsize=(7, 4.2))
    ax.hist(scores, bins=24, color="#5b8cb8", edgecolor="white")
    ax.axvline(0.95, color="red", linestyle="--", linewidth=1, label="0.95 threshold")

    ax.set_title("faithfulness: fraction of clean content words present in raw")
    ax.set_xlabel("faithfulness score (1.0 = perfect)")
    ax.set_ylabel("number of pairs")
    ax.legend()
    ax.grid(True, axis="y", alpha=0.3)

    fig.tight_layout()
    out_file = OUT_IMAGES / "faithfulness.png"
    fig.savefig(out_file, dpi=130)
    plt.close(fig)
def write_report(rows: list[dict], stats: dict):
    # render the markdown data report and write it to OUT_DOC, creating the
    # parent directory if needed.
    #
    # rows:  the seed pairs; each is read for its "category", "raw" and
    #        "clean" keys.
    # stats: summary numbers; the keys read here are raw_min, raw_max,
    #        raw_median, clean_median, faith_mean, faith_median, faith_pass
    #        and faith_90.
    #
    # each section embeds the chart the matching plot_* function saved, so
    # the generated pngs actually appear in the report.

    def chart(alt: str, filename: str) -> str:
        # markdown image link relative to the report. assumes the OUT_IMAGES
        # directory sits next to OUT_DOC, as the module constants have it.
        return f"![{alt}]({OUT_IMAGES.name}/{filename})"

    lines: list[str] = []
    lines.append("# data report")
    lines.append("")
    lines.append("The synthetic seed dataset that backs the Mumble cleanup model. Built by a multi-agent workflow that spawned 8 specialist agents in parallel and produced 612 pairs across 8 dictation categories; a polish pass added 76 more `long_form_thoughts` pairs with strictly diverse openers, bringing the total to **688 pairs**.")
    lines.append("")
    lines.append("Every pair is `{ raw: <Parakeet-shaped lowercase no-punct disfluent input>, clean: <proper English output> }`. The clean side is faithful by construction: every content word in `clean` exists in `raw` (modulo standard homophone fixes, contractions, and casing). This is what stops the model from learning to hallucinate.")
    lines.append("")

    lines.append("## category mix")
    lines.append("")
    lines.append(chart("pairs per category", "category_counts.png"))
    lines.append("")
    cats = sorted({r["category"] for r in rows})
    cat_counts = Counter(r["category"] for r in rows)
    lines.append("| category | count |")
    lines.append("|---|---:|")
    for c in cats:
        lines.append(f"| `{c}` | {cat_counts[c]} |")
    lines.append(f"| **total** | **{len(rows)}** |")
    lines.append("")
    lines.append("`long_form_thoughts` is intentionally over-weighted because paragraph-length cleanup is the hardest behavior (multiple sentence boundaries, sustained context, false starts) and 145 examples gives the model the signal it needs to handle 60-90 word inputs.")
    lines.append("")

    lines.append("## length distribution")
    lines.append("")
    lines.append(chart("input length distribution by category", "length_distribution.png"))
    lines.append("")
    lines.append(f"Raw inputs span **{stats['raw_min']} to {stats['raw_max']} words** with a median of **{stats['raw_median']:.0f}**. Clean outputs are slightly shorter on average ({stats['clean_median']:.0f} median words) because they have fillers and stutters removed. The categories show meaningfully different length distributions: short utterances dominate `casual_messages`, `questions_and_asks`, and `mixed_content`; long paragraph-shaped inputs dominate `long_form_thoughts`.")
    lines.append("")

    lines.append("## raw vs clean length")
    lines.append("")
    lines.append(chart("raw vs clean length", "raw_vs_clean_length.png"))
    lines.append("")
    lines.append("Points below the diagonal mean clean is shorter than raw — the model is being trained to remove material, not add it. The cluster sits just below the diagonal, which is the expected shape for a faithful cleanup task: a few words removed per input on average, never more than ~25%.")
    lines.append("")

    lines.append("## disfluency intensity")
    lines.append("")
    lines.append(chart("average filler count per raw input by category", "filler_intensity.png"))
    lines.append("")
    lines.append("Average filler-word count per raw input, by category. `meeting_notes` and `long_form_thoughts` carry the heaviest disfluency load (people think out loud during meetings); `mixed_content` and `questions_and_asks` are leanest (those categories are about precision, not verbosity).")
    lines.append("")
    lines.append(chart("top fillers across all raw inputs", "top_fillers.png"))
    lines.append("")
    lines.append("Distribution of filler words across the entire dataset. `um` and `uh` dominate (matching real Parakeet output), with `like`, `you know`, and `so` following at a moderate rate. The mix matches what shows up in real dictation transcripts.")
    lines.append("")

    lines.append("## faithfulness check")
    lines.append("")
    lines.append(chart("faithfulness score distribution", "faithfulness.png"))
    lines.append("")
    lines.append("For each pair, we compute the fraction of content words in the clean side that also appear in the raw side. A perfect value is 1.0 (every clean content word came from raw); lower values indicate the clean introduced content the raw did not have, which would train the model to hallucinate.")
    lines.append("")
    lines.append(f"- **Mean faithfulness**: {stats['faith_mean']:.3f}")
    lines.append(f"- **Median faithfulness**: {stats['faith_median']:.3f}")
    lines.append(f"- **Pairs above 0.95 threshold**: {stats['faith_pass']} of {len(rows)} ({100 * stats['faith_pass'] / len(rows):.1f}%)")
    lines.append(f"- **Pairs above 0.90 threshold**: {stats['faith_90']} of {len(rows)} ({100 * stats['faith_90'] / len(rows):.1f}%)")
    lines.append("")
    lines.append("Small drops below 1.0 come from legitimate sources: number-word to digit conversion (\"two thirty\" -> \"2:30\"), proper-noun capitalization that adds new tokens to the content-word set under our simple lowercase comparison (\"acme\" -> \"Acme\" should be counted as matching but our naive check might miss some), and contractions (\"i\" -> \"I'm\" via apostrophe restoration).")
    lines.append("")

    lines.append("## sample pairs")
    lines.append("")
    lines.append("Two per category, illustrating the shape of the dataset:")
    lines.append("")
    for cat in cats:
        cat_rows = [r for r in rows if r["category"] == cat]
        # show the first and the middle pair; a single-pair category shows
        # just that one.
        samples = [cat_rows[0], cat_rows[len(cat_rows) // 2]] if len(cat_rows) >= 2 else cat_rows
        lines.append(f"### `{cat}`")
        lines.append("")
        for s in samples:
            lines.append(f"- **raw**: `{s['raw']}`")
            lines.append(f"- **clean**: {s['clean']}")
            lines.append("")

    lines.append("## limitations")
    lines.append("")
    lines.append("- **Synthetic origin**: every pair was generated by an LLM workflow, not transcribed from real Parakeet output. The disfluency patterns are modeled to match real ASR failure modes but may under-represent edge cases the model will face in production.")
    lines.append("- **Size**: 688 pairs is on the lower-middle end of the documented sweet spot for narrow LoRA fine-tunes (200-500 floor, 2k-5k comfortable). Adequate for a v1 ship; if eval pass rate is below 0.85 we regenerate another 600-1000 pairs and retrain.")
    lines.append("- **Faithfulness is statistical, not strict**: a few pairs may drop below 0.95 because of legitimate transformations (numeric formatting, proper-noun casing). We don't filter these out because the training task explicitly wants the model to learn those transformations.")
    lines.append("- **English only.**")
    lines.append("")

    OUT_DOC.parent.mkdir(parents=True, exist_ok=True)
    OUT_DOC.write_text("\n".join(lines), encoding="utf-8")
def main() -> None:
    # script entry point: load the seed pairs, render every chart into
    # OUT_IMAGES, compute the summary stats and write the markdown report.
    OUT_IMAGES.mkdir(parents=True, exist_ok=True)
    rows = load_rows()
    print(f"loaded {len(rows)} pairs from {SEED_PATH}")

    chart_makers = (
        plot_category_counts,
        plot_length_distribution,
        plot_raw_vs_clean_length,
        plot_filler_intensity,
        plot_top_fillers,
        plot_faithfulness,
    )
    for make_chart in chart_makers:
        make_chart(rows)

    raw_lens = []
    clean_lens = []
    faith = []
    for row in rows:
        raw_lens.append(word_count(row["raw"]))
        clean_lens.append(word_count(row["clean"]))
        faith.append(faithfulness(row["raw"], row["clean"]))

    stats = {}
    stats["raw_min"] = min(raw_lens)
    stats["raw_max"] = max(raw_lens)
    stats["raw_median"] = float(np.median(raw_lens))
    stats["clean_min"] = min(clean_lens)
    stats["clean_max"] = max(clean_lens)
    stats["clean_median"] = float(np.median(clean_lens))
    stats["faith_mean"] = float(np.mean(faith))
    stats["faith_median"] = float(np.median(faith))
    # number of pairs at or above each faithfulness cut-off.
    stats["faith_pass"] = len([score for score in faith if score >= 0.95])
    stats["faith_90"] = len([score for score in faith if score >= 0.90])
    print(f"stats: {stats}")

    write_report(rows, stats)
    print(f"wrote {OUT_DOC} and {OUT_IMAGES}/")


if __name__ == "__main__":
    main()