# Spaces: Running on Zero
| from __future__ import annotations | |
| from dataclasses import dataclass | |
| from difflib import SequenceMatcher | |
| import re | |
@dataclass
class Correction:
    """One alias-to-canonical replacement reported by ``normalize_text``.

    The class must be a dataclass: ``normalize_text`` builds instances with
    keyword arguments, which relies on the generated ``__init__``.
    """

    # Span of the input that matched an alias (lower-cased by the tokenizer).
    original: str
    # Canonical spelling that was substituted for ``original``.
    canonical: str
    # Similarity score between the span and the best-matching alias.
    confidence: float

    def to_dict(self) -> dict[str, object]:
        """Return a plain-dict view with ``confidence`` rounded to 3 decimals."""
        return {
            "original": self.original,
            "canonical": self.canonical,
            "confidence": round(self.confidence, 3),
        }
| ALIASES: dict[str, tuple[str, ...]] = { | |
| "Nemotron": ("nemotron", "nemo tron", "neutron", "nemotran", "nemo-tron"), | |
| "MiniCPM5": ("minicpm5", "mini cpm5", "mini cpm", "open cpm", "opencpm5", "cpm five"), | |
| "EmbeddingGemma": ("embedding gemma", "embeddinggemma", "gemma embedding", "embedded gemma"), | |
| "ZeroGPU": ("zero gpu", "zerogpu", "zero-gpu", "zero g p u"), | |
| "Gradio Server": ("gradio server", "gradio.server", "server mode"), | |
| "Build Small Hackathon": ("build small", "build-small", "small hackathon"), | |
| "Off the Grid": ("off the grid", "off-grid", "offline badge"), | |
| "Well-Tuned": ("well tuned", "well-tuned", "fine tune", "finetune", "lora"), | |
| "Tiny Titan": ("tiny titan", "tiny tight end", "tiny-titan"), | |
| "Llama Champion": ("llama champion", "llama.cpp", "llama cpp", "llama badge"), | |
| } | |
| _TOKEN_RE = re.compile(r"[a-z0-9]+(?:[.-][a-z0-9]+)?", re.IGNORECASE) | |
def normalize_text(text: str) -> tuple[str, list[Correction]]:
    """Rewrite fuzzy mentions of known names to their canonical spelling.

    For each canonical name in ``ALIASES`` the candidate span of *text* that
    scores highest against any of its aliases (``_similarity`` >= 0.88) is
    selected and replaced, at most once, by the canonical name.

    Args:
        text: Free-form input text.

    Returns:
        A ``(normalized, corrections)`` pair: the rewritten text and one
        ``Correction`` per replacement that was actually made, in
        ``ALIASES`` order.  Mentions that are already spelled canonically
        are left alone and produce no correction.
    """
    normalized = text
    corrections: list[Correction] = []
    spans = _candidate_spans(text)
    used: set[str] = set()
    for canonical, aliases in ALIASES.items():
        best: tuple[str, float] | None = None
        for alias in aliases:
            for span in spans:
                confidence = _similarity(alias, span)
                # Strict ">" keeps the first span seen among equal scores.
                if confidence >= 0.88 and (best is None or confidence > best[1]):
                    best = (span, confidence)
        if best is None:
            continue
        original, confidence = best
        key = original.lower()
        if key in used:
            # Span already rewritten for an earlier canonical name.
            continue
        # Require token boundaries so a span is never replaced inside a
        # longer word (e.g. "lora" inside "Flora").
        pattern = re.compile(
            rf"(?<![A-Za-z0-9]){re.escape(original)}(?![A-Za-z0-9])",
            re.IGNORECASE,
        )
        # Spans are lower-cased, so "already canonical" has to be judged on
        # the text actually matched, not on the span itself.
        hit = next(
            (m for m in pattern.finditer(normalized) if m.group(0) != canonical),
            None,
        )
        if hit is None:
            # Nothing to rewrite (span absent verbatim, or every occurrence is
            # already canonical): do not report a correction that never happened.
            continue
        used.add(key)
        # Splice instead of re.sub so the canonical name is inserted literally
        # rather than interpreted as a replacement template.
        normalized = normalized[: hit.start()] + canonical + normalized[hit.end() :]
        corrections.append(
            Correction(original=original, canonical=canonical, confidence=confidence)
        )
    return normalized, corrections
def _candidate_spans(text: str, max_tokens: int = 4) -> list[str]:
    """Return every distinct 1..``max_tokens``-token window of *text*.

    Tokens are extracted with ``_TOKEN_RE`` from the lower-cased text and
    multi-token windows are joined with a single space.

    Args:
        text: Input text to tokenize.
        max_tokens: Largest window size, in tokens.  The default of 4 covers
            the longest alias in ``ALIASES`` ("zero g p u"); a smaller cap
            would let that alias match only a truncated window.

    Returns:
        Unique spans ordered longest first, ties broken alphabetically so the
        order (and any first-wins tie-break by callers) is the same on every
        run regardless of string-hash randomisation.
    """
    tokens = _TOKEN_RE.findall(text.lower())
    spans = set(tokens)
    for size in range(2, max_tokens + 1):
        # range() is empty when the text has fewer than ``size`` tokens.
        for index in range(len(tokens) - size + 1):
            spans.add(" ".join(tokens[index : index + size]))
    return sorted(spans, key=lambda span: (-len(span), span))
| def _similarity(left: str, right: str) -> float: | |
| compact_left = re.sub(r"[^a-z0-9]", "", left.lower()) | |
| compact_right = re.sub(r"[^a-z0-9]", "", right.lower()) | |
| if not compact_left or not compact_right: | |
| return 0.0 | |
| if compact_left == compact_right: | |
| return 1.0 | |
| return SequenceMatcher(None, compact_left, compact_right).ratio() | |