Spaces:
Running on Zero
Running on Zero
File size: 2,943 Bytes
f44aac9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 | from __future__ import annotations
from dataclasses import dataclass
from difflib import SequenceMatcher
import re
@dataclass(frozen=True)
class Correction:
    """Immutable record of a single alias-to-canonical rewrite."""

    # Text span that was matched.
    original: str
    # Canonical name that replaces the span.
    canonical: str
    # Match score for the span; rounded only when serialised via to_dict().
    confidence: float

    def to_dict(self) -> dict:
        """Return the record as a plain dict, confidence rounded to 3 decimals."""
        rounded = round(self.confidence, 3)
        return dict(
            original=self.original,
            canonical=self.canonical,
            confidence=rounded,
        )
# Canonical display name -> alternative spellings of that name.
# normalize_text() scores candidate spans of the input against every alias
# with _similarity() and rewrites a sufficiently close span to the dict key.
# _similarity() lowercases both sides and drops everything except a-z/0-9,
# so "zero gpu", "zerogpu" and "zero-gpu" all compare as the same string.
# NOTE(review): entries such as "neutron" or "tiny tight end" look like
# speech-to-text mishearings — presumably the input is transcribed audio;
# confirm with the caller.
ALIASES: dict[str, tuple[str, ...]] = {
    "Nemotron": ("nemotron", "nemo tron", "neutron", "nemotran", "nemo-tron"),
    "MiniCPM5": ("minicpm5", "mini cpm5", "mini cpm", "open cpm", "opencpm5", "cpm five"),
    "EmbeddingGemma": ("embedding gemma", "embeddinggemma", "gemma embedding", "embedded gemma"),
    "ZeroGPU": ("zero gpu", "zerogpu", "zero-gpu", "zero g p u"),
    "Gradio Server": ("gradio server", "gradio.server", "server mode"),
    "Build Small Hackathon": ("build small", "build-small", "small hackathon"),
    "Off the Grid": ("off the grid", "off-grid", "offline badge"),
    "Well-Tuned": ("well tuned", "well-tuned", "fine tune", "finetune", "lora"),
    "Tiny Titan": ("tiny titan", "tiny tight end", "tiny-titan"),
    "Llama Champion": ("llama champion", "llama.cpp", "llama cpp", "llama badge"),
}
# One token: a run of letters a-z / digits, optionally followed by exactly one
# "." or "-" and a second run. So "llama.cpp" and "zero-gpu" stay single
# tokens, while "a.b.c" yields "a.b" and then "c". Matching ignores case.
_TOKEN_RE = re.compile(r"[a-z0-9]+(?:[.-][a-z0-9]+)?", re.IGNORECASE)
def normalize_text(text: str, *, min_confidence: float = 0.88) -> tuple[str, list[Correction]]:
    """Rewrite fuzzy alias mentions in *text* to their canonical names.

    For every ALIASES entry, the candidate span that is most similar to the
    canonical name or to any of its aliases is selected. If it scores at
    least *min_confidence*, its first occurrence that is not already spelled
    canonically is replaced by the canonical name.

    Args:
        text: Raw input text.
        min_confidence: Minimum ``_similarity`` score for a span to count as
            a match.

    Returns:
        ``(normalized_text, corrections)``. There is one Correction per
        replacement actually performed, in ALIASES order.
        ``Correction.original`` is the lowercased, single-spaced candidate
        span, not the raw slice of *text*.
    """
    normalized = text
    corrections: list[Correction] = []
    spans = _candidate_spans(text)
    used: set[str] = set()
    for canonical, aliases in ALIASES.items():
        best: tuple[str, float] | None = None
        # The canonical name is tried first so that a full mention (e.g.
        # "build small hackathon") wins over a partial alias ("build small");
        # otherwise the partial hit is expanded and the tail is duplicated.
        for alias in (canonical, *aliases):
            for span in spans:
                confidence = _similarity(alias, span)
                # Strict ">" keeps the first span seen on ties.
                if confidence >= min_confidence and (best is None or confidence > best[1]):
                    best = (span, confidence)
        if best is None:
            continue
        original, confidence = best
        key = original.lower()
        if key in used:
            # A span is rewritten for at most one canonical name.
            continue
        # Spans are lowercased, so "already canonical" can only be decided by
        # looking at the text itself; skip occurrences that need no change.
        hit = next(
            (m for m in _span_pattern(original).finditer(normalized) if m.group(0) != canonical),
            None,
        )
        if hit is None:
            # Nothing to rewrite: do not report a correction that never happened.
            continue
        used.add(key)
        # Slice-splice instead of re.sub so the canonical name is inserted
        # verbatim and never interpreted as a replacement template.
        normalized = normalized[: hit.start()] + canonical + normalized[hit.end() :]
        corrections.append(Correction(original=original, canonical=canonical, confidence=confidence))
    return normalized, corrections


def _span_pattern(span: str) -> re.Pattern[str]:
    """Compile a case-insensitive pattern locating *span* on token boundaries.

    Candidate spans join their tokens with one space; in the text the same
    tokens may be separated by any whitespace run. The look-arounds stop the
    span from matching inside a longer word (e.g. "lora" inside "Flora").
    """
    body = r"\s+".join(re.escape(token) for token in span.split(" "))
    return re.compile(rf"(?<![a-z0-9]){body}(?![a-z0-9])", re.IGNORECASE)
def _candidate_spans(text: str, max_tokens: int = 4) -> list[str]:
    """Return every distinct 1..max_tokens-token span of *text*, longest first.

    The text is lowercased and tokenised with ``_TOKEN_RE``; multi-token
    spans join adjacent tokens with a single space.

    Args:
        text: Raw input text.
        max_tokens: Largest n-gram size to generate. The default of 4 covers
            the longest alias in ALIASES ("zero g p u"); with 3-token spans
            only its prefix "zero g p" could match, leaving a stray "u".

    Returns:
        Unique spans sorted by descending length, ties broken alphabetically.
    """
    tokens = _TOKEN_RE.findall(text.lower())
    spans = set(tokens)
    for size in range(2, max_tokens + 1):
        # range() is empty when the text has fewer than `size` tokens.
        spans.update(
            " ".join(tokens[start : start + size])
            for start in range(len(tokens) - size + 1)
        )
    # Sorting a set by length alone leaves equal-length spans in hash order,
    # which changes between interpreter runs; the secondary key makes the
    # result (and any first-wins tie-breaking by callers) reproducible.
    return sorted(spans, key=lambda span: (-len(span), span))
def _similarity(left: str, right: str) -> float:
compact_left = re.sub(r"[^a-z0-9]", "", left.lower())
compact_right = re.sub(r"[^a-z0-9]", "", right.lower())
if not compact_left or not compact_right:
return 0.0
if compact_left == compact_right:
return 1.0
return SequenceMatcher(None, compact_left, compact_right).ratio()
|