File size: 2,943 Bytes
f44aac9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
from __future__ import annotations

from dataclasses import dataclass
from difflib import SequenceMatcher
import re


@dataclass(frozen=True)
class Correction:
    """Immutable record of one replacement of a text span by a canonical name."""

    # The span of text that was matched.
    original: str
    # The canonical spelling associated with the match.
    canonical: str
    # Similarity score of the match, kept at full precision on the instance.
    confidence: float

    def to_dict(self) -> dict:
        """Return the record as a plain dict, with confidence rounded to 3 places."""
        payload: dict = {"original": self.original, "canonical": self.canonical}
        payload["confidence"] = round(self.confidence, 3)
        return payload


# Canonical term -> lowercase alias spellings. normalize_text() compares spans
# of the input against these aliases and rewrites a sufficiently similar span
# to the dictionary key. Entries are visited in insertion order, and a span
# already claimed by an earlier entry is not reused by a later one, so the
# order of this table matters.
ALIASES: dict[str, tuple[str, ...]] = {
    "Nemotron": ("nemotron", "nemo tron", "neutron", "nemotran", "nemo-tron"),
    "MiniCPM5": ("minicpm5", "mini cpm5", "mini cpm", "open cpm", "opencpm5", "cpm five"),
    "EmbeddingGemma": ("embedding gemma", "embeddinggemma", "gemma embedding", "embedded gemma"),
    "ZeroGPU": ("zero gpu", "zerogpu", "zero-gpu", "zero g p u"),
    "Gradio Server": ("gradio server", "gradio.server", "server mode"),
    "Build Small Hackathon": ("build small", "build-small", "small hackathon"),
    "Off the Grid": ("off the grid", "off-grid", "offline badge"),
    "Well-Tuned": ("well tuned", "well-tuned", "fine tune", "finetune", "lora"),
    "Tiny Titan": ("tiny titan", "tiny tight end", "tiny-titan"),
    "Llama Champion": ("llama champion", "llama.cpp", "llama cpp", "llama badge"),
}

# A token is a run of ASCII letters/digits, optionally followed by exactly one
# "." or "-" and a second such run (so "llama.cpp" and "zero-gpu" are single
# tokens). Matching is case-insensitive.
_TOKEN_RE = re.compile(r"[a-z0-9]+(?:[.-][a-z0-9]+)?", re.IGNORECASE)


def normalize_text(text: str) -> tuple[str, list[Correction]]:
    """Rewrite fuzzy mentions of known terms in *text* to their canonical names.

    For each entry in ``ALIASES`` (in insertion order) the candidate span of
    the input that is most similar to any of the entry's aliases is selected,
    provided its similarity is at least 0.88. The first occurrence of that
    span in the text that is not already spelled canonically is replaced by
    the canonical name. A span claimed by one entry is not reused by another.

    Args:
        text: Free-form input text.

    Returns:
        A ``(normalized, corrections)`` pair: the rewritten text and one
        ``Correction`` per replacement that was actually applied. The
        ``original`` field of each correction is the lower-cased span.
    """
    normalized = text
    corrections: list[Correction] = []
    spans = _candidate_spans(text)
    used: set[str] = set()

    for canonical, aliases in ALIASES.items():
        best: tuple[str, float] | None = None
        for alias in aliases:
            for span in spans:
                confidence = _similarity(alias, span)
                # Strict ">" keeps the first span seen among equal scores.
                if confidence >= 0.88 and (best is None or confidence > best[1]):
                    best = (span, confidence)
        if best is None:
            continue

        original, confidence = best
        if original.lower() in used:
            continue

        # Spans are lower-cased tokens joined by one space, whereas the text
        # may separate them with any whitespace. The lookarounds pin the match
        # to whole tokens so a short span such as "lora" is never rewritten
        # inside a longer word such as "Flora".
        pattern = re.compile(
            r"(?<![A-Za-z0-9])"
            + r"\s+".join(re.escape(token) for token in original.split(" "))
            + r"(?![A-Za-z0-9])",
            re.IGNORECASE,
        )
        # Skip occurrences that are already spelled canonically: replacing
        # them would be a no-op and must not be reported as a correction.
        match = next(
            (found for found in pattern.finditer(normalized) if found.group(0) != canonical),
            None,
        )
        if match is None:
            # Nothing to change in the text, so there is nothing to report.
            continue

        used.add(original.lower())
        # Splice by position so the canonical name is inserted literally
        # (no backslash/group-reference interpretation as in re.sub).
        normalized = normalized[: match.start()] + canonical + normalized[match.end() :]
        corrections.append(Correction(original=original, canonical=canonical, confidence=confidence))

    return normalized, corrections


def _candidate_spans(text: str) -> list[str]:
    """Return the distinct 1-, 2- and 3-token spans of *text*, longest first.

    Tokens are the ``_TOKEN_RE`` matches in the lower-cased text; multi-token
    spans join consecutive tokens with a single space. Spans of equal length
    keep first-seen order (single tokens, then pairs, then triples, each in
    text order), so the result does not depend on string hash randomization.

    Args:
        text: Free-form input text.

    Returns:
        De-duplicated spans sorted by descending character length.
    """
    tokens = _TOKEN_RE.findall(text.lower())
    # dict keys de-duplicate like a set but preserve insertion order, which
    # makes the order of equal-length spans reproducible between runs.
    spans: dict[str, None] = dict.fromkeys(tokens)
    for size in (2, 3):
        # range() of a non-positive stop is empty, so short inputs need no guard.
        for start in range(len(tokens) - size + 1):
            spans.setdefault(" ".join(tokens[start : start + size]))
    # sorted() is stable (also with reverse=True), so ties keep insertion order.
    return sorted(spans, key=len, reverse=True)


def _similarity(left: str, right: str) -> float:
    compact_left = re.sub(r"[^a-z0-9]", "", left.lower())
    compact_right = re.sub(r"[^a-z0-9]", "", right.lower())
    if not compact_left or not compact_right:
        return 0.0
    if compact_left == compact_right:
        return 1.0
    return SequenceMatcher(None, compact_left, compact_right).ratio()