Spaces:

build-small-hackathon
/

hackathon-advisor

Running on Zero

App Files Files Community

hackathon-advisor / hackathon_advisor /aliases.py

JacobLinCool

fix: align space metadata

f44aac9 verified 3 days ago

raw

history blame contribute delete

2.94 kB

	from __future__ import annotations

	from dataclasses import dataclass
	from difflib import SequenceMatcher
	import re


	@dataclass(frozen=True)
	class Correction:
	    """One replacement of a span of input text by a canonical name."""

	    # Span as it was matched in the input text.
	    original: str
	    # Canonical spelling that replaced the span.
	    canonical: str
	    # Similarity score of the match (unrounded).
	    confidence: float

	    def to_dict(self) -> dict:
	        """Return the correction as a plain dict, confidence rounded to 3 places."""
	        payload = dict(original=self.original, canonical=self.canonical)
	        payload["confidence"] = round(self.confidence, 3)
	        return payload


	# Maps each canonical name (key) to the alias spellings (value) that
	# normalize_text should rewrite to it. Aliases are compared against
	# candidate spans of the input with _similarity, which lowercases both
	# sides and drops every character outside [a-z0-9], so spacing, hyphens
	# and dots in an alias do not affect the comparison.
	ALIASES: dict[str, tuple[str, ...]] = {
	"Nemotron": ("nemotron", "nemo tron", "neutron", "nemotran", "nemo-tron"),
	"MiniCPM5": ("minicpm5", "mini cpm5", "mini cpm", "open cpm", "opencpm5", "cpm five"),
	"EmbeddingGemma": ("embedding gemma", "embeddinggemma", "gemma embedding", "embedded gemma"),
	"ZeroGPU": ("zero gpu", "zerogpu", "zero-gpu", "zero g p u"),
	"Gradio Server": ("gradio server", "gradio.server", "server mode"),
	"Build Small Hackathon": ("build small", "build-small", "small hackathon"),
	"Off the Grid": ("off the grid", "off-grid", "offline badge"),
	"Well-Tuned": ("well tuned", "well-tuned", "fine tune", "finetune", "lora"),
	"Tiny Titan": ("tiny titan", "tiny tight end", "tiny-titan"),
	"Llama Champion": ("llama champion", "llama.cpp", "llama cpp", "llama badge"),
	}

	# A token is a run of ASCII letters/digits, optionally followed by exactly
	# one "." or "-" and another run (so "llama.cpp" and "zero-gpu" are single
	# tokens). _candidate_spans applies it to already-lowercased text.
	_TOKEN_RE = re.compile(r"[a-z0-9]+(?:[.-][a-z0-9]+)?", re.IGNORECASE)


	def _span_pattern(span: str) -> re.Pattern[str]:
	    """Compile a regex that finds *span* in the raw text on token boundaries.

	    *span* is a lowercased, single-space-joined run of tokens. In the raw
	    text those tokens may be separated by any run of non-alphanumeric
	    characters, and the match must not start or end inside a longer word.
	    """
	    body = r"[^a-z0-9]+".join(re.escape(token) for token in span.split(" "))
	    return re.compile(rf"(?<![a-z0-9]){body}(?![a-z0-9])", re.IGNORECASE)


	def normalize_text(
	    text: str, *, min_confidence: float = 0.88
	) -> tuple[str, list[Correction]]:
	    """Rewrite known aliases in *text* to their canonical names.

	    For every entry of ALIASES, the candidate span of *text* most similar to
	    the canonical name or one of its aliases is selected; if its similarity
	    reaches *min_confidence*, its first occurrence is replaced by the
	    canonical name.

	    Args:
	        text: Raw input text.
	        min_confidence: Minimum _similarity score for a span to be accepted.

	    Returns:
	        A ``(normalized_text, corrections)`` pair. ``corrections`` lists one
	        Correction per replacement actually made, in ALIASES order; its
	        ``original`` is the lowercased candidate span.
	    """
	    normalized = text
	    corrections: list[Correction] = []
	    spans = _candidate_spans(text)
	    used: set[str] = set()

	    for canonical, aliases in ALIASES.items():
	        best: tuple[str, float] | None = None
	        # The canonical name is tried first so that text already containing
	        # the full name wins over a partial alias inside it (e.g. the alias
	        # "build small" inside "Build Small Hackathon"); later candidates
	        # only replace `best` on a strictly higher score.
	        for name in (canonical, *aliases):
	            for span in spans:
	                confidence = _similarity(name, span)
	                if confidence >= min_confidence and (
	                    best is None or confidence > best[1]
	                ):
	                    best = (span, confidence)
	        if best is None:
	            continue

	        span, confidence = best
	        key = span.lower()
	        if key in used:
	            continue

	        # Locate the span in the text as written (original casing and
	        # separators). No match means there is nothing to replace, so no
	        # correction is reported either.
	        match = _span_pattern(span).search(normalized)
	        if match is None:
	            continue
	        used.add(key)

	        # Text that is already spelled canonically is not a correction.
	        if match.group(0) == canonical:
	            continue

	        normalized = normalized[: match.start()] + canonical + normalized[match.end() :]
	        corrections.append(
	            Correction(original=span, canonical=canonical, confidence=confidence)
	        )

	    return normalized, corrections


	def _candidate_spans(text: str, max_tokens: int = 4) -> list[str]:
	    """Return the distinct token n-grams of *text*, longest string first.

	    The text is lowercased and tokenized with _TOKEN_RE; every single token
	    and every run of 2..max_tokens consecutive tokens (joined by one space)
	    is a candidate span.

	    Args:
	        text: Raw input text.
	        max_tokens: Largest n-gram size to generate. The default of 4 covers
	            the longest alias in ALIASES ("zero g p u"); with a limit of 3
	            that phrase could only be matched partially.

	    Returns:
	        Unique spans sorted by descending character length. Spans of equal
	        length keep a deterministic order: single tokens in order of first
	        appearance, then 2-grams, then 3-grams, and so on.
	    """
	    tokens = _TOKEN_RE.findall(text.lower())
	    # A dict is used as an insertion-ordered set: iterating a plain set of
	    # strings depends on hash randomization, which made tie order (and thus
	    # which span wins a confidence tie downstream) vary between runs.
	    spans: dict[str, None] = dict.fromkeys(tokens)
	    for size in range(2, max_tokens + 1):
	        for start in range(len(tokens) - size + 1):
	            spans[" ".join(tokens[start : start + size])] = None
	    # sorted() is stable even with reverse=True, so equal-length spans stay
	    # in insertion order.
	    return sorted(spans, key=len, reverse=True)


	def _similarity(left: str, right: str) -> float:
	    """Score how alike two strings are, ignoring case and punctuation.

	    Both inputs are lowercased and stripped of every character outside
	    [a-z0-9]. Returns 0.0 if either is empty after stripping, 1.0 if the
	    stripped forms are equal, otherwise the SequenceMatcher ratio.
	    """
	    a, b = (re.sub(r"[^a-z0-9]", "", value.lower()) for value in (left, right))
	    if not (a and b):
	        return 0.0
	    return 1.0 if a == b else SequenceMatcher(None, a, b).ratio()