# app.py
# Hugging Face Space: Dog breed classifier with AKC data join
# a. Loads a vision classifier (image -> breed label)
# b. Precomputes a robust mapping from model labels (dogmodelbreedlist.json)
# to AKC display names (akc-data-latest.csv), including variant-flip
# ("Standard Poodle" vs. "Poodle (Standard)") and a small alias table.
# c. Uses the mapping at inference time so results are fast and consistent.
import os
import re
import json
import traceback
from typing import List, Dict, Tuple, Optional
import gradio as gr
import pandas as pd
from PIL import Image
from difflib import get_close_matches
from unicodedata import normalize as _ud_norm
from transformers import pipeline
# -----------------------Configuration--------------
# Each setting falls back to its default when the environment variable of the
# same name is not set.
MODEL_ID = os.environ.get("MODEL_ID", "valentinocc/dog-breed-classifier")  # image-classification model
DOG_LABELS_PATH = os.environ.get("DOG_LABELS_PATH", "dogmodelbreedlist.json")  # model label list (JSON)
AKC_CSV_PATH = os.environ.get("AKC_CSV_PATH", "akc-data-latest.csv")  # AKC breed table (CSV)
TOP_K = int(os.environ.get("TOP_K", "5"))  # number of predictions requested per image
# ----------------1) AKC CSV load + breed indexing----------------
def _choose_akc_breed_col(df: pd.DataFrame) -> str:
    """
    Pick the column of ``df`` that holds the breed name.

    Candidates are tried in this order and the first hit wins:
      1. a column whose name contains "breed" (case-insensitive);
      2. a column named "title" or whose name contains "name";
      3. the first text column (object dtype or a pandas string dtype);
      4. the first column of the frame.

    Raises:
        IndexError: if ``df`` has no columns at all.
    """
    cols = list(df.columns)
    # str() guards against non-string column labels (e.g. integer headers).
    lowered = [(c, str(c).lower()) for c in cols]

    # strong preferred
    for col, name in lowered:
        if "breed" in name:
            return col

    # fallback: any name containing "breed" was already returned above, so
    # only "title" and name-ish columns remain to be checked here.
    for col, name in lowered:
        if name == "title" or "name" in name:
            return col

    # last resort: first likely string column. Text columns are not always
    # object dtype (pandas also has dedicated string dtypes), so accept both.
    for col in cols:
        series = df[col]
        if pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
            return col

    # absolute fallback
    return cols[0]
def _canonical_norm(s: str) -> str:
    """
    Strong key normalizer: strip accents, lowercase, collapse punctuation/spaces.

    The result contains only ``a-z``, ``0-9``, ``&`` and single spaces, so that
    "Curly-Coated Retriever", "curly coated retriever" and
    "Curly–Coated  Retriever" all produce the same lookup key.

    Args:
        s: any value; it is converted with ``str()`` first.

    Returns:
        The normalized key (possibly an empty string).
    """
    text = str(s)
    # Unicode dashes must become separators BEFORE ASCII folding: the
    # encode(..., "ignore") step below deletes every non-ASCII character,
    # which would otherwise glue "Curly–Coated" into "curlycoated".
    text = re.sub("[\u2010-\u2015\u2212]", " ", text)
    # Decompose accented letters, then drop whatever is still non-ASCII
    # (combining marks, curly quotes, ...).
    text = _ud_norm("NFKD", text).encode("ascii", "ignore").decode("ascii")
    text = text.lower().strip()
    # Apostrophes vanish without leaving a gap: "st. john's" -> "st. johns".
    text = re.sub(r"['`]", "", text)
    # Hyphens, underscores, slashes and parentheses act as word separators.
    text = re.sub(r"[-_/()]", " ", text)
    # Any remaining punctuation also becomes a separator.
    text = re.sub(r"[^a-z0-9& ]+", " ", text)
    return re.sub(r"\s+", " ", text).strip()
def _load_akc_table(path: str) -> Tuple[pd.DataFrame, Dict[str, int], Dict[str, str]]:
    """
    Read the AKC CSV at ``path`` and index it by normalized breed name.

    Returns a 3-tuple:
      - the table, with the detected breed column renamed to "breed"
      - normalized key -> positional row index
      - normalized key -> display name as written in the CSV

    Every breed is indexed under its own normalized name. A name of the form
    "Base (Variant)" is additionally indexed under the normalized
    "Variant Base", unless that key is already taken.
    """
    raw = pd.read_csv(path)
    table = raw.rename(columns={_choose_akc_breed_col(raw): "breed"})

    row_by_key: Dict[str, int] = {}
    display_by_key: Dict[str, str] = {}
    for row_pos, display in enumerate(table["breed"].astype(str).tolist()):
        direct_key = _canonical_norm(display)
        # Direct keys always overwrite: the last row with a given name wins.
        display_by_key[direct_key] = display
        row_by_key[direct_key] = row_pos

        # flip "Poodle (Standard)" -> "standard poodle"
        variant = re.match(r"^(.*)\s\(([^)]+)\)$", display.strip())
        if variant is None:
            continue
        base, var = variant.groups()
        flipped_key = _canonical_norm(f"{var} {base}")
        # Flipped keys only fill gaps; they never replace an existing entry.
        display_by_key.setdefault(flipped_key, display)
        row_by_key.setdefault(flipped_key, row_pos)

    return table, row_by_key, display_by_key
# Load the AKC table once at import time. The three results are module-level
# globals: the table itself, normalized key -> row position, and normalized
# key -> display name.
akc_df, akc_name_to_idx, akc_display_by_norm = _load_akc_table(AKC_CSV_PATH)
# -------------2) Model label list + precomputed mapping to increase speed---------------
def _read_model_labels(path: str) -> List[str]:
    """
    Read the model's label list from the JSON file at ``path``.

    Accepted layouts:
      - an object with an "id2label" mapping: its values, in file order
      - an object with a "labels" entry: that entry as-is
      - a bare JSON list: the list itself

    Raises:
        ValueError: if the JSON matches none of the layouts above.
        OSError / json.JSONDecodeError: propagated from reading and parsing.
    """
    # Read as UTF-8 explicitly so breed names with accents decode the same
    # way regardless of the platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        payload = json.load(f)

    if isinstance(payload, dict):
        if "id2label" in payload:
            return list(payload["id2label"].values())
        if "labels" in payload:
            return payload["labels"]
    elif isinstance(payload, list):
        return payload
    raise ValueError("dogmodelbreedlist.json must be a list or have id2label/labels")
# Raw label strings read from DOG_LABELS_PATH once, at import time.
MODEL_LABELS: List[str] = _read_model_labels(DOG_LABELS_PATH)
# Account for common size/variety tokens used in AKC naming
SIZE_VARIANTS = {
    # size words
    "toy",
    "miniature",
    "small",
    "medium",
    "standard",
    "large",
    "giant",
    # coat words
    "smooth",
    "wire",
    "wirehaired",
    "shorthaired",
    "longhaired",
}
# Focused alias list for known troublemakers
# Keys are looked up with the *normalized* model label (lowercase, single
# spaces, no punctuation), so every key must be written in that form or it
# can never match. Values are AKC display names.
ALIAS_DIRECT: Dict[str, str] = {
    "eskimo dog": "American Eskimo Dog",
    # This key used to appear twice; in a dict literal the last entry wins,
    # so only the effective mapping ("Wire Fox Terrier") is kept.
    "wire haired fox terrier": "Wire Fox Terrier",
    "smooth fox terrier": "Fox Terrier (Smooth)",
    "black and tan coonhound": "Black and Tan Coonhound",
    "german short haired pointer": "German Shorthaired Pointer",
    "german long haired pointer": "German Longhaired Pointer",
    "curly coated retriever": "Curly-Coated Retriever",
    "flat coated retriever": "Flat-Coated Retriever",
    "yorkshire terrier": "Yorkshire Terrier",
    "welsh springer spaniel": "Welsh Springer Spaniel",
    "english springer": "English Springer Spaniel",
    "standard poodle": "Poodle (Standard)",
    "miniature poodle": "Poodle (Miniature)",
    "toy poodle": "Poodle (Toy)",
    "bluetick": "Bluetick Coonhound",
    # Was "walker Hound": the capital H made the entry unreachable.
    "walker hound": "Treeing Walker Coonhound",
    "clumber": "Clumber Spaniel",
}
def _resolve_akc_display(
    norm: str,
    akc_display_by_norm: Dict[str, str],
    akc_keys: List[str],
) -> Optional[str]:
    """Return the AKC display name for one normalized label, or None if no rule matches."""
    # 1) direct
    if norm in akc_display_by_norm:
        return akc_display_by_norm[norm]

    # 2) alias
    alias = ALIAS_DIRECT.get(norm)
    if alias:
        alias_norm = _canonical_norm(alias)
        if alias_norm in akc_display_by_norm:
            return akc_display_by_norm[alias_norm]

    # 3) safe variant flip ("toy poodle" -> "Poodle (Toy)")
    head, sep, tail = norm.partition(" ")
    if sep and head in SIZE_VARIANTS:
        flipped_norm = _canonical_norm(f"{tail.title()} ({head.title()})")
        if flipped_norm in akc_display_by_norm:
            return akc_display_by_norm[flipped_norm]

    # 4) strip one trailing generic token and try again. Patterns are tried
    #    in a fixed order; a candidate that is empty or unchanged is skipped
    #    (the unchanged label already missed in step 1).
    for pattern in (r"\bdog\b$", r"\bterrier\b$", r"\bhound\b$"):
        candidate = re.sub(pattern, "", norm).strip()
        if candidate and candidate != norm and candidate in akc_display_by_norm:
            return akc_display_by_norm[candidate]

    # 5) fuzzy (final resort; tight cutoff)
    close = get_close_matches(norm, akc_keys, n=1, cutoff=0.87)
    if close:
        return akc_display_by_norm[close[0]]
    return None


def _precompute_model_to_akc_map(
    model_labels: List[str],
    akc_display_by_norm: Dict[str, str]
) -> Tuple[Dict[str, str], List[str]]:
    """
    Build a map: raw model label -> AKC display name.

    Several model labels may resolve to the same AKC name. Each label is
    normalized and then resolved by the first rule that hits: direct key,
    alias table, size/coat variant flip, trailing "dog"/"terrier"/"hound"
    removal, and finally a fuzzy match with a 0.87 cutoff.

    Args:
        model_labels: raw label strings as emitted by the classifier.
        akc_display_by_norm: normalized key -> AKC display name.

    Returns (mapping, unmapped_list); ``unmapped_list`` keeps the labels no
    rule could place, in input order (duplicates included).
    """
    model2akc: Dict[str, str] = {}
    unmapped: List[str] = []
    # The fuzzy-match candidate list is identical for every label, so it is
    # built once instead of once per unmatched label.
    akc_keys = list(akc_display_by_norm)

    for raw in model_labels:
        display = _resolve_akc_display(_canonical_norm(raw), akc_display_by_norm, akc_keys)
        if display is None:
            unmapped.append(raw)
        else:
            model2akc[raw] = display
    return model2akc, unmapped
# Resolve every model label once, at import time.
MODEL2AKC_MAP, _UNMAPPED = _precompute_model_to_akc_map(
    MODEL_LABELS,
    akc_display_by_norm,
)
if _UNMAPPED:
    # Report labels that no rule could place (count is raw, list is de-duplicated).
    print(f"[DogBreedID] Unmapped model labels ({len(_UNMAPPED)}): {sorted(set(_UNMAPPED))}")
#------------------- 3) Load inference pipeline----------------------------
# Build the transformers image-classification pipeline for MODEL_ID once, at
# import time; the resulting `clf` object is a module-level global.
clf = pipeline(
    task="image-classification",
    model=MODEL_ID
)
# ------------------- 4) UI / inference helpers ---------------------------
def _row_markdown(row: pd.Series) -> str:
    """
    Render one AKC row as markdown, one bold-labelled paragraph per field.

    The "breed" field, missing values and blank values are left out. When
    nothing remains, a placeholder sentence is returned instead.
    """
    def _paragraphs():
        for column in row.index:
            if column == "breed":
                continue
            value = row[column]
            if pd.isna(value):
                continue
            cleaned = str(value).strip()
            if cleaned:
                # "min_height" -> "Min Height"
                yield f"**{column.replace('_', ' ').title()}:** {cleaned}"

    body = "\n\n".join(_paragraphs())
    # Every paragraph is non-empty, so an empty body means no field survived.
    return body or "_No extra AKC info available._"
def _lookup_row_by_display_name(akc_display: str) -> Optional[pd.Series]:
    """
    Fetch the AKC table row for a display name.

    The name is normalized and looked up in ``akc_name_to_idx``; the stored
    position is then read from ``akc_df``. Returns None when the name is
    unknown or the row cannot be read.
    """
    row_pos = akc_name_to_idx.get(_canonical_norm(akc_display))
    if row_pos is None:
        return None
    try:
        row = akc_df.iloc[row_pos]
    except Exception:
        return None
    return row
def predict(image: Image.Image) -> str:
    """
    Classify ``image`` and return the result as a markdown report.

    The report holds a heading, a detail section for the top prediction
    (model label, confidence, matched AKC name and the AKC row's fields) and
    a table of all returned predictions with their AKC matches.

    Args:
        image: the picture to classify, or None when no image was supplied.

    Returns:
        Markdown text. Short plain messages are returned instead when no
        image was supplied, when inference fails, or when the classifier
        returns nothing.
    """
    # Nothing to classify: answer directly instead of letting the classifier
    # raise and reporting a misleading "inference error".
    if image is None:
        return "Please upload an image first."

    try:
        preds = clf(image, top_k=TOP_K)
    except Exception:
        # UI boundary: print the traceback and show the user a short message.
        traceback.print_exc()
        return "Inference error. Check model/requirements."
    if not preds:
        return "No predictions."

    lines = ["# Predictions"]

    # Top-1 detailed info
    top = preds[0]
    raw_label = top.get("label", "Unknown")
    score = float(top.get("score", 0.0))
    akc_display = MODEL2AKC_MAP.get(raw_label)
    header = f"**Model:** {raw_label} | **Confidence:** {score:.2%}"
    if akc_display:
        header += f"\n\n**AKC Match:** {akc_display}"
        row = _lookup_row_by_display_name(akc_display)
        if row is not None:
            lines.append(header)
            lines.append("\n" + _row_markdown(row))
        else:
            lines.append(header + "\n\n_AKC row not found._")
    else:
        lines.append(header + "\n\n_No AKC match found (check alias rules)._")

    # Top-K summary table
    lines.append("\n---\n")
    lines.append("### Top Matches")
    lines.append("| Rank | Model Label | Confidence | AKC Match |")
    lines.append("|---:|---|---:|---|")
    for i, p in enumerate(preds, start=1):
        lbl = p.get("label", "Unknown")
        sc = float(p.get("score", 0.0))
        akc_match = MODEL2AKC_MAP.get(lbl, "—")
        lines.append(f"| {i} | {lbl} | {sc:.2%} | {akc_match} |")
    return "\n".join(lines)
# -----------
# 5) Gradio UI
# -----------
CSS = """
#app {max-width: 980px; margin: auto;}
"""

with gr.Blocks(css=CSS, fill_height=True) as demo:
    # Title and short description
    gr.Markdown("# Dog Breed ID + AKC Info")
    gr.Markdown(
        f"Upload an image of a dog. The app predicts the breed using '{MODEL_ID}' "
        "and shows breed details from the American Kennel Club dataset. Dataset: https://github.com/tmfilho/akcdata/blob/master/data/akc-data-latest.csv"
    )

    # Two equal columns: input controls on the left, markdown report on the right
    with gr.Row():
        with gr.Column(scale=1):
            inp = gr.Image(label="Dog image", type="pil")
            btn = gr.Button("Predict", variant="primary")
        with gr.Column(scale=1):
            out = gr.Markdown()

    # Clicking the button sends the image to predict() and shows its markdown
    btn.click(predict, inputs=inp, outputs=out)


if __name__ == "__main__":
    demo.launch()