# app.py
# Hugging Face Space: Dog breed classifier with AKC data join
# a. Loads a vision classifier (image -> breed label)
# b. Precomputes a robust mapping from model labels (dogmodelbreedlist.json)
#    to AKC display names (akc-data-latest.csv), including variant-flip
#    ("Standard Poodle" vs. "Poodle (Standard)") and a small alias table.
# c. Uses the mapping at inference time so results are fast and consistent.

import os
import re
import json
import traceback
from typing import List, Dict, Tuple, Optional

import gradio as gr
import pandas as pd
from PIL import Image
from difflib import get_close_matches
from unicodedata import normalize as _ud_norm
from transformers import pipeline

# -----------------------Configuration--------------
MODEL_ID = os.getenv("MODEL_ID", "valentinocc/dog-breed-classifier")  # image-classification model
DOG_LABELS_PATH = os.getenv("DOG_LABELS_PATH", "dogmodelbreedlist.json")
AKC_CSV_PATH = os.getenv("AKC_CSV_PATH", "akc-data-latest.csv")
TOP_K = int(os.getenv("TOP_K", "5"))


# ----------------1) AKC CSV load + breed indexing----------------
def _choose_akc_breed_col(df: pd.DataFrame) -> str:
    """Pick the column of *df* that holds the AKC breed name.

    Preference order:
      1. a column named exactly ``breed`` (case-insensitive),
      2. the first column whose name contains ``breed``,
      3. the first column named ``title`` or whose name contains ``name``,
      4. the first object-dtype column,
      5. the first column.

    Raises:
        ValueError: if *df* has no columns at all.
    """
    cols = list(df.columns)
    if not cols:
        raise ValueError("AKC table has no columns")

    # Column labels are not guaranteed to be strings; compare on str().
    lowered = [(c, str(c).strip().lower()) for c in cols]

    # An exact match wins so that e.g. "breed_group" appearing before "breed"
    # is not picked (renaming it would create two "breed" columns).
    for c, cl in lowered:
        if cl == "breed":
            return c
    # strong preferred
    for c, cl in lowered:
        if "breed" in cl:
            return c
    # fallback: name-ish columns
    for c, cl in lowered:
        if cl == "title" or "name" in cl:
            return c
    # last resort: first likely string column
    for c in cols:
        if pd.api.types.is_object_dtype(df[c]):
            return c
    # absolute fallback
    return cols[0]


# Compiled once; _canonical_norm runs for every AKC name and model label.
_APOSTROPHES_RE = re.compile(r"[’'`]")
_SEPARATORS_RE = re.compile(r"[-–—_/]")
_NON_KEY_CHARS_RE = re.compile(r"[^a-z0-9& ]+")
_WHITESPACE_RE = re.compile(r"\s+")


def _canonical_norm(s: str) -> str:
    """Return a strong lookup key for a breed name.

    Strips accents, lowercases, drops apostrophes, and turns every other
    punctuation run (dashes, slashes, underscores, parentheses, ...) into a
    single space.

    Typographic apostrophes and dashes are handled *before* the ASCII fold:
    they have no ASCII decomposition, so folding first would silently delete
    them and glue the neighbouring words together ("Curly–Coated" would become
    "curlycoated" instead of "curly coated").
    """
    s = _ud_norm("NFKD", str(s))
    s = _APOSTROPHES_RE.sub("", s)
    s = _SEPARATORS_RE.sub(" ", s)
    s = s.encode("ascii", "ignore").decode("ascii")
    s = s.lower().strip()
    # Parentheses and any remaining punctuation become spaces here.
    s = _NON_KEY_CHARS_RE.sub(" ", s)
    return _WHITESPACE_RE.sub(" ", s).strip()


def _load_akc_table(path: str) -> Tuple[pd.DataFrame, Dict[str, int], Dict[str, str]]:
    """Load the AKC CSV at *path* and build the lookup indexes.

    Returns:
        A tuple ``(df, name_to_idx, display_by_norm)`` where
        - ``df`` is the table with the breed column renamed to ``"breed"``,
        - ``name_to_idx`` maps a normalized key to the positional row index,
        - ``display_by_norm`` maps a normalized key to the display name.
        Both maps contain the direct key and, for names shaped like
        "Base (Variant)", the flipped "variant base" key.
    """
    df = pd.read_csv(path)
    breed_col = _choose_akc_breed_col(df)
    df = df.copy()
    df.rename(columns={breed_col: "breed"}, inplace=True)

    # Build direct and "variant flipped" lookup keys
    akc_display_by_norm: Dict[str, str] = {}
    akc_name_to_idx: Dict[str, int] = {}
    for i, value in enumerate(df["breed"].tolist()):
        # A missing cell would otherwise be indexed under the bogus key "nan".
        if pd.isna(value):
            continue
        name = str(value)
        n = _canonical_norm(name)
        if not n:
            continue
        akc_display_by_norm[n] = name
        akc_name_to_idx[n] = i

        # flip "Poodle (Standard)" -> "standard poodle"
        m = re.match(r"^(.*)\s\(([^)]+)\)$", name.strip())
        if m:
            base, var = m.group(1), m.group(2)
            flip = _canonical_norm(f"{var} {base}")
            # setdefault: a flipped key never overrides an existing entry.
            akc_display_by_norm.setdefault(flip, name)
            akc_name_to_idx.setdefault(flip, i)
    return df, akc_name_to_idx, akc_display_by_norm


akc_df, akc_name_to_idx, akc_display_by_norm = _load_akc_table(AKC_CSV_PATH)


# -------------2) Model label list + precomputed mapping to increase speed---------------
def _read_model_labels(path: str) -> List[str]:
    """Read the model's label list from the JSON file at *path*.

    Accepted shapes: a dict with ``id2label`` (its values are used), a dict
    with ``labels``, or a bare list.

    Raises:
        ValueError: if the JSON has none of the accepted shapes.
    """
    with open(path, "r", encoding="utf-8") as f:
        j = json.load(f)
    if isinstance(j, dict) and "id2label" in j:
        return list(j["id2label"].values())
    if isinstance(j, dict) and "labels" in j:
        return j["labels"]
    if isinstance(j, list):
        return j
    raise ValueError("dogmodelbreedlist.json must be a list or have id2label/labels")


MODEL_LABELS: List[str] = _read_model_labels(DOG_LABELS_PATH)

# Account for common size/variety tokens used in AKC naming
SIZE_VARIANTS = {
    "toy", "miniature", "standard", "giant", "medium", "small", "large",
    "smooth", "wire", "longhaired", "shorthaired", "wirehaired",
}

# Focused alias list for known troublemakers.
# Keys are matched against _canonical_norm() output, so they must be lowercase
# with single spaces. "wire haired fox terrier" used to be listed twice; only
# the last value ("Wire Fox Terrier") ever took effect, and it is the one kept.
ALIAS_DIRECT: Dict[str, str] = {
    "eskimo dog": "American Eskimo Dog",
    "wire haired fox terrier": "Wire Fox Terrier",
    "smooth fox terrier": "Fox Terrier (Smooth)",
    "black and tan coonhound": "Black and Tan Coonhound",
    "german short haired pointer": "German Shorthaired Pointer",
    "german long haired pointer": "German Longhaired Pointer",
    "curly coated retriever": "Curly-Coated Retriever",
    "flat coated retriever": "Flat-Coated Retriever",
    "yorkshire terrier": "Yorkshire Terrier",
    "welsh springer spaniel": "Welsh Springer Spaniel",
    "english springer": "English Springer Spaniel",
    "standard poodle": "Poodle (Standard)",
    "miniature poodle": "Poodle (Miniature)",
    "toy poodle": "Poodle (Toy)",
    "bluetick": "Bluetick Coonhound",
    "walker hound": "Treeing Walker Coonhound",
    "clumber": "Clumber Spaniel",
}

# Trailing generic tokens that step 4 of the mapping tries to drop, in order.
_TRAILING_GENERIC_PATTERNS = (r"\bdog\b$", r"\bterrier\b$", r"\bhound\b$")


def _precompute_model_to_akc_map(
    model_labels: List[str],
    akc_display_by_norm: Dict[str, str]
) -> Tuple[Dict[str, str], List[str]]:
    """Build the map: raw model label -> AKC display name.

    Each label is resolved by the first strategy that hits:
      1. direct normalized match,
      2. alias table (``ALIAS_DIRECT``),
      3. size/variety flip ("toy poodle" -> "poodle toy"),
      4. dropping a trailing "dog" / "terrier" / "hound",
      5. fuzzy match with a tight cutoff.

    Returns:
        ``(mapping, unmapped_list)`` where ``unmapped_list`` holds the raw
        labels that no strategy could resolve.
    """
    model2akc: Dict[str, str] = {}
    unmapped: List[str] = []

    # Loop invariants: normalized alias keys and the fuzzy-match candidates.
    alias_by_norm = {_canonical_norm(k): v for k, v in ALIAS_DIRECT.items()}
    akc_keys = list(akc_display_by_norm)

    for raw in model_labels:
        norm = _canonical_norm(raw)

        # 1) direct
        if norm in akc_display_by_norm:
            model2akc[raw] = akc_display_by_norm[norm]
            continue

        # 2) alias
        alias = alias_by_norm.get(norm)
        if alias:
            alias_norm = _canonical_norm(alias)
            if alias_norm in akc_display_by_norm:
                model2akc[raw] = akc_display_by_norm[alias_norm]
                continue

        # 3) safe variant flip ("toy poodle" -> "Poodle (Toy)")
        parts = norm.split(" ", 1)
        if len(parts) == 2 and parts[0] in SIZE_VARIANTS:
            # Normalizing "Poodle (Toy)" yields exactly "poodle toy".
            f_norm = f"{parts[1]} {parts[0]}"
            if f_norm in akc_display_by_norm:
                model2akc[raw] = akc_display_by_norm[f_norm]
                continue

        # 4) strip trailing generic tokens and try again.
        # Ordered on purpose: iterating a set here made the winner depend on
        # per-process string hashing when more than one candidate matched.
        stripped = (re.sub(p, "", norm).strip() for p in _TRAILING_GENERIC_PATTERNS)
        hit = next(
            (akc_display_by_norm[k] for k in stripped if k and k in akc_display_by_norm),
            None,
        )
        if hit:
            model2akc[raw] = hit
            continue

        # 5) fuzzy (final resort; tight cutoff)
        cand = get_close_matches(norm, akc_keys, n=1, cutoff=0.87)
        if cand:
            model2akc[raw] = akc_display_by_norm[cand[0]]
        else:
            unmapped.append(raw)

    return model2akc, unmapped


MODEL2AKC_MAP, _UNMAPPED = _precompute_model_to_akc_map(MODEL_LABELS, akc_display_by_norm)
if _UNMAPPED:
    print(f"[DogBreedID] Unmapped model labels ({len(_UNMAPPED)}): {sorted(set(_UNMAPPED))}")

#------------------- 3) Load inference pipeline----------------------------
clf = pipeline(
    task="image-classification",
    model=MODEL_ID
)


# ------------------- 4) UI / inference helpers ---------------------------
def _row_markdown(row: pd.Series) -> str:
    """Render one AKC row as markdown, one bold-labelled paragraph per field.

    The ``breed`` column and empty/missing values are skipped. Returns a
    placeholder line when nothing is left to show.
    """
    parts = []
    for col in row.index:
        if col == "breed":
            continue
        val = row[col]
        if pd.isna(val):
            continue
        text = str(val).strip()
        if not text:
            continue
        # str(col): column labels are not guaranteed to be strings.
        parts.append(f"**{str(col).replace('_', ' ').title()}:** {text}")
    return "\n\n".join(parts) if parts else "_No extra AKC info available._"


def _lookup_row_by_display_name(akc_display: str) -> Optional[pd.Series]:
    """Return the AKC row for *akc_display*, or ``None`` if it is not indexed."""
    key = _canonical_norm(akc_display)
    idx = akc_name_to_idx.get(key)
    if idx is None:
        return None
    try:
        return akc_df.iloc[idx]
    except IndexError:
        return None


def predict(image: Optional[Image.Image]) -> str:
    """Classify *image* and return a markdown report.

    The report has a detailed section for the top-1 prediction (with the AKC
    row when one is mapped) followed by a table of the top-K predictions.
    Returns a short message instead when no image was supplied, when
    inference fails, or when the model returns nothing.
    """
    # The button can be clicked before anything is uploaded.
    if image is None:
        return "Please upload an image first."

    try:
        preds = clf(image, top_k=TOP_K)
    except Exception:
        # UI boundary: log the traceback and show a friendly message.
        traceback.print_exc()
        return "Inference error. Check model/requirements."

    # Build table of predictions, mapped names, and AKC info for top-1
    if not preds:
        return "No predictions."

    lines = ["# Predictions"]

    # Top-1 detailed info
    top = preds[0]
    raw_label = top.get("label", "Unknown")
    score = float(top.get("score", 0.0))
    akc_display = MODEL2AKC_MAP.get(raw_label)
    header = f"**Model:** {raw_label} | **Confidence:** {score:.2%}"
    if akc_display:
        header += f"\n\n**AKC Match:** {akc_display}"
        row = _lookup_row_by_display_name(akc_display)
        if row is not None:
            lines.append(header)
            lines.append("\n" + _row_markdown(row))
        else:
            lines.append(header + "\n\n_AKC row not found._")
    else:
        lines.append(header + "\n\n_No AKC match found (check alias rules)._")

    # Top-K summary table
    lines.append("\n---\n")
    lines.append("### Top Matches")
    lines.append("| Rank | Model Label | Confidence | AKC Match |")
    lines.append("|---:|---|---:|---|")
    for i, p in enumerate(preds, start=1):
        lbl = p.get("label", "Unknown")
        sc = float(p.get("score", 0.0))
        akc_match = MODEL2AKC_MAP.get(lbl, "—")
        lines.append(f"| {i} | {lbl} | {sc:.2%} | {akc_match} |")
    return "\n".join(lines)


# -----------
# 5) Gradio UI
# -----------
# NOTE(review): no component visible in this file sets elem_id="app", so this
# rule may not apply to anything — confirm.
CSS = """
#app {max-width: 980px; margin: auto;}
"""

with gr.Blocks(css=CSS, fill_height=True) as demo:
    gr.Markdown("# Dog Breed ID + AKC Info")
    gr.Markdown(
        f"Upload an image of a dog. The app predicts the breed using '{MODEL_ID}' "
        "and shows breed details from the American Kennel Club dataset. "
        "Dataset: https://github.com/tmfilho/akcdata/blob/master/data/akc-data-latest.csv"
    )
    with gr.Row():
        with gr.Column(scale=1):
            inp = gr.Image(label="Dog image", type="pil")
            btn = gr.Button("Predict", variant="primary")
        with gr.Column(scale=1):
            out = gr.Markdown()
    btn.click(fn=predict, inputs=inp, outputs=out)

if __name__ == "__main__":
    demo.launch()