Spaces:
Sleeping
Sleeping
| # app.py | |
| # Hugging Face Space: Dog breed classifier with AKC data join | |
| # a. Loads a vision classifier (image -> breed label) | |
| # b. Precomputes a robust mapping from model labels (dogmodelbreedlist.json) | |
| # to AKC display names (akc-data-latest.csv), including variant-flip | |
| # ("Standard Poodle" vs. "Poodle (Standard)") and a small alias table. | |
| # c. Uses the mapping at inference time so results are fast and consistent. | |
| import os | |
| import re | |
| import json | |
| import traceback | |
| from typing import List, Dict, Tuple, Optional | |
| import gradio as gr | |
| import pandas as pd | |
| from PIL import Image | |
| from difflib import get_close_matches | |
| from unicodedata import normalize as _ud_norm | |
| from transformers import pipeline | |
| # -----------------------Configuration-------------- | |
# Every setting below can be overridden through an environment variable of the
# same name; the second argument is the value used when the variable is unset.

# Hub id of the image-classification model.
MODEL_ID = os.environ.get("MODEL_ID", "valentinocc/dog-breed-classifier")
# JSON file holding the model's label list.
DOG_LABELS_PATH = os.environ.get("DOG_LABELS_PATH", "dogmodelbreedlist.json")
# CSV file holding the AKC breed table.
AKC_CSV_PATH = os.environ.get("AKC_CSV_PATH", "akc-data-latest.csv")
# How many ranked predictions to request from the classifier per image.
TOP_K = int(os.environ.get("TOP_K", "5"))
| # ----------------1) AKC CSV load + breed indexing---------------- | |
| def _choose_akc_breed_col(df: pd.DataFrame) -> str: | |
| """ | |
| pick the AKC breed column. | |
| Prefer columns containing 'breed', else a 'name'ish column, else first object column. | |
| """ | |
| cols = list(df.columns) | |
| lower = [c.lower() for c in cols] | |
| # strong preferred | |
| for c in cols: | |
| if "breed" in c.lower(): | |
| return c | |
| # fallback | |
| for c in cols: | |
| cl = c.lower() | |
| if cl in {"name", "breed_name", "title", "akc_breed"} or "name" in cl: | |
| return c | |
| # last resort: first likely string column | |
| for c in cols: | |
| if pd.api.types.is_object_dtype(df[c]): | |
| return c | |
| # absolute fallback | |
| return cols[0] | |
| def _canonical_norm(s: str) -> str: | |
| """ | |
| Strong key normalizer: strip accents, lowercase, collapse punctuation/spaces. | |
| """ | |
| s = _ud_norm("NFKD", str(s)).encode("ascii", "ignore").decode("ascii") | |
| s = s.lower().strip() | |
| s = re.sub(r"[’'`]", "", s) | |
| s = re.sub(r"[-–—_/]", " ", s) | |
| s = re.sub(r"[()]", " ", s) | |
| s = re.sub(r"[^a-z0-9& ]+", " ", s) | |
| s = re.sub(r"\s+", " ", s).strip() | |
| return s | |
def _load_akc_table(path: str) -> Tuple[pd.DataFrame, Dict[str, int], Dict[str, str]]:
    """
    Load the AKC CSV and build the lookup tables used to join model labels onto it.

    Returns a tuple of:
      - the DataFrame, with the detected breed-name column renamed to "breed"
      - normalized name -> positional row index (suitable for ``DataFrame.iloc``)
      - normalized name -> AKC display name

    Both maps are keyed by ``_canonical_norm`` output. A name written as
    "Base (Variant)" is additionally indexed under the flipped "variant base"
    key; a flipped key never displaces a key that came directly from a breed
    name. Rows whose breed cell is missing, or normalizes to an empty key, are
    left out of both maps (they remain in the DataFrame).
    """
    df = pd.read_csv(path)
    breed_col = _choose_akc_breed_col(df)
    # rename() returns a new frame, so the frame read from disk is not mutated
    df = df.rename(columns={breed_col: "breed"})

    # Build direct and "variant flipped" lookup keys
    akc_display_by_norm: Dict[str, str] = {}
    akc_name_to_idx: Dict[str, int] = {}
    for i, cell in enumerate(df["breed"].tolist()):
        # A missing cell would otherwise be stringified to the bogus breed "nan".
        if pd.isna(cell):
            continue
        name = str(cell)
        n = _canonical_norm(name)
        if not n:
            continue
        akc_display_by_norm[n] = name
        akc_name_to_idx[n] = i
        # flip "Poodle (Standard)" -> "standard poodle"
        m = re.match(r"^(.*)\s\(([^)]+)\)$", name.strip())
        if m:
            base, var = m.group(1), m.group(2)
            flip = _canonical_norm(f"{var} {base}")
            # setdefault: never override a key that is a real breed name
            akc_display_by_norm.setdefault(flip, name)
            akc_name_to_idx.setdefault(flip, i)
    return df, akc_name_to_idx, akc_display_by_norm


akc_df, akc_name_to_idx, akc_display_by_norm = _load_akc_table(AKC_CSV_PATH)
| # -------------2) Model label list + precomputed mapping to increase speed--------------- | |
| def _read_model_labels(path: str) -> List[str]: | |
| with open(path, "r") as f: | |
| j = json.load(f) | |
| if isinstance(j, dict) and "id2label" in j: | |
| return list(j["id2label"].values()) | |
| if isinstance(j, dict) and "labels" in j: | |
| return j["labels"] | |
| if isinstance(j, list): | |
| return j | |
| raise ValueError("dogmodelbreedlist.json must be a list or have id2label/labels") | |
# Model labels read from DOG_LABELS_PATH when this module is imported; they are
# the input to the model-label -> AKC mapping computed further down.
MODEL_LABELS: List[str] = _read_model_labels(DOG_LABELS_PATH)
# Account for common size/variety tokens used in AKC naming.
# A model label that starts with one of these ("toy poodle") is also tried in
# the AKC "Base (Variant)" form ("Poodle (Toy)").
SIZE_VARIANTS = {
    "toy", "miniature", "standard", "giant", "medium", "small", "large",
    "smooth", "wire", "longhaired", "shorthaired", "wirehaired",
}

# Focused alias list for known troublemakers.
# Keys are compared against normalized model labels, so they must be written in
# normalized form: lowercase, single spaces, no punctuation. Each key may
# appear only once -- in a dict literal a repeated key silently keeps the last
# value.
ALIAS_DIRECT: Dict[str, str] = {
    "eskimo dog": "American Eskimo Dog",
    "wire haired fox terrier": "Wire Fox Terrier",
    "smooth fox terrier": "Fox Terrier (Smooth)",
    "black and tan coonhound": "Black and Tan Coonhound",
    "german short haired pointer": "German Shorthaired Pointer",
    "german long haired pointer": "German Longhaired Pointer",
    "curly coated retriever": "Curly-Coated Retriever",
    "flat coated retriever": "Flat-Coated Retriever",
    "yorkshire terrier": "Yorkshire Terrier",
    "welsh springer spaniel": "Welsh Springer Spaniel",
    "english springer": "English Springer Spaniel",
    "standard poodle": "Poodle (Standard)",
    "miniature poodle": "Poodle (Miniature)",
    "toy poodle": "Poodle (Toy)",
    "bluetick": "Bluetick Coonhound",
    "walker hound": "Treeing Walker Coonhound",
    "clumber": "Clumber Spaniel",
}
# A single generic word at the very end of a label ("eskimo dog" -> "eskimo").
_TRAILING_GENERIC_RE = re.compile(r"\b(?:dog|terrier|hound)\b$")


def _match_norm_to_akc(
    norm: str,
    akc_display_by_norm: Dict[str, str],
    akc_keys: List[str],
    alias_by_norm: Dict[str, str],
) -> Optional[str]:
    """Resolve one normalized model label to an AKC display name, or None."""
    # 1) direct
    if norm in akc_display_by_norm:
        return akc_display_by_norm[norm]

    # 2) alias (only honoured when the alias target exists in the AKC table)
    alias = alias_by_norm.get(norm)
    if alias:
        alias_norm = _canonical_norm(alias)
        if alias_norm in akc_display_by_norm:
            return akc_display_by_norm[alias_norm]

    # 3) safe variant flip ("toy poodle" -> "Poodle (Toy)")
    head, _, tail = norm.partition(" ")
    if tail and head in SIZE_VARIANTS:
        f_norm = _canonical_norm(f"{tail.title()} ({head.title()})")
        if f_norm in akc_display_by_norm:
            return akc_display_by_norm[f_norm]

    # 4) strip one trailing generic token and try again; an empty remainder
    #    (label was just "dog") is not a usable key
    stripped = _TRAILING_GENERIC_RE.sub("", norm).strip()
    if stripped and stripped in akc_display_by_norm:
        return akc_display_by_norm[stripped]

    # 5) fuzzy (final resort; tight cutoff)
    cand = get_close_matches(norm, akc_keys, n=1, cutoff=0.87)
    if cand:
        return akc_display_by_norm[cand[0]]
    return None


def _precompute_model_to_akc_map(
    model_labels: List[str],
    akc_display_by_norm: Dict[str, str]
) -> Tuple[Dict[str, str], List[str]]:
    """
    Build a map: raw model label -> AKC display name.

    Several model labels may resolve to the same AKC name. Each label is
    normalized with ``_canonical_norm`` and resolved by the first strategy that
    hits: direct key match, ``ALIAS_DIRECT``, size/variety flip, dropping a
    trailing "dog"/"terrier"/"hound", and finally a fuzzy match with a 0.87
    cutoff.

    Returns (mapping, unmapped_list); ``unmapped_list`` holds the raw labels for
    which no strategy produced a match, in input order.
    """
    model2akc: Dict[str, str] = {}
    unmapped: List[str] = []

    # Loop invariants, computed once instead of once per label.
    akc_keys = list(akc_display_by_norm)
    # Lookups use normalized labels, so alias keys are normalized the same way;
    # a key typed with stray capitals or punctuation would otherwise never match.
    alias_by_norm = {_canonical_norm(k): v for k, v in ALIAS_DIRECT.items()}

    for raw in model_labels:
        display = _match_norm_to_akc(
            _canonical_norm(raw), akc_display_by_norm, akc_keys, alias_by_norm
        )
        if display is None:
            unmapped.append(raw)
        else:
            model2akc[raw] = display
    return model2akc, unmapped


MODEL2AKC_MAP, _UNMAPPED = _precompute_model_to_akc_map(MODEL_LABELS, akc_display_by_norm)
if _UNMAPPED:
    print(f"[DogBreedID] Unmapped model labels ({len(_UNMAPPED)}): {sorted(set(_UNMAPPED))}")
| #------------------- 3) Load inference pipeline---------------------------- | |
# One shared image-classification pipeline, constructed at import time and
# called by predict() for every request.
clf = pipeline(
    task="image-classification",
    model=MODEL_ID
)
| # ------------------- 4) UI / inference helpers --------------------------- | |
| def _row_markdown(row: pd.Series) -> str: | |
| # Render AKC row as markdown | |
| parts = [] | |
| for col in row.index: | |
| if col == "breed": | |
| continue | |
| val = row[col] | |
| if pd.isna(val): | |
| continue | |
| text = str(val).strip() | |
| if not text: | |
| continue | |
| parts.append(f"**{col.replace('_', ' ').title()}:** {text}") | |
| return "\n\n".join(parts) if parts else "_No extra AKC info available._" | |
def _lookup_row_by_display_name(akc_display: str) -> Optional[pd.Series]:
    """
    Return the AKC table row for a display name, or None when there is none.

    The name is normalized with ``_canonical_norm`` and resolved through
    ``akc_name_to_idx`` to a positional index into ``akc_df``. Any failure while
    fetching the row is treated the same as "not found".
    """
    position = akc_name_to_idx.get(_canonical_norm(akc_display))
    if position is not None:
        try:
            return akc_df.iloc[position]
        except Exception:
            # best effort: fall through to the "not found" result
            pass
    return None
def _md_cell(text: object) -> str:
    """Make *text* safe to place inside a single-line Markdown table cell."""
    return str(text).replace("|", "\\|").replace("\n", " ")


def predict(image: Optional[Image.Image]) -> str:
    """
    Classify an uploaded image and return a Markdown report.

    The report has two parts: details for the top prediction (model label,
    confidence, matched AKC name and the AKC row rendered by ``_row_markdown``
    when one is found) followed by a table of all returned predictions.

    Args:
        image: the uploaded picture, or None when the button is pressed
            without an image.

    Returns:
        Markdown text. Short plain messages are returned instead of a report
        when no image was supplied, when inference raises, or when the
        classifier returns nothing.
    """
    # Without this guard an empty upload surfaces as a misleading
    # "Inference error" message.
    if image is None:
        return "Please upload an image first."

    try:
        preds = clf(image, top_k=TOP_K)
    except Exception:
        # UI boundary: log the traceback and show a friendly message instead
        traceback.print_exc()
        return "Inference error. Check model/requirements."

    # Build table of predictions, mapped names, and AKC info for top-1
    if not preds:
        return "No predictions."
    lines = ["# Predictions"]

    # Top-1 detailed info
    top = preds[0]
    raw_label = top.get("label", "Unknown")
    score = float(top.get("score", 0.0))
    akc_display = MODEL2AKC_MAP.get(raw_label)
    header = f"**Model:** {raw_label} | **Confidence:** {score:.2%}"
    if akc_display:
        header += f"\n\n**AKC Match:** {akc_display}"
        row = _lookup_row_by_display_name(akc_display)
        if row is not None:
            lines.append(header)
            lines.append("\n" + _row_markdown(row))
        else:
            lines.append(header + "\n\n_AKC row not found._")
    else:
        lines.append(header + "\n\n_No AKC match found (check alias rules)._")

    # Top-K summary table
    lines.append("\n---\n")
    lines.append("### Top Matches")
    lines.append("| Rank | Model Label | Confidence | AKC Match |")
    lines.append("|---:|---|---:|---|")
    for i, p in enumerate(preds, start=1):
        lbl = p.get("label", "Unknown")
        sc = float(p.get("score", 0.0))
        akc_match = MODEL2AKC_MAP.get(lbl, "—")
        # a "|" or newline inside a cell would break the table layout
        lines.append(f"| {i} | {_md_cell(lbl)} | {sc:.2%} | {_md_cell(akc_match)} |")
    return "\n".join(lines)
| # ----------- | |
| # 5) Gradio UI | |
| # ----------- | |
# Stylesheet passed to gr.Blocks below.
# NOTE(review): the rule targets "#app", but nothing in this file sets
# elem_id="app" -- confirm the width cap is actually being applied.
CSS = """
#app {max-width: 980px; margin: auto;}
"""
# Page layout: title and description on top, then one row with two equal
# columns -- image input plus button on the left, Markdown report on the right.
with gr.Blocks(css=CSS, fill_height=True) as demo:
    gr.Markdown("# Dog Breed ID + AKC Info")
    gr.Markdown(
        f"Upload an image of a dog. The app predicts the breed using '{MODEL_ID}' "
        "and shows breed details from the American Kennel Club dataset. Dataset: https://github.com/tmfilho/akcdata/blob/master/data/akc-data-latest.csv"
    )
    with gr.Row():
        with gr.Column(scale=1):
            # type="pil" asks Gradio to hand predict() a PIL image
            inp = gr.Image(label="Dog image", type="pil")
            btn = gr.Button("Predict", variant="primary")
        with gr.Column(scale=1):
            out = gr.Markdown()
    # The button click runs predict() on the image and shows its Markdown result.
    btn.click(fn=predict, inputs=inp, outputs=out)
# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()