# Source: Hugging Face Space "achase25/dogBreedIDTest" (Space status: Sleeping)
# File size: 10,553 bytes

# app.py
# Hugging Face Space: Dog breed classifier with AKC data join
#  a. Loads a vision classifier (image -> breed label)
#  b. Precomputes a robust mapping from model labels (dogmodelbreedlist.json)
#     to AKC display names (akc-data-latest.csv), including variant-flip
#     ("Standard Poodle" vs. "Poodle (Standard)") and a small alias table.
#  c. Uses the mapping at inference time so results are fast and consistent.

import os
import re
import json
import traceback
from typing import List, Dict, Tuple, Optional
import gradio as gr
import pandas as pd
from PIL import Image
from difflib import get_close_matches
from unicodedata import normalize as _ud_norm
from transformers import pipeline


# -----------------------Configuration--------------

# Each setting may be overridden by an environment variable of the same name;
# the second argument is the fallback used when the variable is unset.

# Hub id of the image-classification model
MODEL_ID = os.environ.get("MODEL_ID", "valentinocc/dog-breed-classifier")
# JSON file listing the labels the model can emit
DOG_LABELS_PATH = os.environ.get("DOG_LABELS_PATH", "dogmodelbreedlist.json")
# CSV file with the AKC breed table
AKC_CSV_PATH = os.environ.get("AKC_CSV_PATH", "akc-data-latest.csv")
# Number of predictions requested from the classifier per image
TOP_K = int(os.environ.get("TOP_K", "5"))


# ----------------1) AKC CSV load + breed indexing----------------

def _choose_akc_breed_col(df: pd.DataFrame) -> str:
    """
    Pick the column of `df` that holds the AKC breed name.

    Tiers are tried in order; within a tier the left-most column wins:
      1. column name contains "breed" (case-insensitive)
      2. column name is "title" or contains "name" (case-insensitive)
      3. first column with an object or string dtype
      4. first column, regardless of dtype

    Returns the column label exactly as it appears in `df.columns`.
    Raises ValueError if `df` has no columns at all.
    """
    cols = list(df.columns)
    if not cols:
        raise ValueError("AKC table has no columns; cannot choose a breed column")

    # str() so integer / non-string column labels cannot break .lower()
    lowered = [(c, str(c).lower()) for c in cols]

    # strong preferred
    for c, cl in lowered:
        if "breed" in cl:
            return c
    # fallback: a name-ish column
    for c, cl in lowered:
        if cl == "title" or "name" in cl:
            return c
    # last resort: first likely text column (object dtype, or a pandas string dtype)
    for c in cols:
        col = df[c]
        if pd.api.types.is_object_dtype(col) or pd.api.types.is_string_dtype(col):
            return c
    # absolute fallback
    return cols[0]

def _canonical_norm(s: str) -> str:
    """
    Strong key normalizer: strip accents, lowercase, collapse punctuation/spaces.

    Steps, in order:
      1. NFKD-decompose so accents become separate combining marks.
      2. Delete apostrophes/backticks (straight and typographic).
      3. Turn hyphens, Unicode dashes, underscores and slashes into spaces.
      4. Drop everything that is not ASCII (this removes the combining marks).
      5. Lowercase, replace anything outside [a-z0-9& ] with a space,
         then collapse runs of whitespace.

    Steps 2-3 must run before step 4: the ASCII fold would otherwise delete
    typographic dashes outright and glue the neighbouring words together.

    Any input is passed through str() first, so non-string values are accepted.
    """
    s = _ud_norm("NFKD", str(s))
    s = re.sub(r"[\u2018\u2019\u02bc'`]", "", s)
    s = re.sub(r"[-\u2010-\u2015\u2212_/]", " ", s)
    s = s.encode("ascii", "ignore").decode("ascii")
    s = s.lower()
    # Parentheses and all remaining punctuation become word separators.
    s = re.sub(r"[^a-z0-9& ]+", " ", s)
    return re.sub(r"\s+", " ", s).strip()

def _load_akc_table(path: str) -> Tuple[pd.DataFrame, Dict[str, int], Dict[str, str]]:
    """
    Load the AKC CSV at `path` and build the lookup tables used for matching.

    Returns a 3-tuple:
      - the DataFrame, with the detected breed column renamed to "breed"
      - normalized key -> positional row index (suitable for `.iloc`)
      - normalized key -> AKC display name

    Every breed is indexed under its own normalized name. Names of the form
    "Base (Variant)" are additionally indexed under "variant base", so that
    "Standard Poodle" finds "Poodle (Standard)".

    Rows whose breed name is missing, or normalizes to an empty key, are kept
    in the DataFrame but left out of both lookup tables.
    """
    raw_df = pd.read_csv(path)
    breed_col = _choose_akc_breed_col(raw_df)
    df = raw_df.rename(columns={breed_col: "breed"})

    akc_display_by_norm: Dict[str, str] = {}
    akc_name_to_idx: Dict[str, int] = {}

    # enumerate() over every row keeps `i` aligned with the positional index
    # even when some rows are skipped below.
    for i, value in enumerate(df["breed"].tolist()):
        if pd.isna(value):
            # str(NaN) would otherwise register a bogus breed called "nan"
            continue
        name = str(value)
        n = _canonical_norm(name)
        if not n:
            continue

        # A direct name always wins, even over a flipped key registered earlier.
        akc_display_by_norm[n] = name
        akc_name_to_idx[n] = i

        # flip "Poodle (Standard)" -> "standard poodle"
        m = re.match(r"^(.*)\s\(([^)]+)\)$", name.strip())
        if m:
            base, var = m.group(1), m.group(2)
            flip = _canonical_norm(f"{var} {base}")
            if flip:
                # setdefault: never displace a breed that owns this key directly
                akc_display_by_norm.setdefault(flip, name)
                akc_name_to_idx.setdefault(flip, i)

    return df, akc_name_to_idx, akc_display_by_norm

# Loaded once at import time. `akc_display_by_norm` feeds the label-mapping
# precompute step; `akc_df` and `akc_name_to_idx` serve per-request row lookups.
akc_df, akc_name_to_idx, akc_display_by_norm = _load_akc_table(AKC_CSV_PATH)


# -------------2) Model label list + precomputed mapping to increase speed---------------

def _read_model_labels(path: str) -> List[str]:
    """
    Read the model's label list from the JSON file at `path`.

    Accepted layouts, checked in this order:
      - an object with "id2label": the mapping's values, in file order
      - an object with "labels": that list
      - a bare top-level list

    Raises ValueError for any other layout. Errors from opening or parsing
    the file propagate unchanged.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        payload = json.load(f)

    if isinstance(payload, dict):
        if "id2label" in payload:
            id2label = payload["id2label"]
            if not isinstance(id2label, dict):
                raise ValueError(f"{path}: 'id2label' must be an object mapping ids to labels")
            return list(id2label.values())
        if "labels" in payload:
            return payload["labels"]
    elif isinstance(payload, list):
        return payload

    raise ValueError(f"{path} must be a JSON list or an object with id2label/labels")

# Raw label strings read from DOG_LABELS_PATH at import time; these are the
# inputs to the model-label -> AKC-name mapping built further down.
MODEL_LABELS: List[str] = _read_model_labels(DOG_LABELS_PATH)

# Leading words that AKC naming moves into a parenthesised suffix
# ("toy poodle" -> "Poodle (Toy)"). Only labels starting with one of these
# tokens are eligible for the variant flip.
SIZE_VARIANTS = (
    # body size
    {"toy", "miniature", "standard", "giant", "medium", "small", "large"}
    # coat type
    | {"smooth", "wire", "longhaired", "shorthaired", "wirehaired"}
)

# Focused alias list for known troublemakers.
# Keys are looked up with the *normalized* model label, so every key must be
# written exactly as the normalizer emits it: lowercase, single spaces, no
# hyphens or underscores. Values are AKC display names.
# Keep keys unique - a repeated key silently discards the earlier value.
ALIAS_DIRECT: Dict[str, str] = {
    "eskimo dog": "American Eskimo Dog",
    "wire haired fox terrier": "Wire Fox Terrier",
    "smooth fox terrier": "Fox Terrier (Smooth)",
    "black and tan coonhound": "Black and Tan Coonhound",
    "german short haired pointer": "German Shorthaired Pointer",
    "german long haired pointer": "German Longhaired Pointer",
    "curly coated retriever": "Curly-Coated Retriever",
    "flat coated retriever": "Flat-Coated Retriever",
    "yorkshire terrier": "Yorkshire Terrier",
    "welsh springer spaniel": "Welsh Springer Spaniel",
    "english springer": "English Springer Spaniel",
    "standard poodle": "Poodle (Standard)",
    "miniature poodle": "Poodle (Miniature)",
    "toy poodle": "Poodle (Toy)",
    "bluetick": "Bluetick Coonhound",
    "walker hound": "Treeing Walker Coonhound",
    "clumber": "Clumber Spaniel",
}

def _precompute_model_to_akc_map(
    model_labels: List[str],
    akc_display_by_norm: Dict[str, str]
) -> Tuple[Dict[str, str], List[str]]:
    """
    Build a map: raw model label -> AKC display name.

    The map is many-to-one: several model labels may resolve to the same AKC
    breed. Each label is normalized and resolved by the first rule that hits:
      1. direct   - the normalized label is itself an AKC key
      2. alias    - ALIAS_DIRECT names an AKC breed for it
      3. flip     - "toy poodle" -> "poodle toy" when the first word is in
                    SIZE_VARIANTS
      4. strip    - drop a trailing "dog", "terrier" or "hound" (tried in
                    that fixed order, so the result is reproducible)
      5. fuzzy    - closest AKC key with similarity >= 0.87

    Returns (mapping, unmapped_list); `unmapped_list` holds the raw labels no
    rule could resolve, in input order (duplicates are not removed).
    """
    model2akc: Dict[str, str] = {}
    unmapped: List[str] = []

    # Loop invariants, built once.
    akc_keys = list(akc_display_by_norm)
    # Normalize alias keys so a stray capital or hyphen in the table cannot
    # make an alias unreachable.
    alias_by_norm = {_canonical_norm(k): v for k, v in ALIAS_DIRECT.items()}
    generic_suffixes = ("dog", "terrier", "hound")

    for raw in model_labels:
        norm = _canonical_norm(raw)

        # Exact-match candidates, in priority order.
        candidates = [norm]  # 1) direct

        alias = alias_by_norm.get(norm)  # 2) alias
        if alias:
            candidates.append(_canonical_norm(alias))

        # 3) safe variant flip; `norm` is already normalized, so the flipped
        #    key needs no further cleaning
        head, _, tail = norm.partition(" ")
        if tail and head in SIZE_VARIANTS:
            candidates.append(f"{tail} {head}")

        # 4) strip one trailing generic token
        for suffix in generic_suffixes:
            stripped = re.sub(rf"\b{suffix}\b$", "", norm).strip()
            # skip no-ops and the empty key left when the label *is* the suffix
            if stripped and stripped != norm:
                candidates.append(stripped)

        hit = next(
            (akc_display_by_norm[k] for k in candidates if k in akc_display_by_norm),
            None,
        )
        if hit is not None:
            model2akc[raw] = hit
            continue

        # 5) fuzzy (final resort; tight cutoff)
        close = get_close_matches(norm, akc_keys, n=1, cutoff=0.87)
        if close:
            model2akc[raw] = akc_display_by_norm[close[0]]
        else:
            unmapped.append(raw)

    return model2akc, unmapped

# Resolve every model label once at import time so inference only needs a
# dict lookup. Labels that could not be resolved are reported on stdout.
MODEL2AKC_MAP, _UNMAPPED = _precompute_model_to_akc_map(MODEL_LABELS, akc_display_by_norm)
if _UNMAPPED:
    # de-duplicated and sorted so the report is stable between runs
    print(f"[DogBreedID] Unmapped model labels ({len(_UNMAPPED)}): {sorted(set(_UNMAPPED))}")


#------------------- 3) Load inference pipeline----------------------------

# Image-classification pipeline for MODEL_ID, created once at import time and
# reused by every call to predict().
clf = pipeline(
    task="image-classification",
    model=MODEL_ID
)


# ------------------- 4) UI / inference helpers ---------------------------
def _row_markdown(row: pd.Series) -> str:
    """
    Render one AKC table row as markdown, one bold-labelled field per paragraph.

    The "breed" field is left out, as are missing values and values that are
    blank once stripped. Field labels are the column names with underscores
    turned into spaces and title-cased. When nothing is left to show, a
    placeholder sentence is returned instead.
    """
    rendered = []
    for column in row.index:
        if column == "breed":
            continue
        value = row[column]
        # treat a missing value like a blank one so a single check filters both
        text = "" if pd.isna(value) else str(value).strip()
        if text:
            label = column.replace('_', ' ').title()
            rendered.append(f"**{label}:** {text}")

    if not rendered:
        return "_No extra AKC info available._"
    return "\n\n".join(rendered)

def _lookup_row_by_display_name(akc_display: str) -> Optional[pd.Series]:
    """
    Return the AKC table row for `akc_display`, or None when there is none.

    The display name is normalized and looked up in `akc_name_to_idx`; the
    resulting position is then read from `akc_df`. Any failure while reading
    the row is treated the same as "not found".
    """
    position = akc_name_to_idx.get(_canonical_norm(akc_display))
    if position is None:
        return None

    try:
        found = akc_df.iloc[position]
    except Exception:
        found = None
    return found

def predict(image: Optional["Image.Image"]) -> str:
    """
    Classify a dog image and return the result as a markdown report.

    The report contains:
      - the top prediction (model label, confidence, matched AKC name) followed
        by that breed's AKC details when a row is found
      - a table of the top-K predictions with their AKC matches

    `image` is the PIL image supplied by the UI, or None when no image has
    been provided. A short plain-text message is returned instead of a report
    when there is no image, when inference raises, or when the model returns
    no predictions; this function does not raise for those cases.
    """
    # The image input yields None when Predict is clicked before an upload.
    # Catch that here so it is not reported as an inference failure.
    if image is None:
        return "Please upload an image first."

    try:
        preds = clf(image, top_k=TOP_K)
    except Exception:
        # UI boundary: log the full traceback server-side, show a short hint.
        traceback.print_exc()
        return "Inference error. Check model/requirements."

    if not preds:
        return "No predictions."

    lines = ["# Predictions"]

    # Top-1 detailed info
    top = preds[0]
    raw_label = top.get("label", "Unknown")
    score = float(top.get("score", 0.0))

    akc_display = MODEL2AKC_MAP.get(raw_label)
    header = f"**Model:** {raw_label}  |  **Confidence:** {score:.2%}"
    if akc_display:
        header += f"\n\n**AKC Match:** {akc_display}"
        row = _lookup_row_by_display_name(akc_display)
        if row is not None:
            lines.append(header)
            # leading newline leaves a blank line between header and details
            lines.append("\n" + _row_markdown(row))
        else:
            lines.append(header + "\n\n_AKC row not found._")
    else:
        lines.append(header + "\n\n_No AKC match found (check alias rules)._")

    # Top-K summary table
    lines.append("\n---\n")
    lines.append("### Top Matches")
    lines.append("| Rank | Model Label | Confidence | AKC Match |")
    lines.append("|---:|---|---:|---|")
    for i, p in enumerate(preds, start=1):
        lbl = p.get("label", "Unknown")
        sc = float(p.get("score", 0.0))
        akc_match = MODEL2AKC_MAP.get(lbl, "—")
        lines.append(f"| {i} | {lbl} | {sc:.2%} | {akc_match} |")

    return "\n".join(lines)

# -----------
# 5) Gradio UI
# -----------
# Custom stylesheet handed to gr.Blocks below.
# NOTE(review): nothing in the layout below sets elem_id="app", so the #app
# rule may not match any element - confirm it is applied as intended.
CSS = """
#app {max-width: 980px; margin: auto;}
"""

# Page layout: title and description on top, then two equal-width columns -
# image input plus Predict button on the left, markdown report on the right.
with gr.Blocks(css=CSS, fill_height=True) as demo:
    gr.Markdown("# Dog Breed ID + AKC Info")
    gr.Markdown(
        f"Upload an image of a dog. The app predicts the breed using '{MODEL_ID}' "
        "and shows breed details from the American Kennel Club dataset. Dataset: https://github.com/tmfilho/akcdata/blob/master/data/akc-data-latest.csv"
    )
    with gr.Row():
        with gr.Column(scale=1):
            # type="pil" requests the upload as a PIL image, matching predict()'s parameter
            inp = gr.Image(label="Dog image", type="pil")
            btn = gr.Button("Predict", variant="primary")
        with gr.Column(scale=1):
            out = gr.Markdown()

    # Clicking Predict sends the image to predict() and shows its markdown in `out`.
    btn.click(fn=predict, inputs=inp, outputs=out)

# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()