Spaces:

achase25
/

DogBreedID

Sleeping

File size: 9,215 Bytes

import difflib
import re
from typing import Dict, Optional, Tuple

import gradio as gr
import torch
import pandas as pd
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForImageClassification

# Hugging Face Hub id of the image-classification checkpoint loaded below.
MODEL_ID = "valentinocc/dog-breed-classifier"
# CSV with one row of AKC facts per breed; read once at import time.
AKC_CSV_PATH = "akc-data-latest.csv"  
# NOTE(review): not referenced anywhere in the visible code -- confirm whether
# this label list is still needed.
DOG_LABELS_PATH = "dogmodelbreedlist.json"


# -----------Load model + processor-----------------------
# Both objects are created at import time and shared by every request.
processor = AutoImageProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageClassification.from_pretrained(MODEL_ID)
# Inference only: put the model in evaluation mode.
model.eval()
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# ---------------Data Cleaning Helpers--------------------------------
def _normalize_name(s: str) -> str:
    """Return a canonical lookup key for a breed name.

    The text is lowercased, every run of characters outside ``[a-z0-9]``
    (punctuation, underscores, whitespace, ...) becomes a single space, and
    leading/trailing spaces are removed.

    Trimming has to happen *after* punctuation is replaced: trimming first
    left a stray space on names that start or end with punctuation
    (``"Poodle (Standard)"`` became ``"poodle standard "``), so the same breed
    written without the parentheses produced a different key.

    Args:
        s: Raw breed name or model label.

    Returns:
        The normalized key; an empty string if nothing alphanumeric remains.
    """
    # split()/join collapses whitespace runs and trims both ends in one step.
    return " ".join(re.sub(r"[^a-z0-9]+", " ", s.lower()).split())

def _load_akc_table(path: str) -> Tuple[pd.DataFrame, Dict[str, int]]:
    """Load the AKC CSV and index its rows by normalized breed name.

    The breed-name column is located in this order:

    1. an existing ``breed`` column (kept as is, so the rename below can never
       create a second column with the same name),
    2. ``Unnamed: 0`` -- the name pandas gives a header-less first column,
    3. the first column holding text.

    Args:
        path: Location of the CSV file.

    Returns:
        ``(df, index_map)`` where ``df`` has a string ``breed`` column and
        ``index_map`` maps each normalized breed name to the row's position
        in ``df`` (suitable for ``df.iloc``). When two rows normalize to the
        same key, the later row wins.

    Raises:
        ValueError: If no column can serve as the breed name.
    """
    df = pd.read_csv(path)

    if "breed" in df.columns:
        name_col = "breed"
    elif "Unnamed: 0" in df.columns:
        name_col = "Unnamed: 0"
    else:
        # Checking the dtype (not the values) accepts both classic object
        # columns and pandas' dedicated string dtype.
        name_col = next(
            (c for c in df.columns if pd.api.types.is_string_dtype(df[c].dtype)),
            None,
        )
        if name_col is None:
            raise ValueError(f"No breed-name column found in {path!r}")

    # Make a clean 'breed' column for display and mapping
    if name_col != "breed":
        df = df.rename(columns={name_col: "breed"})
    df["breed"] = df["breed"].astype(str)

    # Positions (not index labels) are stored because callers use .iloc.
    index_map: Dict[str, int] = {
        _normalize_name(name): pos for pos, name in enumerate(df["breed"])
    }

    return df, index_map

# Loaded once at import time: the AKC table plus its normalized-name -> row-position index.
akc_df, akc_name_to_idx = _load_akc_table(AKC_CSV_PATH)


# ------------------Alias rules----------------------------------------

# 1) Direct alias corrections: normalized model label -> AKC display name.
#    Consulted by _try_alias_then_flip when a predicted label does not match
#    an AKC breed name directly. The value is normalized again before it is
#    looked up in the AKC index.
#    Entries whose key already equals the normalized value (e.g.
#    "yorkshire terrier") cannot change the outcome, because
#    _lookup_breed_info tries the direct normalized match first.
ALIAS_DIRECT: Dict[str, str] = {
    # Poodles (AKC rows are usually written with parentheses)
    "standard poodle": "Poodle (Standard)",
    "miniature poodle": "Poodle (Miniature)",
    "toy poodle": "Poodle (Toy)",

    # Dachshund sizes often appear both ways in the wild
    "miniature dachshund": "Dachshund (Miniature)",
    "standard dachshund": "Dachshund (Standard)",  # present in some AKC tables

    # Bull Terrier miniature vs. base
    "miniature bull terrier": "Bull Terrier (Miniature)",

    # American Eskimo Dog varieties (with and without the trailing "dog")
    "toy american eskimo": "American Eskimo Dog (Toy)",
    "miniature american eskimo": "American Eskimo Dog (Miniature)",
    "standard american eskimo": "American Eskimo Dog (Standard)",
    "toy american eskimo dog": "American Eskimo Dog (Toy)",
    "miniature american eskimo dog": "American Eskimo Dog (Miniature)",
    "standard american eskimo dog": "American Eskimo Dog (Standard)",

    # Others: spelling, hyphenation and word-order differences
    "eskimo dog": "American Eskimo Dog",
    "wire haired fox terrier": "Fox Terrier (Wire)",
    "smooth fox terrier": "Fox Terrier (Smooth)",
    "black and tan coonhound": "Black and Tan Coonhound",
    "german short haired pointer": "German Shorthaired Pointer",
    "german long haired pointer": "German Longhaired Pointer",
    "curly coated retriever": "Curly-Coated Retriever",
    "flat coated retriever": "Flat-Coated Retriever",
    "yorkshire terrier": "Yorkshire Terrier",
    "welsh springer spaniel": "Welsh Springer Spaniel",
    "english springer": "English Springer Spaniel",
}

# 2) Generic flip:  "<Variant> <Base>"  ->  "<Base> (<Variant>)"
#    The flip is only attempted when the label's first word is one of these
#    variants, and only accepted if the flipped name exists in the AKC index.
SIZE_VARIANTS = {"standard", "miniature", "toy", "giant"}

def _try_alias_then_flip(norm_label: str) -> Optional[pd.Series]:
    """Resolve a normalized label through the alias table, then a variant flip.

    Args:
        norm_label: Model label already passed through ``_normalize_name``.

    Returns:
        The matching AKC row, or ``None`` when neither strategy finds one.
    """

    def _row_for(display_name: str) -> Optional[pd.Series]:
        # Normalize the display name and fetch its AKC row, if indexed.
        pos = akc_name_to_idx.get(_normalize_name(display_name))
        return None if pos is None else akc_df.iloc[pos]

    # a) direct alias table; fall through to the flip when the alias target
    #    is not present in the loaded AKC table
    if norm_label in ALIAS_DIRECT:
        aliased = _row_for(ALIAS_DIRECT[norm_label])
        if aliased is not None:
            return aliased

    # b) generic flip: "<variant> <rest>" -> "<rest> (<variant>)", accepted
    #    only if the flipped name exists in the AKC index
    variant, sep, base = norm_label.partition(" ")
    if sep and variant in SIZE_VARIANTS:
        return _row_for(f"{base.title()} ({variant.title()})")

    return None

# ------------------Lookup in AKC table------------------------------------------

def _lookup_breed_info(pred_label: str) -> Optional[pd.Series]:
    """Return the AKC row that best matches a model label, or ``None``.

    Strategies are tried in order and the first hit wins:

    1. exact match on the normalized name,
    2. alias table / size-variant flip ('Standard Poodle' -> 'Poodle (Standard)'),
    3. the name with a trailing 'dog', 'terrier' or 'hound' removed,
    4. closest fuzzy match via ``difflib`` (similarity cutoff 0.75).
    """
    key = _normalize_name(pred_label)

    # 1) direct match
    pos = akc_name_to_idx.get(key)
    if pos is not None:
        return akc_df.iloc[pos]

    # 2) alias + safe flip
    aliased = _try_alias_then_flip(key)
    if aliased is not None:
        return aliased

    # 3) simple stripped variants: drop one generic trailing word
    for suffix_pattern in (r"\bdog\b$", r"\bterrier\b$", r"\bhound\b$"):
        shortened = re.sub(suffix_pattern, "", key).strip()
        if shortened in akc_name_to_idx:
            return akc_df.iloc[akc_name_to_idx[shortened]]

    # 4) fuzzy match: take the single closest indexed name, if close enough
    closest = difflib.get_close_matches(key, akc_name_to_idx.keys(), n=1, cutoff=0.75)
    if not closest:
        return None
    return akc_df.iloc[akc_name_to_idx[closest[0]]]

def _format_breed_info(row: pd.Series) -> str:
    """Turn a single AKC row into a readable markdown snippet.

    Fields that are absent from the row or hold a missing value are omitted.
    Previously the accessor returned the "—" placeholder for missing data, so
    every ``pd.notna`` guard was always true and empty sections were rendered
    as dashes. For a min/max pair, the line is shown when at least one bound
    is known and the unknown bound is shown as "—".

    Args:
        row: One row of the AKC table, with the display name in ``breed``.

    Returns:
        Markdown text: a heading, the optional description, a facts block and
        an optional traits list.
    """

    def get(col: str) -> Optional[object]:
        # Value of `col`, or None when the column is absent or the value is NA.
        if col not in row:
            return None
        value = row[col]
        return value if pd.notna(value) else None

    def span(lo_col: str, hi_col: str, label: str, unit: str) -> Optional[str]:
        # Render a "min–max unit" fact, or None when both bounds are unknown.
        lo, hi = get(lo_col), get(hi_col)
        if lo is None and hi is None:
            return None
        lo_txt = "—" if lo is None else lo
        hi_txt = "—" if hi is None else hi
        return f"**{label}:** {lo_txt}–{hi_txt} {unit}"

    breed = get("breed")
    lines = [f"### {'Unknown Breed' if breed is None else breed}"]

    description = get("description")
    if description is not None:
        lines.append(f"{description}\n")

    # Facts block
    facts = []
    group = get("group")
    if group is not None:
        facts.append(f"**Group:** {group}")
    temperament = get("temperament")
    if temperament is not None:
        facts.append(f"**Temperament:** {temperament}")
    # NOTE(review): the unit labels assume the CSV stores inches and pounds;
    # the upstream dataset may use centimetres and kilograms -- confirm.
    for fact in (
        span("min_height", "max_height", "Height", "in"),
        span("min_weight", "max_weight", "Weight", "lb"),
        span("min_expectancy", "max_expectancy", "Life Expectancy", "yrs"),
    ):
        if fact is not None:
            facts.append(fact)

    if facts:
        # Two trailing spaces force a Markdown hard line break; a bare newline
        # is a soft break and would collapse all facts into one line.
        lines.append("  \n".join(facts))

    # Optional traits if present in our AKC Dataset
    trait_fields = [
        ("grooming_frequency_category", "Grooming"),
        ("shedding_category", "Shedding"),
        ("energy_level_category", "Energy Level"),
        ("trainability_category", "Trainability"),
        ("demeanor_category", "Demeanor"),
    ]
    traits = [
        f"- **{label}:** {value}"
        for col, label in trait_fields
        if (value := get(col)) is not None
    ]
    if traits:
        lines.append("\n**Traits**")
        lines.extend(traits)

    return "\n\n".join(lines).strip()

# ---------------------Inference function----------------
def predict_with_info(img: "Optional[Image.Image]") -> str:
    """Classify a dog photo and return markdown with AKC breed details.

    Args:
        img: The uploaded picture as a PIL image. Gradio passes ``None`` when
            the form is submitted without an image.

    Returns:
        Markdown containing the top-1 predicted breed with its confidence,
        followed by the AKC info block for that breed when a match is found,
        or a short prompt when no image was supplied.
    """
    # Submitting an empty form must not crash the request.
    if img is None:
        return "_Please upload a dog photo first._"

    # Uploads may be grayscale, palette or RGBA; hand the processor plain RGB.
    if img.mode != "RGB":
        img = img.convert("RGB")

    inputs = processor(images=img, return_tensors="pt").to(device)
    with torch.inference_mode():
        logits = model(**inputs).logits
        probs = torch.softmax(logits, dim=-1)
        top_id = int(torch.argmax(probs, dim=-1).item())
        top_prob = float(probs[0, top_id].item())

    label = model.config.id2label.get(top_id, "Unknown")
    header = f"**Prediction:** {label} ({top_prob:.2%})"

    row = _lookup_breed_info(label)
    if row is None:
        return header + "\n\n_No matching breed found in AKC dataset._"

    info_md = _format_breed_info(row)
    return header + "\n\n" + info_md


# -------------------- UI -------------------------------------

# Single-function Gradio app: one image in, one markdown block out.
demo = gr.Interface(
    fn=predict_with_info,
    # type="pil" makes Gradio hand the upload to the function as a PIL image.
    inputs=gr.Image(type="pil", label="Upload a dog photo"),
    outputs=gr.Markdown(label="Prediction + Breed Info"),
    title="Dog Breed Classifier + AKC Info",
    description=(
        f"Upload an image of a dog. The app predicts the breed using '{MODEL_ID}' "
        "and shows breed details from the American Kennel Club dataset. Dataset: https://github.com/tmfilho/akcdata/blob/master/data/akc-data-latest.csv"
    ),
    # NOTE(review): newer Gradio releases appear to rename this option to
    # `flagging_mode` -- confirm against the Gradio version this Space pins.
    allow_flagging="never",
)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()