File size: 10,553 Bytes
1b032e2
 
9edd4a6
 
 
1cf2791
9edd4a6
1b032e2
 
 
 
 
 
 
 
 
 
 
 
 
fad54ba
 
 
1b032e2
 
 
 
 
fad54ba
 
 
1b032e2
 
fad54ba
1b032e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fad54ba
 
 
1b032e2
 
 
 
 
 
 
 
 
 
 
 
 
fad54ba
1b032e2
 
 
 
 
fad54ba
1b032e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98e416f
 
fb7332c
 
1b032e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fad54ba
1b032e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fad54ba
 
 
1b032e2
 
 
 
 
fad54ba
 
1b032e2
fad54ba
1b032e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a658145
fad54ba
1b032e2
a658145
 
1b032e2
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
# app.py
# Hugging Face Space: Dog breed classifier with AKC data join
#  a. Loads a vision classifier (image -> breed label)
#  b. Precomputes a robust mapping from model labels (dogmodelbreedlist.json)
#     to AKC display names (akc-data-latest.csv), including variant-flip
#     ("Standard Poodle" vs. "Poodle (Standard)") and a small alias table.
#  c. Uses the mapping at inference time so results are fast and consistent.

import os
import re
import json
import traceback
from typing import List, Dict, Tuple, Optional
import gradio as gr
import pandas as pd
from PIL import Image
from difflib import get_close_matches
from unicodedata import normalize as _ud_norm
from transformers import pipeline


# -----------------------Configuration--------------

# Every setting can be overridden through an environment variable of the same
# name; the second argument to os.getenv is the default used otherwise.
MODEL_ID = os.getenv("MODEL_ID", "valentinocc/dog-breed-classifier")  # image-classification model passed to pipeline()
# JSON file with the classifier's label list (see _read_model_labels for accepted layouts).
DOG_LABELS_PATH = os.getenv("DOG_LABELS_PATH", "dogmodelbreedlist.json")
# CSV file with the AKC breed table.
AKC_CSV_PATH = os.getenv("AKC_CSV_PATH", "akc-data-latest.csv")
# Number of predictions requested from the classifier per image.
# int() raises ValueError at import time if the variable is not an integer.
TOP_K = int(os.getenv("TOP_K", "5"))


# ----------------1) AKC CSV load + breed indexing----------------

def _choose_akc_breed_col(df: pd.DataFrame) -> str:
    """
    Pick the column of ``df`` that holds the AKC breed name.

    Tiers are tried in order; within a tier the first column (in column
    order) wins:
      1. a column whose lowercased label contains "breed";
      2. a column whose lowercased label contains "name" or equals "title";
      3. the first object-dtype column;
      4. the first column.

    Args:
        df: the AKC table as loaded from CSV.

    Returns:
        The chosen column label, exactly as it appears in ``df.columns``.

    Raises:
        ValueError: if ``df`` has no columns at all.
    """
    cols = list(df.columns)
    if not cols:
        raise ValueError("AKC table has no columns; cannot locate the breed column")

    # str() keeps the name checks from crashing on non-string labels
    # (e.g. integer labels when a CSV is read without a header row).
    lowered = [(c, str(c).lower()) for c in cols]

    # strong preferred
    for c, cl in lowered:
        if "breed" in cl:
            return c
    # fallback: anything name-like
    for c, cl in lowered:
        if "name" in cl or cl == "title":
            return c
    # last resort: first likely string column
    for c in cols:
        if pd.api.types.is_object_dtype(df[c]):
            return c
    # absolute fallback
    return cols[0]

def _canonical_norm(s: str) -> str:
    """
    Strong key normalizer: strip accents, lowercase, collapse punctuation/spaces.

    Steps, in order:
      1. Unicode dashes (hyphen, en dash, em dash, minus sign, ...) -> space.
      2. NFKD-decompose and drop everything that is not ASCII (removes
         accents and typographic quotes).
      3. Lowercase; delete apostrophes and backticks so "bernard's" becomes
         "bernards" rather than "bernard s".
      4. Every remaining character outside ``a-z 0-9 &`` and space becomes a
         space; runs of whitespace collapse to one space; ends are trimmed.

    Args:
        s: any value; it is converted with ``str()`` first.

    Returns:
        The normalized key (possibly the empty string).
    """
    s = str(s)
    # Must happen BEFORE the ASCII fold: these dashes have no ASCII
    # decomposition, so the fold would delete them and glue the neighbouring
    # words together ("Curly\u2013Coated" -> "curlycoated").
    s = re.sub(r"[\u2010-\u2015\u2212]", " ", s)
    s = _ud_norm("NFKD", s).encode("ascii", "ignore").decode("ascii")
    s = s.lower()
    # Curly apostrophes are already gone after the fold; only ASCII ones remain.
    s = re.sub(r"['`]", "", s)
    # Hyphens, underscores, slashes, parentheses and any other punctuation
    # all act as word separators.
    s = re.sub(r"[^a-z0-9& ]+", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def _load_akc_table(path: str) -> Tuple[pd.DataFrame, Dict[str, int], Dict[str, str]]:
    """
    Read the AKC CSV at ``path`` and build the lookup tables used for matching.

    The breed-name column (chosen by _choose_akc_breed_col) is renamed to
    "breed". Each name is indexed under its _canonical_norm key; names shaped
    like "Base (Variant)" are additionally indexed under the key of
    "Variant Base", unless that key is already taken.

    Returns a 3-tuple:
      - the DataFrame with the renamed "breed" column
      - normalized key -> positional row index
      - normalized key -> display name as written in the CSV
    """
    table = pd.read_csv(path)
    table = table.rename(columns={_choose_akc_breed_col(table): "breed"})

    # Matches "Poodle (Standard)": group 1 is the base, group 2 the variant.
    variant_shape = re.compile(r"^(.*)\s\(([^)]+)\)$")

    row_by_key: Dict[str, int] = {}
    display_by_key: Dict[str, str] = {}

    for position, display in enumerate(table["breed"].astype(str).tolist()):
        key = _canonical_norm(display)
        # Direct keys always overwrite, so the last row with a given key wins.
        display_by_key[key] = display
        row_by_key[key] = position

        found = variant_shape.match(display.strip())
        if found is None:
            continue

        base, var = found.groups()
        flipped_key = _canonical_norm(f"{var} {base}")
        # Flipped keys never displace an entry that is already present.
        if flipped_key not in display_by_key:
            display_by_key[flipped_key] = display
        if flipped_key not in row_by_key:
            row_by_key[flipped_key] = position

    return table, row_by_key, display_by_key

# Loaded once at import time. The three module-level names are:
#   akc_df              - the AKC table with its breed column renamed to "breed"
#   akc_name_to_idx     - normalized breed key -> positional row index in akc_df
#   akc_display_by_norm - normalized breed key -> AKC display name
akc_df, akc_name_to_idx, akc_display_by_norm = _load_akc_table(AKC_CSV_PATH)


# -------------2) Model label list + precomputed mapping to increase speed---------------

def _read_model_labels(path: str) -> List[str]:
    """
    Load the classifier's label list from a JSON file.

    Accepted layouts:
      - ``{"id2label": {...}}`` -> the mapping's values, in file order
      - ``{"labels": [...]}``   -> that list
      - ``[...]``               -> the list itself

    Args:
        path: location of the JSON file.

    Returns:
        The labels as found in the file.

    Raises:
        ValueError: if the JSON matches none of the layouts above.
    """
    # Read as UTF-8 explicitly so accented breed names do not depend on the
    # platform's default text encoding.
    with open(path, "r", encoding="utf-8") as f:
        payload = json.load(f)

    if isinstance(payload, dict):
        if "id2label" in payload:
            return list(payload["id2label"].values())
        if "labels" in payload:
            return payload["labels"]
    elif isinstance(payload, list):
        return payload
    raise ValueError("dogmodelbreedlist.json must be a list or have id2label/labels")

# Raw classifier labels, read once at import time from DOG_LABELS_PATH.
MODEL_LABELS: List[str] = _read_model_labels(DOG_LABELS_PATH)

# Common size/variety tokens used in AKC naming. When a normalized model label
# starts with one of these, the mapper also tries the flipped "Base (Variant)"
# form, e.g. "toy poodle" -> "Poodle (Toy)". Entries are lowercase because they
# are compared against normalized labels.
SIZE_VARIANTS = {
    "toy", "miniature", "standard", "giant", "medium", "small", "large",
    "smooth", "wire", "longhaired", "shorthaired", "wirehaired"
}

# Focused alias list for known troublemakers: normalized model label -> AKC
# display name.
# Keys are looked up with the output of _canonical_norm(), so every key must
# already be in that form (lowercase, single spaces, no punctuation); a key
# with a capital letter can never match. Keep each key unique: in a dict
# literal a repeated key silently keeps only the last value.
ALIAS_DIRECT: Dict[str, str] = {
    "eskimo dog": "American Eskimo Dog",
    "wire haired fox terrier": "Wire Fox Terrier",
    "smooth fox terrier": "Fox Terrier (Smooth)",
    "black and tan coonhound": "Black and Tan Coonhound",
    "german short haired pointer": "German Shorthaired Pointer",
    "german long haired pointer": "German Longhaired Pointer",
    "curly coated retriever": "Curly-Coated Retriever",
    "flat coated retriever": "Flat-Coated Retriever",
    "yorkshire terrier": "Yorkshire Terrier",
    "welsh springer spaniel": "Welsh Springer Spaniel",
    "english springer": "English Springer Spaniel",
    "standard poodle": "Poodle (Standard)",
    "miniature poodle": "Poodle (Miniature)",
    "toy poodle": "Poodle (Toy)",
    "bluetick": "Bluetick Coonhound",
    "walker hound": "Treeing Walker Coonhound",
    "clumber": "Clumber Spaniel",
}

def _precompute_model_to_akc_map(
    model_labels: List[str],
    akc_display_by_norm: Dict[str, str]
) -> Tuple[Dict[str, str], List[str]]:
    """
    Map each raw model label to an AKC display name.

    Every label is normalized with _canonical_norm and resolved by the first
    strategy that produces a key present in ``akc_display_by_norm``:
      1. the normalized label itself;
      2. its ALIAS_DIRECT entry;
      3. a variant flip when the first word is in SIZE_VARIANTS
         ("toy poodle" -> "Poodle (Toy)");
      4. the label with a trailing "dog", "terrier" or "hound" removed
         (tried in that order);
      5. the closest fuzzy match at a 0.87 similarity cutoff.

    The mapping is many-to-one: several labels may resolve to the same AKC
    name.

    Args:
        model_labels: raw labels as emitted by the classifier.
        akc_display_by_norm: normalized AKC key -> AKC display name.

    Returns:
        (mapping, unmapped_list): the raw-label -> display-name dict and the
        labels no strategy could resolve, in input order.
    """
    model2akc: Dict[str, str] = {}
    unmapped: List[str] = []

    # Loop-invariant: the fuzzy-match candidate pool is identical for every label.
    fuzzy_pool = list(akc_display_by_norm)
    # Ordered, so the strip strategy is deterministic.
    trailing_generic = (r"\bdog\b$", r"\bterrier\b$", r"\bhound\b$")

    for raw in model_labels:
        norm = _canonical_norm(raw)

        # 1) direct
        if norm in akc_display_by_norm:
            model2akc[raw] = akc_display_by_norm[norm]
            continue

        # 2) alias
        alias = ALIAS_DIRECT.get(norm)
        if alias:
            alias_norm = _canonical_norm(alias)
            if alias_norm in akc_display_by_norm:
                model2akc[raw] = akc_display_by_norm[alias_norm]
                continue

        # 3) safe variant flip ("toy poodle" -> "Poodle (Toy)")
        parts = norm.split(" ", 1)
        if len(parts) == 2 and parts[0] in SIZE_VARIANTS:
            flipped_display = f"{parts[1].title()} ({parts[0].title()})"
            f_norm = _canonical_norm(flipped_display)
            if f_norm in akc_display_by_norm:
                model2akc[raw] = akc_display_by_norm[f_norm]
                continue

        # 4) strip trailing generic tokens and try again.
        # `norm` itself already failed in step 1, so only the stripped forms
        # are tried; an empty leftover (label was just "dog") is never a key.
        stripped_keys = (re.sub(pat, "", norm).strip() for pat in trailing_generic)
        hit = next(
            (akc_display_by_norm[k] for k in stripped_keys if k and k in akc_display_by_norm),
            None,
        )
        if hit:
            model2akc[raw] = hit
            continue

        # 5) fuzzy (final resort; tight cutoff)
        cand = get_close_matches(norm, fuzzy_pool, n=1, cutoff=0.87)
        if cand:
            model2akc[raw] = akc_display_by_norm[cand[0]]
        else:
            unmapped.append(raw)

    return model2akc, unmapped

# Built once at import time so inference only needs a dict lookup per label.
MODEL2AKC_MAP, _UNMAPPED = _precompute_model_to_akc_map(MODEL_LABELS, akc_display_by_norm)
# Report labels that no matching strategy resolved (deduplicated and sorted)
# so they can be added to the alias table.
if _UNMAPPED:
    print(f"[DogBreedID] Unmapped model labels ({len(_UNMAPPED)}): {sorted(set(_UNMAPPED))}")


#------------------- 3) Load inference pipeline----------------------------

# Image-classification pipeline for MODEL_ID, created once at import time and
# reused by predict() for every request.
clf = pipeline(
    task="image-classification",
    model=MODEL_ID
)


# ------------------- 4) UI / inference helpers ---------------------------
def _row_markdown(row: pd.Series) -> str:
    """
    Render one AKC row as markdown paragraphs of the form "**Label:** value".

    The "breed" field, missing values and values that are blank after
    stripping are left out. Labels are the column names with underscores
    turned into spaces, title-cased. When nothing remains, a placeholder
    line is returned instead.
    """
    rendered = []
    for field, value in row.items():
        if field == "breed" or pd.isna(value):
            continue
        cleaned = str(value).strip()
        if cleaned:
            label = field.replace("_", " ").title()
            rendered.append(f"**{label}:** {cleaned}")

    if not rendered:
        return "_No extra AKC info available._"
    return "\n\n".join(rendered)

def _lookup_row_by_display_name(akc_display: str) -> Optional[pd.Series]:
    """
    Return the akc_df row for an AKC display name, or None when unavailable.

    The name is normalized with _canonical_norm and resolved to a row position
    through akc_name_to_idx. None is returned both when the key is unknown and
    when fetching the row fails for any reason.
    """
    row_position = akc_name_to_idx.get(_canonical_norm(akc_display))
    if row_position is not None:
        try:
            return akc_df.iloc[row_position]
        except Exception:
            # Best-effort lookup: a failed fetch is reported as "no row".
            return None
    return None

def predict(image: Image.Image) -> str:
    """
    Classify a dog image and render the result as markdown.

    The output has a detailed section for the top-1 prediction (model label,
    confidence, AKC match and, when the AKC row is found, its fields),
    followed by a table of all returned predictions with their AKC matches.

    Args:
        image: the PIL image from the UI, or None when nothing was uploaded.

    Returns:
        Markdown text. A short plain message is returned instead when there
        is no image, when inference raises, or when the classifier returns
        no predictions.
    """
    # Gradio's Image component passes None when Predict is clicked with no
    # image loaded; say so instead of reporting a misleading inference error.
    if image is None:
        return "Please upload an image first."

    try:
        preds = clf(image, top_k=TOP_K)
    except Exception:
        # UI boundary: log the full traceback server-side, keep the page usable.
        traceback.print_exc()
        return "Inference error. Check model/requirements."

    # Build table of predictions, mapped names, and AKC info for top-1
    if not preds:
        return "No predictions."

    lines = ["# Predictions"]
    # Top-1 detailed info
    top = preds[0]
    raw_label = top.get("label", "Unknown")
    score = float(top.get("score", 0.0))

    akc_display = MODEL2AKC_MAP.get(raw_label)
    header = f"**Model:** {raw_label}  |  **Confidence:** {score:.2%}"
    if akc_display:
        header += f"\n\n**AKC Match:** {akc_display}"
        row = _lookup_row_by_display_name(akc_display)
        if row is not None:
            lines.append(header)
            lines.append("\n" + _row_markdown(row))
        else:
            lines.append(header + "\n\n_AKC row not found._")
    else:
        lines.append(header + "\n\n_No AKC match found (check alias rules)._")

    # Top-K summary table
    lines.append("\n---\n")
    lines.append("### Top Matches")
    lines.append("| Rank | Model Label | Confidence | AKC Match |")
    lines.append("|---:|---|---:|---|")
    for i, p in enumerate(preds, start=1):
        lbl = p.get("label", "Unknown")
        sc = float(p.get("score", 0.0))
        # An em dash marks labels with no precomputed AKC match.
        akc_match = MODEL2AKC_MAP.get(lbl, "—")
        lines.append(f"| {i} | {lbl} | {sc:.2%} | {akc_match} |")

    return "\n".join(lines)

# -----------
# 5) Gradio UI
# -----------
# Custom stylesheet handed to gr.Blocks below.
# NOTE(review): no component in this block sets elem_id="app", so the "#app"
# rule does not appear to match anything here -- confirm whether an elem_id
# was intended.
CSS = """
#app {max-width: 980px; margin: auto;}
"""

# Layout: a title and description on top, then one row with two equal-width
# columns -- image input plus Predict button on the left, markdown result on
# the right.
with gr.Blocks(css=CSS, fill_height=True) as demo:
    gr.Markdown("# Dog Breed ID + AKC Info")
    gr.Markdown(
        f"Upload an image of a dog. The app predicts the breed using '{MODEL_ID}' "
        "and shows breed details from the American Kennel Club dataset. Dataset: https://github.com/tmfilho/akcdata/blob/master/data/akc-data-latest.csv"
    )
    with gr.Row():
        with gr.Column(scale=1):
            # type="pil" makes Gradio hand predict() a PIL image.
            inp = gr.Image(label="Dog image", type="pil")
            btn = gr.Button("Predict", variant="primary")
        with gr.Column(scale=1):
            out = gr.Markdown()

    # Clicking the button runs predict() on the image and shows its markdown.
    btn.click(fn=predict, inputs=inp, outputs=out)

# Start the Gradio server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()