Spaces:

achase25
/

dogBreedIDTest

Sleeping

App Files Files Community

dogBreedIDTest / app.py

achase25

Update app.py

1cf2791 verified 4 months ago

raw

history blame contribute delete

10.6 kB

	# app.py
	# Hugging Face Space: Dog breed classifier with AKC data join
	# a. Loads a vision classifier (image -> breed label)
	# b. Precomputes a robust mapping from model labels (dogmodelbreedlist.json)
	# to AKC display names (akc-data-latest.csv), including variant-flip
	# ("Standard Poodle" vs. "Poodle (Standard)") and a small alias table.
	# c. Uses the mapping at inference time so results are fast and consistent.

	import os
	import re
	import json
	import traceback
	from typing import List, Dict, Tuple, Optional
	import gradio as gr
	import pandas as pd
	from PIL import Image
	from difflib import get_close_matches
	from unicodedata import normalize as _ud_norm
	from transformers import pipeline


	# -----------------------Configuration--------------

	# Each setting falls back to the default shown when the environment variable is unset.
	MODEL_ID = os.environ.get("MODEL_ID", "valentinocc/dog-breed-classifier")  # model id handed to the image-classification pipeline
	DOG_LABELS_PATH = os.environ.get("DOG_LABELS_PATH", "dogmodelbreedlist.json")  # JSON file holding the model's label list
	AKC_CSV_PATH = os.environ.get("AKC_CSV_PATH", "akc-data-latest.csv")  # AKC breed table (CSV)
	TOP_K = int(os.environ.get("TOP_K", "5"))  # number of predictions requested per image


	# ----------------1) AKC CSV load + breed indexing----------------

	def _choose_akc_breed_col(df: pd.DataFrame) -> str:
	"""
	pick the AKC breed column.
	Prefer columns containing 'breed', else a 'name'ish column, else first object column.
	"""
	cols = list(df.columns)
	lower = [c.lower() for c in cols]

	# strong preferred
	for c in cols:
	if "breed" in c.lower():
	return c
	# fallback
	for c in cols:
	cl = c.lower()
	if cl in {"name", "breed_name", "title", "akc_breed"} or "name" in cl:
	return c
	# last resort: first likely string column
	for c in cols:
	if pd.api.types.is_object_dtype(df[c]):
	return c
	# absolute fallback
	return cols[0]

	def _canonical_norm(s: str) -> str:
	"""
	Strong key normalizer: strip accents, lowercase, collapse punctuation/spaces.
	"""
	s = _ud_norm("NFKD", str(s)).encode("ascii", "ignore").decode("ascii")
	s = s.lower().strip()
	s = re.sub(r"[’'`]", "", s)
	s = re.sub(r"[-–—_/]", " ", s)
	s = re.sub(r"[()]", " ", s)
	s = re.sub(r"[^a-z0-9& ]+", " ", s)
	s = re.sub(r"\s+", " ", s).strip()
	return s

	def _load_akc_table(path: str) -> Tuple[pd.DataFrame, Dict[str, int], Dict[str, str]]:
	    """
	    Load the AKC CSV and build the lookup tables used at inference time.

	    Returns a tuple of:
	    - the DataFrame, with the detected breed column renamed to "breed";
	    - normalized name -> positional row index (suitable for ``DataFrame.iloc``);
	    - normalized name -> AKC display name.

	    Both maps hold every breed under its own normalized name and, for names
	    shaped like "Base (Variant)", also under the flipped "variant base" key.
	    A breed's own name always wins over a flipped key: own names are
	    assigned directly, flipped keys only via ``setdefault``.
	    """
	    df = pd.read_csv(path)
	    breed_col = _choose_akc_breed_col(df)
	    # rename() returns a new frame, so the frame read above is not mutated.
	    df = df.rename(columns={breed_col: "breed"})

	    akc_display_by_norm: Dict[str, str] = {}
	    akc_name_to_idx: Dict[str, int] = {}

	    # "Poodle (Standard)" -> base="Poodle", variant="Standard".
	    # The parentheses must be escaped to be matched literally.
	    variant_re = re.compile(r"^(.*)\s\(([^)]+)\)$")

	    for i, name in enumerate(df["breed"].astype(str).tolist()):
	        n = _canonical_norm(name)
	        akc_display_by_norm[n] = name
	        akc_name_to_idx[n] = i

	        # Flip "Poodle (Standard)" -> "standard poodle".
	        m = variant_re.match(name.strip())
	        if m:
	            base, var = m.group(1), m.group(2)
	            flip = _canonical_norm(f"{var} {base}")
	            akc_display_by_norm.setdefault(flip, name)
	            akc_name_to_idx.setdefault(flip, i)

	    return df, akc_name_to_idx, akc_display_by_norm

	# Module-level load: runs once at import and publishes the AKC table plus both
	# lookup maps (normalized name -> row index, normalized name -> display name).
	akc_df, akc_name_to_idx, akc_display_by_norm = _load_akc_table(AKC_CSV_PATH)


	# -------------2) Model label list + precomputed mapping to increase speed---------------

	def _read_model_labels(path: str) -> List[str]:
	with open(path, "r") as f:
	j = json.load(f)
	if isinstance(j, dict) and "id2label" in j:
	return list(j["id2label"].values())
	if isinstance(j, dict) and "labels" in j:
	return j["labels"]
	if isinstance(j, list):
	return j
	raise ValueError("dogmodelbreedlist.json must be a list or have id2label/labels")

	# Label list read from DOG_LABELS_PATH once at import; it is the input to the
	# label -> AKC mapping precomputed further down.
	MODEL_LABELS: List[str] = _read_model_labels(DOG_LABELS_PATH)

	# Leading size/coat words that may be flipped into a parenthesised suffix
	# when matching against AKC names ("toy poodle" -> "Poodle (Toy)").
	SIZE_VARIANTS = {
	    # sizes
	    "toy",
	    "miniature",
	    "standard",
	    "giant",
	    "medium",
	    "small",
	    "large",
	    # coats
	    "smooth",
	    "wire",
	    "longhaired",
	    "shorthaired",
	    "wirehaired",
	}

	# Focused alias list for known troublemakers.
	# Keys are looked up with the *normalized* model label (lowercase, single
	# spaces, no punctuation), so every key must already be in that form.
	# Values are AKC display names.
	ALIAS_DIRECT: Dict[str, str] = {
	    "eskimo dog": "American Eskimo Dog",
	    # This key used to appear twice; only the later value ever took effect,
	    # so that value is the one kept.
	    "wire haired fox terrier": "Wire Fox Terrier",
	    "smooth fox terrier": "Fox Terrier (Smooth)",
	    "black and tan coonhound": "Black and Tan Coonhound",
	    "german short haired pointer": "German Shorthaired Pointer",
	    "german long haired pointer": "German Longhaired Pointer",
	    "curly coated retriever": "Curly-Coated Retriever",
	    "flat coated retriever": "Flat-Coated Retriever",
	    "yorkshire terrier": "Yorkshire Terrier",
	    "welsh springer spaniel": "Welsh Springer Spaniel",
	    "english springer": "English Springer Spaniel",
	    "standard poodle": "Poodle (Standard)",
	    "miniature poodle": "Poodle (Miniature)",
	    "toy poodle": "Poodle (Toy)",
	    "bluetick": "Bluetick Coonhound",
	    # Lowercased: "walker Hound" could never equal a normalized label.
	    "walker hound": "Treeing Walker Coonhound",
	    "clumber": "Clumber Spaniel",
	}

	def _resolve_akc_display(
	    norm: str,
	    akc_display_by_norm: Dict[str, str],
	    fuzzy_keys: List[str]
	) -> Optional[str]:
	    """
	    Resolve one normalized model label to an AKC display name, or None.

	    Strategies are tried in order; the first hit wins:
	    direct key, alias table, size/coat variant flip, trailing generic-word
	    strip, then a tight fuzzy match.
	    """
	    # 1) direct
	    if norm in akc_display_by_norm:
	        return akc_display_by_norm[norm]

	    # 2) alias
	    alias = ALIAS_DIRECT.get(norm)
	    if alias:
	        alias_norm = _canonical_norm(alias)
	        if alias_norm in akc_display_by_norm:
	            return akc_display_by_norm[alias_norm]

	    # 3) safe variant flip ("toy poodle" -> "Poodle (Toy)").
	    # norm is already normalized, so the normalized form of
	    # "Base (Variant)" is simply "base variant".
	    parts = norm.split(" ", 1)
	    if len(parts) == 2 and parts[0] in SIZE_VARIANTS:
	        f_norm = f"{parts[1]} {parts[0]}"
	        if f_norm in akc_display_by_norm:
	            return akc_display_by_norm[f_norm]

	    # 4) strip one trailing generic word and try again. A fixed order keeps
	    # the result deterministic; an empty remainder is never a valid key.
	    for generic in ("dog", "terrier", "hound"):
	        stripped = re.sub(rf"\b{generic}\b$", "", norm).strip()
	        if stripped and stripped in akc_display_by_norm:
	            return akc_display_by_norm[stripped]

	    # 5) fuzzy (final resort; tight cutoff)
	    cand = get_close_matches(norm, fuzzy_keys, n=1, cutoff=0.87)
	    if cand:
	        return akc_display_by_norm[cand[0]]
	    return None


	def _precompute_model_to_akc_map(
	    model_labels: List[str],
	    akc_display_by_norm: Dict[str, str]
	) -> Tuple[Dict[str, str], List[str]]:
	    """
	    Build a map: raw model label -> AKC display name.

	    Args:
	        model_labels: raw labels as emitted by the model.
	        akc_display_by_norm: normalized AKC name -> AKC display name.

	    Returns:
	        (mapping, unmapped_list) where unmapped_list holds, in input order,
	        every raw label for which no strategy produced a match.
	    """
	    model2akc: Dict[str, str] = {}
	    unmapped: List[str] = []
	    # Loop-invariant candidate pool for the fuzzy fallback.
	    fuzzy_keys = list(akc_display_by_norm)

	    for raw in model_labels:
	        display = _resolve_akc_display(_canonical_norm(raw), akc_display_by_norm, fuzzy_keys)
	        if display:
	            model2akc[raw] = display
	        else:
	            unmapped.append(raw)

	    return model2akc, unmapped

	# Build the label -> AKC map once at import and report labels left unmapped.
	MODEL2AKC_MAP, _UNMAPPED = _precompute_model_to_akc_map(MODEL_LABELS, akc_display_by_norm)
	if _UNMAPPED:
	    _unmapped_unique = sorted(set(_UNMAPPED))
	    print(f"[DogBreedID] Unmapped model labels ({len(_UNMAPPED)}): {_unmapped_unique}")


	#------------------- 3) Load inference pipeline----------------------------

	# Image-classification pipeline for MODEL_ID, created once at module level;
	# predict() calls this single instance for every request.
	clf = pipeline(
	task="image-classification",
	model=MODEL_ID
	)


	# ------------------- 4) UI / inference helpers ---------------------------
	def _row_markdown(row: pd.Series) -> str:
	# Render AKC row as markdown
	parts = []
	for col in row.index:
	if col == "breed":
	continue
	val = row[col]
	if pd.isna(val):
	continue
	text = str(val).strip()
	if not text:
	continue
	parts.append(f"{col.replace('_', ' ').title()}: {text}")
	return "\n\n".join(parts) if parts else "_No extra AKC info available._"

	def _lookup_row_by_display_name(akc_display: str) -> Optional[pd.Series]:
	    """
	    Return the AKC row for *akc_display*, or None when it cannot be found.

	    The display name is normalized and looked up in ``akc_name_to_idx``;
	    the resulting position is then read from ``akc_df`` with ``iloc``.
	    Any error raised by that read also yields None.
	    """
	    position = akc_name_to_idx.get(_canonical_norm(akc_display))
	    if position is None:
	        return None
	    try:
	        found = akc_df.iloc[position]
	    except Exception:
	        found = None
	    return found

	def predict(image: Image.Image) -> str:
	    """
	    Classify *image* and return a Markdown report.

	    The report has a detailed section for the top prediction (model label,
	    confidence, AKC match and the AKC row when one is found), followed by a
	    table with every returned prediction.

	    A short message is returned instead when no image was supplied, when
	    inference raises (the traceback is printed), or when the model returns
	    no predictions.
	    """
	    # No image supplied: say so instead of reporting a model failure.
	    if image is None:
	        return "Please upload an image first."

	    try:
	        preds = clf(image, top_k=TOP_K)
	    except Exception:
	        traceback.print_exc()
	        return "Inference error. Check model/requirements."

	    if not preds:
	        return "No predictions."

	    def _cell(value) -> str:
	        # A literal "|" inside a cell would split the table row; escape it.
	        return str(value).replace("|", "\\|")

	    lines = ["# Predictions"]

	    # Top-1 detailed info
	    top = preds[0]
	    raw_label = top.get("label", "Unknown")
	    score = float(top.get("score", 0.0))

	    akc_display = MODEL2AKC_MAP.get(raw_label)
	    header = f"Model: {raw_label} | Confidence: {score:.2%}"
	    if not akc_display:
	        lines.append(header + "\n\n_No AKC match found (check alias rules)._")
	    else:
	        header += f"\n\nAKC Match: {akc_display}"
	        row = _lookup_row_by_display_name(akc_display)
	        if row is None:
	            lines.append(header + "\n\n_AKC row not found._")
	        else:
	            lines.append(header)
	            lines.append("\n" + _row_markdown(row))

	    # Top-K summary table. Pipes are written unescaped so Markdown sees
	    # real column separators.
	    lines.append("\n---\n")
	    lines.append("### Top Matches")
	    lines.append("| Rank | Model Label | Confidence | AKC Match |")
	    lines.append("|---:|---|---:|---|")
	    for rank, pred in enumerate(preds, start=1):
	        lbl = pred.get("label", "Unknown")
	        sc = float(pred.get("score", 0.0))
	        akc_match = MODEL2AKC_MAP.get(lbl, "—")
	        lines.append(f"| {rank} | {_cell(lbl)} | {sc:.2%} | {_cell(akc_match)} |")

	    return "\n".join(lines)

	# -----------
	# 5) Gradio UI
	# -----------
	# NOTE(review): no component created in this block sets elem_id="app", so the
	# "#app" rule may not match anything — confirm.
	CSS = """
	#app {max-width: 980px; margin: auto;}
	"""

	# Page layout: title and description on top, then one row with the image
	# input and Predict button on the left and the Markdown report on the right.
	with gr.Blocks(css=CSS, fill_height=True) as demo:
	    gr.Markdown("# Dog Breed ID + AKC Info")
	    gr.Markdown(
	        f"Upload an image of a dog. The app predicts the breed using '{MODEL_ID}' "
	        "and shows breed details from the American Kennel Club dataset. Dataset: https://github.com/tmfilho/akcdata/blob/master/data/akc-data-latest.csv"
	    )

	    with gr.Row():
	        # Left column: input controls.
	        with gr.Column(scale=1):
	            inp = gr.Image(label="Dog image", type="pil")
	            btn = gr.Button("Predict", variant="primary")
	        # Right column: rendered report.
	        with gr.Column(scale=1):
	            out = gr.Markdown()

	    # Clicking the button sends the uploaded image to predict() and shows its Markdown.
	    btn.click(predict, inputs=inp, outputs=out)


	# Start the web server only when this file is run as a script.
	if __name__ == "__main__":
	    demo.launch()