Spaces:

kuldeep0204
/

hotdog

Sleeping

App Files Files Community

hotdog / src /streamlit_app.py

kuldeep0204

Update src/streamlit_app.py

fce5ed9 verified 6 months ago

raw

history blame contribute delete

4.85 kB

	# src/streamlit_app.py
	import streamlit as st
	from PIL import Image
	import io
	import torch
	from transformers import CLIPProcessor, CLIPModel
	import os
	import os
	import streamlit as st
	from PIL import Image
	import torch
	from transformers import CLIPProcessor, CLIPModel
	import io

	# ✅ Fix for Hugging Face Spaces: use writable /tmp directory.
	# Both cache variables point at the same location. They are assigned exactly
	# once: re-pointing them at a relative "./models" folder afterwards would
	# undo this fix and try to create a directory inside the app folder.
	# NOTE(review): these variables are assigned after `transformers` has been
	# imported at the top of the file; if the library reads them at import time
	# they will not take effect -- confirm, or move this block above the imports.
	cache_dir = "/tmp/models"
	os.makedirs(cache_dir, exist_ok=True)
	os.environ["TRANSFORMERS_CACHE"] = cache_dir
	os.environ["HF_HOME"] = cache_dir

	# Page-level settings: browser-tab title and a centered layout.
	st.set_page_config(
	    page_title="Hotdog or Not Hotdog",
	    layout="centered",
	)

	@st.cache_resource
	def load_model():
	    """Load the CLIP model and its matching processor.

	    The function is wrapped in ``st.cache_resource`` so the loaded objects
	    are reused instead of being re-created on every script rerun.

	    Returns:
	        A ``(model, processor)`` tuple for the
	        ``openai/clip-vit-base-patch32`` checkpoint.
	    """
	    model_id = "openai/clip-vit-base-patch32"
	    # Pass the cache location explicitly. The cache environment variables are
	    # set after `transformers` is imported, so relying on them alone may leave
	    # the library on its default cache path instead of the writable
	    # directory configured in `cache_dir`.
	    model = CLIPModel.from_pretrained(model_id, cache_dir=cache_dir)
	    processor = CLIPProcessor.from_pretrained(model_id, cache_dir=cache_dir)
	    return model, processor


	# Module-level handles used by predict_hotdog().
	model, processor = load_model()

	def predict_hotdog(image: Image.Image, texts=("a photo of a hotdog", "a photo of not a hotdog")):
	    """Score one image against a set of text prompts with CLIP.

	    Args:
	        image: The PIL image to classify; it is converted to RGB first.
	        texts: Candidate text prompts the image is compared with.

	    Returns:
	        A tuple ``(probs, logits, labels)``: ``probs`` is the softmax over
	        the prompts, ``logits`` holds the raw image-to-text logits (both as
	        NumPy arrays with the leading image axis squeezed out), and
	        ``labels`` is ``texts`` as a list.
	    """
	    rgb_image = image.convert("RGB")  # normalise palette / alpha / grayscale inputs
	    batch = processor(
	        text=list(texts),
	        images=rgb_image,
	        return_tensors="pt",
	        padding=True,
	    )

	    # Inference only, so gradient tracking is switched off.
	    with torch.no_grad():
	        result = model(**batch)

	    # logits_per_image is laid out as (images, texts).
	    image_logits = result.logits_per_image
	    probabilities = image_logits.softmax(dim=1).squeeze(0)
	    raw_scores = image_logits.squeeze(0)
	    return probabilities.cpu().numpy(), raw_scores.cpu().numpy(), list(texts)

	# ---- UI: header ----
	st.title("🌭 Hotdog or Not Hotdog")
	st.write("Upload an image and the model will tell you whether it's a hotdog (and how confident it is).")

	# ---- UI: two equal-width columns (upload on the left, options on the right) ----
	upload_col, options_col = st.columns([1, 1])

	with upload_col:
	    uploaded = st.file_uploader("Upload image", type=["jpg", "jpeg", "png"])
	    st.caption("Try pictures of hotdogs, sandwiches, or anything weird.")

	with options_col:
	    st.markdown("Options")
	    threshold = st.slider(
	        "Hotdog confidence threshold",
	        min_value=0.5,
	        max_value=0.95,
	        value=0.70,
	        step=0.01,
	    )
	    more_labels = st.checkbox(
	        "Use more descriptive labels (may improve robustness)",
	        value=True,
	    )

	# Text prompts handed to CLIP: start from the minimal two-prompt set and
	# switch to the richer set when the checkbox is ticked.
	texts = ("a photo of a hotdog", "a photo of not a hotdog")
	if more_labels:
	    texts = (
	        "a photo of a hotdog",
	        "a photo of a hot dog on a bun",
	        "a picture of a frankfurter",
	        "a photo of not a hotdog",
	        "a picture of a sandwich",
	        "a photo of a burger",
	        "a photo of a plate of salad",
	    )

	def _is_hotdog_label(label):
	    """Return True when *label* positively describes a hotdog.

	    A label qualifies only if it mentions a hotdog term and does not contain
	    the standalone word "not". A plain substring test would also match the
	    negative prompt "a photo of not a hotdog" and count it as a hotdog.
	    """
	    if "not" in label.split():
	        return False
	    return any(term in label for term in ("hotdog", "hot dog", "frankfurter"))


	if uploaded is not None:
	    image = Image.open(io.BytesIO(uploaded.read()))
	    # NOTE(review): newer Streamlit releases may deprecate `use_column_width`;
	    # confirm against the installed version before changing it.
	    st.image(image, caption="Uploaded image", use_column_width=True)

	    with st.spinner("Running CLIP..."):
	        probs, logits, labels = predict_hotdog(image, texts=texts)

	    # Split the prompt indices into "hotdog" prompts and everything else.
	    hotdog_label_indices = [i for i, t in enumerate(labels) if _is_hotdog_label(t)]
	    other_indices = [i for i, t in enumerate(labels) if not _is_hotdog_label(t)]

	    # Aggregate probabilities per group. The fallbacks only apply when one
	    # of the groups is empty.
	    hotdog_prob = float(probs[hotdog_label_indices].sum()) if hotdog_label_indices else float(probs[0])
	    not_hotdog_prob = float(probs[other_indices].sum()) if other_indices else float(1.0 - hotdog_prob)

	    st.markdown("### Results")
	    st.write(f"Hotdog score: {hotdog_prob:.3f}")
	    st.write(f"Not-hotdog score: {not_hotdog_prob:.3f}")

	    # Verdict: compare the summed hotdog probability with the slider value.
	    if hotdog_prob >= threshold:
	        st.success(f"🌭 Hotdog detected (confidence {hotdog_prob:.2%})")
	    else:
	        st.info(f"🚫 Not a hotdog (hotdog confidence {hotdog_prob:.2%})")

	    st.markdown("---")
	    st.markdown("#### Label breakdown (model text labels -> probability)")
	    # Show at most the six most probable prompts, highest first.
	    sorted_idx = probs.argsort()[::-1]
	    for i in sorted_idx[:6]:
	        st.write(f"- {labels[i]} — {probs[i]:.3f}")

	    st.markdown("#### Debug: raw logits (optional)")
	    if st.checkbox("Show raw logits"):
	        for i, lbl in enumerate(labels):
	            st.write(f"{lbl}: logits={logits[i]:.4f}, prob={probs[i]:.4f}")

	else:
	    st.info("Upload an image to start. Or try the example images in your machine.")
	    st.caption("Tip: If you want to run from your camera locally, use the 'Browse files' dropdown in Streamlit and select camera capture (browser dependent).")

	# Footer, rendered whether or not an image has been uploaded.
	st.markdown("---")
	st.markdown("How it works: The app uses CLIP to compute similarity between the uploaded image and a set of text descriptions. The text descriptions that mention hotdog are summed to produce a hotdog score. CLIP is robust but not perfect — try different angles/zoom levels for best results.")