hotdog / src /streamlit_app.py
kuldeep0204's picture
Update src/streamlit_app.py
fce5ed9 verified
# streamlit_app.py
import streamlit as st
from PIL import Image
import io
import torch
from transformers import CLIPProcessor, CLIPModel
import os
import os
import streamlit as st
from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel
import io
# Fix for Hugging Face Spaces: keep downloaded model files under /tmp.
# The cache location is configured exactly once. Do not re-point these
# variables at a relative path such as "./models" afterwards: the last
# assignment wins, and the app folder is presumably not writable on Spaces
# (which is what the /tmp location works around).
# NOTE(review): the Hugging Face libraries appear to resolve their default
# cache paths at import time, so these variables may only take full effect if
# they are set before `transformers` is imported - confirm.
cache_dir = "/tmp/models"
os.environ["TRANSFORMERS_CACHE"] = cache_dir
os.environ["HF_HOME"] = cache_dir
os.makedirs(cache_dir, exist_ok=True)
# Page-level settings: the page title and a centered layout.
st.set_page_config(
    page_title="Hotdog or Not Hotdog",
    layout="centered",
)
@st.cache_resource
def load_model():
    """Load the CLIP model and its processor.

    The function is wrapped in ``st.cache_resource`` so the loaded objects are
    reused instead of being re-created each time the script runs.

    Returns:
        A ``(model, processor)`` tuple for ``openai/clip-vit-base-patch32``.
    """
    model_id = "openai/clip-vit-base-patch32"
    # Pass the cache location explicitly. This file sets TRANSFORMERS_CACHE
    # after `transformers` has been imported, and the library presumably reads
    # that variable at import time, so the environment setting alone may be
    # ignored. When the variable is unset this is None, which keeps the
    # library's default cache location.
    model_cache = os.environ.get("TRANSFORMERS_CACHE")
    model = CLIPModel.from_pretrained(model_id, cache_dir=model_cache)
    processor = CLIPProcessor.from_pretrained(model_id, cache_dir=model_cache)
    return model, processor


# Module-level handles used by predict_hotdog().
model, processor = load_model()
def predict_hotdog(image: Image.Image, texts=("a photo of a hotdog", "a photo of not a hotdog")):
    """Score a single image against a set of text prompts with CLIP.

    Uses the module-level ``processor`` and ``model`` objects.

    Args:
        image: The PIL image to classify; it is converted to RGB first.
        texts: Text prompts to compare the image against.

    Returns:
        A tuple ``(probs, logits, labels)``: ``probs`` is the softmax over the
        prompts, ``logits`` holds the raw image-to-text logits (both as NumPy
        arrays with the image batch dimension squeezed out), and ``labels`` is
        ``texts`` as a list.
    """
    # Normalise the colour mode so any input image format is handled.
    rgb_image = image.convert("RGB")
    batch = processor(text=list(texts), images=rgb_image, return_tensors="pt", padding=True)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        result = model(**batch)

    # logits_per_image has shape (batch_image, batch_text).
    image_logits = result.logits_per_image
    probabilities = image_logits.softmax(dim=1)

    return (
        probabilities.squeeze(0).cpu().numpy(),
        image_logits.squeeze(0).cpu().numpy(),
        list(texts),
    )
# UI: page header, then the upload widget and the options in two equal columns.
st.title("🌭 Hotdog or Not Hotdog")
st.write("Upload an image and the model will tell you whether it's a hotdog (and how confident it is).")

col1, col2 = st.columns([1, 1])

# Left column: image upload.
with col1:
    uploaded = st.file_uploader("Upload image", type=["jpg", "jpeg", "png"])
    st.caption("Try pictures of hotdogs, sandwiches, or anything weird.")

# Right column: decision threshold and choice of prompt set.
with col2:
    st.markdown("**Options**")
    threshold = st.slider(
        "Hotdog confidence threshold",
        min_value=0.5,
        max_value=0.95,
        value=0.70,
        step=0.01,
    )
    more_labels = st.checkbox(
        "Use more descriptive labels (may improve robustness)",
        value=True,
    )

# Prompts handed to CLIP: a richer set when the checkbox is ticked,
# otherwise the plain two-prompt pair.
texts = (
    (
        "a photo of a hotdog",
        "a photo of a hot dog on a bun",
        "a picture of a frankfurter",
        "a photo of not a hotdog",
        "a picture of a sandwich",
        "a photo of a burger",
        "a photo of a plate of salad",
    )
    if more_labels
    else ("a photo of a hotdog", "a photo of not a hotdog")
)
def _is_hotdog_label(label: str) -> bool:
    """Return True when *label* is a positive hotdog prompt.

    A negated prompt such as "a photo of not a hotdog" contains the word
    "hotdog" but describes the opposite class, so it is rejected. Without
    this check the negative prompt would be added to the hotdog score.
    """
    lowered = label.lower()
    if "not" in lowered.split():
        return False
    return any(term in lowered for term in ("hotdog", "hot dog", "frankfurter"))


if uploaded is not None:
    image = Image.open(io.BytesIO(uploaded.read()))
    # NOTE(review): `use_column_width` is reportedly deprecated in recent
    # Streamlit releases - confirm against the pinned version before changing.
    st.image(image, caption="Uploaded image", use_column_width=True)

    with st.spinner("Running CLIP..."):
        probs, logits, labels = predict_hotdog(image, texts=texts)

    # Split the prompts into positive "hotdog" prompts and everything else
    # (including the explicit "not a hotdog" prompt).
    hotdog_label_indices = [i for i, t in enumerate(labels) if _is_hotdog_label(t)]
    hotdog_index_set = set(hotdog_label_indices)
    other_indices = [i for i, _ in enumerate(labels) if i not in hotdog_index_set]

    # Aggregate probabilities per group. The fallbacks cover a prompt set
    # with no hotdog prompt or with no other prompt.
    hotdog_prob = float(probs[hotdog_label_indices].sum()) if hotdog_label_indices else float(probs[0])
    not_hotdog_prob = float(probs[other_indices].sum()) if other_indices else float(1.0 - hotdog_prob)

    st.markdown("### Results")
    st.write(f"**Hotdog score:** {hotdog_prob:.3f}")
    st.write(f"**Not-hotdog score:** {not_hotdog_prob:.3f}")

    if hotdog_prob >= threshold:
        st.success(f"🌭 Hotdog detected (confidence {hotdog_prob:.2%})")
    else:
        st.info(f"🚫 Not a hotdog (hotdog confidence {hotdog_prob:.2%})")

    st.markdown("---")
    st.markdown("#### Label breakdown (model text labels -> probability)")
    # Show the top 6 labels, highest probability first.
    sorted_idx = probs.argsort()[::-1]
    for i in sorted_idx[:6]:
        st.write(f"- **{labels[i]}** — {probs[i]:.3f}")

    st.markdown("#### Debug: raw logits (optional)")
    if st.checkbox("Show raw logits"):
        for i, lbl in enumerate(labels):
            st.write(f"{lbl}: logits={logits[i]:.4f}, prob={probs[i]:.4f}")
else:
    st.info("Upload an image to start. Or try the example images in your machine.")
    st.caption("Tip: If you want to run from your camera locally, use the 'Browse files' dropdown in Streamlit and select camera capture (browser dependent).")

st.markdown("---")
st.markdown("**How it works**: The app uses CLIP to compute similarity between the uploaded image and a set of text descriptions. The text descriptions that mention hotdog are summed to produce a hotdog score. CLIP is robust but not perfect — try different angles/zoom levels for best results.")