Spaces:
Sleeping
Sleeping
| # app.py | |
| import streamlit as st | |
| from PIL import Image | |
| import io | |
| import torch | |
| from transformers import CLIPProcessor, CLIPModel | |
| import os | |
| import os | |
| import streamlit as st | |
| from PIL import Image | |
| import torch | |
| from transformers import CLIPProcessor, CLIPModel | |
| import io | |
# Fix for Hugging Face Spaces: the application directory may not be
# writable there, so the model cache is kept under /tmp instead.
#
# A later group of statements used to point the same variables at "./models",
# which silently undid this fix; only the /tmp location is configured now.
#
# NOTE(review): these variables are set after `transformers` has already been
# imported above. Confirm the library still honours them at that point; if
# not, pass `cache_dir` explicitly to `from_pretrained`.
cache_dir = "/tmp/models"
os.environ["TRANSFORMERS_CACHE"] = cache_dir
os.environ["HF_HOME"] = cache_dir
os.makedirs(cache_dir, exist_ok=True)
# Set the browser tab title and use Streamlit's centered page layout.
st.set_page_config(
    page_title="Hotdog or Not Hotdog",
    layout="centered",
)
@st.cache_resource
def load_model():
    """Load the CLIP model and its processor.

    Streamlit re-executes the whole script on every widget interaction, so
    the loader is wrapped in ``st.cache_resource``: the weights are loaded
    once and the same objects are reused on subsequent reruns.

    The cache location is passed explicitly through ``cache_dir`` (the
    module-level path configured near the top of the file) rather than
    relying only on environment variables, which are set after
    ``transformers`` has been imported.

    Returns:
        A ``(model, processor)`` tuple for ``openai/clip-vit-base-patch32``.
    """
    model_id = "openai/clip-vit-base-patch32"
    model = CLIPModel.from_pretrained(model_id, cache_dir=cache_dir)
    processor = CLIPProcessor.from_pretrained(model_id, cache_dir=cache_dir)
    return model, processor


# Module-level handles used by predict_hotdog().
model, processor = load_model()
def predict_hotdog(image: Image.Image, texts=("a photo of a hotdog", "a photo of not a hotdog")):
    """Score an image against a set of text labels with CLIP.

    Args:
        image: The PIL image to classify; it is converted to RGB first.
        texts: Candidate text descriptions to compare the image against.

    Returns:
        A tuple ``(probs, logits, labels)``: ``probs`` is the softmax over
        the supplied labels, ``logits`` holds the raw image-to-text logits
        (both NumPy arrays with the image batch dimension squeezed out), and
        ``labels`` is ``texts`` as a list.
    """
    # Normalise the colour mode so palette/greyscale/alpha images also work.
    rgb_image = image.convert("RGB")
    batch = processor(
        text=list(texts),
        images=rgb_image,
        return_tensors="pt",
        padding=True,
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        result = model(**batch)

    # logits_per_image is laid out as (image batch, text batch).
    image_text_logits = result.logits_per_image
    probabilities = image_text_logits.softmax(dim=1).squeeze(0).cpu().numpy()
    raw_logits = image_text_logits.squeeze(0).cpu().numpy()
    return probabilities, raw_logits, list(texts)
# UI: page header, the upload widget and the classification options.
# The leading character of the title had been garbled by an encoding
# round-trip; it is restored to the hot dog emoji.
st.title("🌭 Hotdog or Not Hotdog")
st.write("Upload an image and the model will tell you whether it's a hotdog (and how confident it is).")

# Two equal-width columns: upload on the left, options on the right.
col1, col2 = st.columns([1, 1])
with col1:
    uploaded = st.file_uploader("Upload image", type=["jpg", "jpeg", "png"])
    st.caption("Try pictures of hotdogs, sandwiches, or anything weird.")
with col2:
    st.markdown("**Options**")
    # Slider arguments: min 0.5, max 0.95, default 0.70, step 0.01.
    threshold = st.slider("Hotdog confidence threshold", 0.5, 0.95, 0.70, 0.01)
    more_labels = st.checkbox("Use more descriptive labels (may improve robustness)", value=True)

# Candidate text labels handed to CLIP. The descriptive set adds several
# hotdog phrasings plus a few concrete non-hotdog foods.
if more_labels:
    texts = (
        "a photo of a hotdog",
        "a photo of a hot dog on a bun",
        "a picture of a frankfurter",
        "a photo of not a hotdog",
        "a picture of a sandwich",
        "a photo of a burger",
        "a photo of a plate of salad",
    )
else:
    texts = ("a photo of a hotdog", "a photo of not a hotdog")
def _is_hotdog_label(text):
    """Return True when *text* describes a hotdog and is not a negated label.

    A plain substring test would classify "a photo of not a hotdog" as a
    hotdog label, so labels containing the word "not" are rejected first.
    """
    lowered = text.lower()
    if "not" in lowered.split():
        return False
    return any(term in lowered for term in ("hotdog", "hot dog", "frankfurter"))


if uploaded is not None:
    # Decode eagerly so that an unreadable or truncated upload is reported
    # to the user instead of crashing the script part-way through.
    try:
        image = Image.open(io.BytesIO(uploaded.read()))
        image.load()
    except OSError:
        image = None
        st.error("Could not read the uploaded file as an image. Please upload a valid JPG or PNG.")

    if image is not None:
        st.image(image, caption="Uploaded image", use_column_width=True)
        with st.spinner("Running CLIP..."):
            probs, logits, labels = predict_hotdog(image, texts=texts)

        # Split the labels into "hotdog" and "everything else". Negated
        # labels such as "a photo of not a hotdog" belong to the second group.
        hotdog_label_indices = [i for i, t in enumerate(labels) if _is_hotdog_label(t)]
        other_indices = [i for i, t in enumerate(labels) if not _is_hotdog_label(t)]

        # Aggregate probabilities per group; fall back to the first label
        # (or the complement) when a group is empty.
        hotdog_prob = float(probs[hotdog_label_indices].sum()) if hotdog_label_indices else float(probs[0])
        not_hotdog_prob = float(probs[other_indices].sum()) if other_indices else float(1.0 - hotdog_prob)

        st.markdown("### Results")
        st.write(f"**Hotdog score:** {hotdog_prob:.3f}")
        st.write(f"**Not-hotdog score:** {not_hotdog_prob:.3f}")
        if hotdog_prob >= threshold:
            st.success(f"🌭 Hotdog detected (confidence {hotdog_prob:.2%})")
        else:
            st.info(f"🚫 Not a hotdog (hotdog confidence {hotdog_prob:.2%})")

        st.markdown("---")
        st.markdown("#### Label breakdown (model text labels -> probability)")
        # Show the top 6 labels, highest probability first.
        sorted_idx = probs.argsort()[::-1]
        for i in sorted_idx[:6]:
            st.write(f"- **{labels[i]}** — {probs[i]:.3f}")

        st.markdown("#### Debug: raw logits (optional)")
        if st.checkbox("Show raw logits"):
            for lbl, logit, prob in zip(labels, logits, probs):
                st.write(f"{lbl}: logits={logit:.4f}, prob={prob:.4f}")
else:
    st.info("Upload an image to start. Or try the example images in your machine.")
    st.caption("Tip: If you want to run from your camera locally, use the 'Browse files' dropdown in Streamlit and select camera capture (browser dependent).")
# Footer: a short explanation of how the score is produced. The garbled
# separator character in the text is restored to an em dash.
st.markdown("---")
st.markdown("**How it works**: The app uses CLIP to compute similarity between the uploaded image and a set of text descriptions. The text descriptions that mention hotdog are summed to produce a hotdog score. CLIP is robust but not perfect — try different angles/zoom levels for best results.")