Spaces:

amar6de2
/

VisionBite

Sleeping

App Files Files Community

amar6de2 committed on Jul 1, 2025

Commit

6778ab8

verified ·

1 Parent(s): 01a5409

Update app.py

Browse files

Files changed (1) hide show

app.py +97 -48

app.py CHANGED Viewed

@@ -1,65 +1,114 @@
-import torch
-import torch.nn.functional as F
-import torchvision.transforms as transforms
-from torchvision.models import vit_b_16
-from torchvision.transforms import v2
-from PIL import Image
 import gradio as gr
 import os
-# Load pretrained model
-model = vit_b_16(weights='DEFAULT')
-model.eval()
-# Transformation for ViT
-vit_transforms = v2.Compose([
-    v2.Resize((224, 224)),
-    v2.ToImage(),  # Ensure proper image type
-    v2.ToDtype(torch.float32, scale=True),
-    v2.Normalize(mean=[0.485, 0.456, 0.406],
-                 std=[0.229, 0.224, 0.225]),
-])
-# Class labels (example)
-class_labels = [f"Class {i}" for i in range(1000)]  # Replace with actual class names if you have them
-def predict(img):
-    # Defensive: Ensure image is PIL
-    if isinstance(img, torch.Tensor):
-        raise ValueError("Expected PIL.Image, got torch.Tensor.")
-    elif isinstance(img, np.ndarray):
-        img = Image.fromarray(img)
-    elif not isinstance(img, Image.Image):
-        raise ValueError("Input is not a valid PIL image")
-    # Transform and run through model
-    img_tensor = vit_transforms(img).unsqueeze(0)
-    with torch.no_grad():
-        outputs = model(img_tensor)
-        probs = F.softmax(outputs[0], dim=0)
-    top5 = torch.topk(probs, 5)
-    results = {class_labels[i]: float(probs[i]) for i in top5.indices}
-    return results
-# Set up Gradio interface
-image_input = gr.Image(type="pil", label="Upload JPEG Image")
-label_output = gr.Label(num_top_classes=5)
-example_images = ["images/sample1.jpg", "images/sample2.jpg"]
-example_images = [img for img in example_images if os.path.exists(img)]  # filter missing files
 demo = gr.Interface(
     fn=predict,
-    inputs=image_input,
-    outputs=label_output,
-    examples=example_images,
-    title="ViT Image Classifier",
-    description="Upload a JPEG image to classify it using Vision Transformer (ViT-B16)."
 )
-if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)

+### 1. Imports and class names setup ###
 import gradio as gr
 import os
+import torch
+import numpy as np
+from PIL import Image
+from model import create_vit_model  # Make sure this function exists in model.py
+from timeit import default_timer as timer
+from typing import Tuple, Dict
+# Setup class names (or hardcode them if needed)
+class_names = ["apple_pie", "baby_back_ribs", "baklava", "beef_carpaccio", "beef_tartare", "beet_salad",
+               "beignets", "bibimbap", "biryani", "bread_pudding", "breakfast_burrito", "bruschetta",
+               "caesar_salad", "cannoli", "caprese_salad", "carrot_cake", "ceviche", "chai", "chapati",
+               "cheese_plate", "cheesecake", "chicken_curry", "chicken_quesadilla", "chicken_wings",
+               "chocolate_cake", "chocolate_mousse", "chole_bhature", "churros", "clam_chowder",
+               "club_sandwich", "crab_cakes", "creme_brulee", "croque_madame", "cup_cakes", "dabeli",
+               "dal", "deviled_eggs", "dhokla", "donuts", "dosa", "dumplings", "edamame", "eggs_benedict",
+               "escargots", "falafel", "filet_mignon", "fish_and_chips", "foie_gras", "french_fries",
+               "french_onion_soup", "french_toast", "fried_calamari", "fried_rice", "frozen_yogurt",
+               "garlic_bread", "gnocchi", "greek_salad", "grilled_cheese_sandwich", "grilled_salmon",
+               "guacamole", "gyoza", "hamburger", "hot_and_sour_soup", "hot_dog", "huevos_rancheros",
+               "hummus", "ice_cream", "idli", "jalebi", "kathi_rolls", "kofta", "kulfi", "lasagna",
+               "lobster_bisque", "lobster_roll_sandwich", "macaroni_and_cheese", "macarons", "miso_soup",
+               "momos", "mussels", "naan", "nachos", "omelette", "onion_rings", "oysters", "pad_thai",
+               "paella", "pakoda", "pancakes", "pani_puri", "panna_cotta", "panner_butter_masala",
+               "pav_bhaji", "peking_duck", "pho", "pizza", "pork_chop", "poutine", "prime_rib",
+               "pulled_pork_sandwich", "ramen", "ravioli", "red_velvet_cake", "risotto", "samosa",
+               "sashimi", "scallops", "seaweed_salad", "shrimp_and_grits", "spaghetti_bolognese",
+               "spaghetti_carbonara", "spring_rolls", "steak", "strawberry_shortcake", "sushi",
+               "tacos", "takoyaki", "tiramisu", "tuna_tartare", "vadapav", "waffles"]
+### 2. Model and transforms setup ###
+# Create the model and transforms
+vit, vit_transforms = create_vit_model(num_classes=len(class_names))
+# Load saved model weights (assumes model is trained and .pth file is in the correct path)
+vit.load_state_dict(torch.load("vit_epoch_2.pth", map_location=torch.device("cpu")))
+### 3. Prediction function ###
+def predict(img) -> Tuple[Dict[str, float], float]:
+    """Transforms and performs a prediction on img and returns prediction and time taken."""
+    from PIL import UnidentifiedImageError
+    try:
+        # Convert ndarray to PIL if needed
+        if isinstance(img, np.ndarray):
+            img = Image.fromarray(img)
+        # Normalize to RGB (e.g. RGBA or grayscale uploads) before transforming
+        if img.mode != "RGB":
+            img = img.convert("RGB")
+        # Start timer
+        start_time = timer()
+        # Transform and add batch dimension
+        img_tensor = vit_transforms(img).unsqueeze(0)
+        # Inference
+        vit.eval()
+        with torch.inference_mode():
+            pred_probs = torch.softmax(vit(img_tensor), dim=1)
+        pred_labels_and_probs = {
+            class_names[i]: float(pred_probs[0][i])
+            for i in range(len(class_names))
+        }
+        pred_time = round(timer() - start_time, 5)
+        return pred_labels_and_probs, pred_time
+    except (UnidentifiedImageError, TypeError, ValueError) as e:
+        return {"Error": f"Invalid image input: {str(e)}"}, 0.0
+### 4. Gradio app setup ###
+# Title, description, and article text
+title = "VisionBite 🍕🥩🍣"
+description = (
+    "A Vision Transformer (ViT-Base-16) model trained to classify images of food "
+    "into 121 distinct categories. The model uses a transformer-based architecture "
+    "to extract visual features and achieve accurate classification across diverse food items."
+)
+article = (
+    "Model trained on the [Food121 dataset](https://huggingface.co/datasets/ItsNotRohit/Food121) "
+    "with 95% top-5 prediction accuracy."
+)
+# Setup example images (if available)
+if os.path.exists("examples"):
+    example_list = [["examples/" + f] for f in os.listdir("examples") if f.endswith((".jpg", ".jpeg", ".png"))]
+else:
+    example_list = []
+# Create Gradio interface
 demo = gr.Interface(
     fn=predict,
+    inputs=gr.Image(type="pil"),
+    outputs=[
+        gr.Label(num_top_classes=5, label="Top Predictions"),
+        gr.Number(label="Prediction time (s)")
+    ],
+    examples=example_list,
+    title=title,
+    description=description,
+    article=article
 )
+# Launch app
+demo.launch()