Spaces · Runtime error

Commit · 080763c
1 parent: b4d9017

feat: add two-phase classifier

Files changed:
- app.py (+29 -16)
- assets/croque_madame_0.jpg (+0 -0)
- assets/img_0_0.jpg (+0 -0)
- assets/img_0_1.jpg (+0 -0)
- assets/img_1_2.jpg (+0 -0)
- assets/pancakes_1.jpg (+0 -0)
- assets/shrimp_and_grits_2.jpg (+0 -0)
- models/binary_class/config.json (+36 -0)
- models/{training_args.bin → binary_class/training_args.bin} (+0 -0)
- models/{config.json → multi_class/config.json} (+0 -0)
- models/multi_class/training_args.bin (+0 -0)
- models/pre_processor.py (+0 -111)
- models/preprocessor_config.json (+13 -8)
- requirements.txt (+0 -1)
app.py
CHANGED
@@ -1,4 +1,3 @@
-
 from PIL import Image
 import torch
 from timeit import default_timer as timer
@@ -7,38 +6,52 @@ from transformers import AutoModelForImageClassification, AutoImageProcessor
 import gradio as gr
 from pathlib import Path
 
+DEVICE = 'cuda' if torch.cuda.is_available() else ('mps' if torch.backends.mps.is_available() and torch.backends.mps.is_built() else 'cpu')
+BINARY_PATH = "./models/binary_class"
+MULTICLASS_PATH = "./models/multi_class"
 
-
-
-
-model = AutoModelForImageClassification.from_pretrained("models")
-processor = AutoImageProcessor.from_pretrained("models", use_fast=False, trust_remote_code=True)
+processor = AutoImageProcessor.from_pretrained("./models/", use_fast=False, trust_remote_code=True)
+binary_model = AutoModelForImageClassification.from_pretrained(BINARY_PATH).to(DEVICE)
+multi_class_model = AutoModelForImageClassification.from_pretrained(MULTICLASS_PATH).to(DEVICE)
 
-def predict(image:Image
+def predict(image:Image)-> Tuple[Dict, float]:
     """
-
+    1. Binary pass -> yes / no
+    2. Multi pass -> only if binary == yes
+    Returns: (label_dict | "Not a food item", total_time)
     """
+    if image is None:
+        return "no image provided", 0.0
     start_time = timer()
-
-
-
+    input = processor(images=image, return_tensors="pt").to(DEVICE)
+    binary_model.eval()
+    with torch.inference_mode():
+        logits = binary_model(**input).logits.squeeze()
+    pred_probs = torch.softmax(logits, dim=-1)
+    pred_food_prob = pred_probs[binary_model.config.label2id.get("food", 1)].item()
+    if pred_food_prob < 0.5:
+        elapsed = round(timer() - start_time, 4)
+        return "Not a food item", elapsed
+
+    multi_class_model.eval()
     with torch.inference_mode():
-        logits =
-    pred_probs = torch.softmax(logits, dim=1)
-    pred_label_prob_map = {
+        logits = multi_class_model(**input).logits
+    pred_probs = torch.softmax(logits, dim=-1)
+    pred_label_prob_map = {multi_class_model.config.id2label[i]: pred_probs[0][i].item() for i in range(len(pred_probs[0]))}
     elapsed_time = round(timer() - start_time, 4)
     return pred_label_prob_map, elapsed_time
 
+# get the example images from assets folder
 example_images = list(Path("assets").glob("*.jpg"))
 
 demo = gr.Interface(fn=predict,
                     inputs=gr.Image(type="pil"),
                     outputs=[
                         gr.Label(num_top_classes=5, label="Predicted class"),
-                        gr.Number(label="Prediction time"),
+                        gr.Number(label="Prediction time (sec)"),
                     ],
                     examples=example_images,
                     title="Image Classification App",
                     description="Upload an image to predict the class of the image")
 
-demo.launch()
+demo.launch()
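One detail the hunk does not show is a typing import for the new Tuple[Dict, float] annotation; unless it already sits in the unchanged import lines the hunk skips, app.py needs one. The following is a minimal, self-contained sketch of the same two-phase flow, not the committed code: it assumes the ./models layout introduced in this commit, and the function name and example image path are only illustrative.

# Sketch of the two-phase gate implemented in app.py above (assumptions noted in the lead-in).
from typing import Dict, Tuple          # the import the Tuple[Dict, float] annotation relies on
from timeit import default_timer as timer

import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForImageClassification

processor = AutoImageProcessor.from_pretrained("./models/", use_fast=False)
binary_model = AutoModelForImageClassification.from_pretrained("./models/binary_class").eval()
multi_model = AutoModelForImageClassification.from_pretrained("./models/multi_class").eval()

def two_phase_predict(image: Image.Image) -> Tuple[Dict, float]:
    """Phase 1: binary food/non-food gate. Phase 2: per-class probabilities, only if phase 1 passes."""
    start = timer()
    inputs = processor(images=image, return_tensors="pt")
    with torch.inference_mode():
        food_idx = binary_model.config.label2id.get("food", 1)
        food_prob = torch.softmax(binary_model(**inputs).logits, dim=-1)[0, food_idx].item()
        if food_prob < 0.5:                                  # reject non-food before the expensive pass
            return "Not a food item", round(timer() - start, 4)
        probs = torch.softmax(multi_model(**inputs).logits, dim=-1)[0]
    labels = {multi_model.config.id2label[i]: p.item() for i, p in enumerate(probs)}
    return labels, round(timer() - start, 4)

# one of the example images added in this commit
print(two_phase_predict(Image.open("assets/img_0_0.jpg")))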
assets/croque_madame_0.jpg
DELETED (binary file, 46.6 kB)

assets/img_0_0.jpg
ADDED

assets/img_0_1.jpg
ADDED

assets/img_1_2.jpg
ADDED

assets/pancakes_1.jpg
DELETED (binary file, 45 kB)

assets/shrimp_and_grits_2.jpg
DELETED (binary file, 54.5 kB)
models/binary_class/config.json
ADDED
@@ -0,0 +1,36 @@
{
  "architectures": [
    "ViTForImageClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "auto_map": {
    "AutoModelForImageClassification": "transformers.ViTForImageClassification"
  },
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "non-food",
    "1": "food"
  },
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "food": 1,
    "non-food": 0
  },
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "problem_type": "single_label_classification",
  "qkv_bias": true,
  "torch_dtype": "float32",
  "transformers_version": "4.55.2"
}
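The id2label/label2id block above is what the new predict() relies on when it reads the binary softmax, falling back to index 1 via label2id.get("food", 1). A quick check that the mapping loads as expected, assuming the file is saved under ./models/binary_class as in this commit:

# confirm the binary head's label mapping from the config above
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("./models/binary_class")
print(cfg.label2id)     # {'food': 1, 'non-food': 0}
print(cfg.id2label[1])  # 'food' -- the index predict() reads the softmax at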
models/{training_args.bin → binary_class/training_args.bin}
RENAMED
Binary files a/models/training_args.bin and b/models/binary_class/training_args.bin differ

models/{config.json → multi_class/config.json}
RENAMED
File without changes

models/multi_class/training_args.bin
ADDED
Binary file (5.78 kB)
models/pre_processor.py
DELETED
@@ -1,111 +0,0 @@
from typing import Any, Dict, List
import torch
from PIL import Image as PILImage
from torchvision.transforms import v2
from transformers import BaseImageProcessor
from collections.abc import Mapping

class CustomVisionImageProcessorV2(BaseImageProcessor):
    """
    ViT-B/16-224 preprocessing for Huggingface datasets.
    Works with:
        - dataset.map(..., batched=True/False)
        - dataset.set_transform
    """
    # run-time hint used by the Hugging-Face Trainer / pipeline / collator to decide which fields to feed into the model’s forward pass
    model_input_names = ["pixel_values"]

    def __init__(self, size: int = 224, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), train: bool = True, **kwargs):
        super().__init__(**kwargs)
        self.size = size
        self.mean = mean
        self.std = std
        self.train = train
        # self.auto_map = {  # add an entry in the processor config file for auto registring the processor
        #     "AutoImageProcessor": "image_processor.CustomVisionImageProcessorV2",
        # }

    # Create transform lazily so it is NOT serialised when we save using ImageProcessor.save_pretrained
    @property
    def transform(self):
        if not hasattr(self, "_transform"):
            if self.train:
                self._transform = v2.Compose(
                    [
                        v2.ToImage(),
                        v2.RandomResizedCrop(
                            size=(self.size, self.size),
                            scale=(0.8, 1.0),
                            antialias=True,
                        ),
                        v2.RandomHorizontalFlip(p=0.5),
                        v2.ToDtype(torch.float32, scale=True),
                        v2.Normalize(mean=self.mean, std=self.std),
                    ]
                )
            else:
                self._transform = v2.Compose(
                    [
                        v2.ToImage(),
                        v2.Resize(size=(self.size, self.size), antialias=True),
                        v2.CenterCrop(size=(self.size, self.size)),
                        v2.ToDtype(torch.float32, scale=True),
                        v2.Normalize(mean=self.mean, std=self.std),
                    ]
                )
        return self._transform

    def __call__(self,
                 examples: PILImage.Image | List[PILImage.Image] | Dict[str, Any],
                 image_feature_name: str = "image",
                 label_feature_name: str = "label") -> Dict[str, torch.Tensor]:
        """
        Accepts:
            * a single PIL.Image
            * a list[PIL.Image]
            * a dict with keys:
                'image' : PIL.Image or list[PIL.Image]
                'label' : single int or list[int] (optional)

        Returns:
            dict with:
                'pixel_values' : tensor (C, H, W) or (N, C, H, W)
                'labels'       : tensor (long) or (N,)
        """
        if isinstance(examples, PILImage.Image):    # single PIL image → wrap into list
            images, labels = [examples], None
        elif isinstance(examples, list):            # list of PIL images
            images, labels = examples, None
        elif isinstance(examples, Mapping):         # dict (single example or LazyBatch)
            images = examples[image_feature_name]
            labels = examples.get(label_feature_name)
            if isinstance(images, PILImage.Image):  # single example
                images = [images]
                labels = [labels] if labels is not None else None
            # else images is already a list (LazyBatch)
        else:
            raise TypeError(f"Expected PIL.Image, list[PIL.Image] or dict, got {type(examples)}")

        pixel_values = torch.stack([self.transform(img) for img in images])
        if pixel_values.shape[0] == 1:  # squeeze singleton batch dimension when we only processed one image
            pixel_values = pixel_values.squeeze(0)

        out = {"pixel_values": pixel_values}
        if labels is not None:
            labels_tensor = torch.tensor(labels, dtype=torch.long)
            if labels_tensor.shape[0] == 1:  # squeeze singleton batch dimension when we only processed one label
                labels_tensor = labels_tensor.squeeze(0)
            out["labels"] = labels_tensor
        return out

    def to_dict(self) -> Dict[str, Any]:
        cfg = super().to_dict()
        cfg.update(
            dict(
                auto_map={
                    "AutoImageProcessor": "pre_processor.CustomVisionImageProcessorV2",
                    # "AutoImageProcessor_fast": "pre_processor.CustomVisionImageProcessorV3Fast",
                },
            )
        )
        return cfg
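With this custom torchvision-based processor deleted, preprocessing is handled by the stock ViTImageProcessor described in the updated models/preprocessor_config.json below, which is also why torchvision drops out of requirements.txt. A sketch of the replacement inference path, assuming that config is in place and using one of the new example images:

# the stock processor that takes over from the deleted class
from PIL import Image
from transformers import ViTImageProcessor

proc = ViTImageProcessor.from_pretrained("./models")  # reads models/preprocessor_config.json
batch = proc(images=Image.open("assets/img_0_0.jpg"), return_tensors="pt")
print(batch["pixel_values"].shape)  # torch.Size([1, 3, 224, 224]): resized, rescaled, normalized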
models/preprocessor_config.json
CHANGED
@@ -1,18 +1,23 @@
 {
-"
-
-
-"
-"
+  "do_convert_rgb": null,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
     0.5,
     0.5,
     0.5
   ],
-"
-"
+  "image_processor_type": "ViTImageProcessor",
+  "image_std": [
     0.5,
     0.5,
     0.5
   ],
-"
+  "resample": 2,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 224,
+    "width": 224
+  }
 }
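Two values in the new config are easy to misread: "resample": 2 is PIL's bilinear filter, and "rescale_factor" is 1/255, the same scaling the deleted transform applied via ToDtype(..., scale=True). A quick sanity check, assuming a recent Pillow:

# sanity-check the two numeric settings in the new preprocessor config
from PIL import Image

assert int(Image.Resampling.BILINEAR) == 2           # "resample": 2 -> bilinear interpolation
assert abs(0.00392156862745098 - 1 / 255) < 1e-15    # "rescale_factor" is 1/255
print("config values check out")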
requirements.txt
CHANGED
@@ -1,5 +1,4 @@
 torch==2.8.0
-torchvision==0.23.0
 gradio==5.42.0
 transformers==4.55.2
 evaluate==0.4.5