sergioska committed
Commit 1098987 · 1 Parent(s): 467bcef

add a button to apply tags

Files changed (1): app.py (+21 -14)
app.py CHANGED
@@ -1,7 +1,7 @@
 import torch
 import streamlit as st
 import numpy as np
-from PIL import Image
+from PIL import Image, ImageDraw
 from transformers import pipeline
 from transformers import OwlViTProcessor, OwlViTForObjectDetection
 from tempfile import NamedTemporaryFile
@@ -10,9 +10,10 @@ audiopipe = pipeline("automatic-speech-recognition", model="openai/whisper-large
 #imagepipe = pipeline("image-classification", model="Kaludi/food-category-classification-v2.0")
 #imagepipe = pipeline("image-classification", model="nateraw/food")
 imagepipe = pipeline("image-classification", model="flatmoon102/fruits_and_vegetables_image_classification")
+detector = pipeline(model="google/owlvit-base-patch32", task="zero-shot-object-detection")
 
-processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
-model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")
+#processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
+#model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")
 
 st.title('Upload an audio file for speech recognition')
 
@@ -45,15 +46,21 @@ if uploaded_image_zero_file is not None:
 
     if st.button('apply tag'):
         tags = [['eggs', 'apple', 'pear']]
-        inputs = processor(text=tags, images=image, return_tensors="pt")
-        outputs = model(**inputs)
-        target_sizes = torch.Tensor([image.size[::-1]])
-        results = processor.post_process_object_detection(outputs=outputs, threshold=0.1, target_sizes=target_sizes)
-        i = 0 # Retrieve predictions for the first image for the corresponding text queries
-        text = tags[i]
-        boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
-        st.write(results)
+        #inputs = processor(text=tags, images=image, return_tensors="pt")
+        #outputs = model(**inputs)
+        #target_sizes = torch.Tensor([image.size[::-1]])
+        #results = processor.post_process_object_detection(outputs=outputs, threshold=0.1, target_sizes=target_sizes)
+        predictions = detector(
+            image,
+            candidate_labels=['eggs', 'apple', 'pear']
+        )
+        #i = 0 # Retrieve predictions for the first image for the corresponding text queries
+        #text = tags[i]
+        #boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
+        st.write(predictions)
         # Print detected objects and rescaled box coordinates
-        for box, score, label in zip(boxes, scores, labels):
-            box = [round(i, 2) for i in box.tolist()]
-            print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
+        #draw = ImageDraw.Draw(image)
+        #for box, score, label in zip(boxes, scores, labels):
+        #    box = [round(i, 2) for i in box.tolist()]
+        #    print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
+
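For comparison, a cleaned-up sketch of the raw processor/model path this commit comments out; it spells out the pre- and post-processing the pipeline performs internally (the `threshold=0.1` value comes from the old code, and `image` is again assumed to be the app's PIL image):

```python
# Sketch of the raw OwlViT path, equivalent to the pipeline call but with
# explicit pre- and post-processing. Assumes `image` is a PIL.Image.
import torch
from transformers import OwlViTProcessor, OwlViTForObjectDetection

processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")

tags = [["eggs", "apple", "pear"]]  # one list of text queries per image
inputs = processor(text=tags, images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Rescale the normalized boxes back to the image's (height, width)
target_sizes = torch.tensor([image.size[::-1]])
results = processor.post_process_object_detection(
    outputs=outputs, threshold=0.1, target_sizes=target_sizes
)

text = tags[0]  # queries for the first (and only) image
for box, score, label in zip(results[0]["boxes"], results[0]["scores"], results[0]["labels"]):
    box = [round(v, 2) for v in box.tolist()]
    print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
```

The pipeline route keeps app.py shorter and handles rescaling itself; the raw route is useful when you want the model outputs directly or a different threshold per call.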