add a button to apply tags
app.py CHANGED
@@ -1,7 +1,7 @@
 import torch
 import streamlit as st
 import numpy as np
-from PIL import Image
+from PIL import Image, ImageDraw
 from transformers import pipeline
 from transformers import OwlViTProcessor, OwlViTForObjectDetection
 from tempfile import NamedTemporaryFile
@@ -10,9 +10,10 @@ audiopipe = pipeline("automatic-speech-recognition", model="openai/whisper-large
 #imagepipe = pipeline("image-classification", model="Kaludi/food-category-classification-v2.0")
 #imagepipe = pipeline("image-classification", model="nateraw/food")
 imagepipe = pipeline("image-classification", model="flatmoon102/fruits_and_vegetables_image_classification")
+detector = pipeline(model="google/owlvit-base-patch32", task="zero-shot-object-detection")
 
-processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
-model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")
+#processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
+#model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")
 
 st.title('Upload an audio file for speech recognition')
 
@@ -45,15 +46,21 @@ if uploaded_image_zero_file is not None:
 
     if st.button('apply tag'):
         tags = [['eggs', 'apple', 'pear']]
-        inputs = processor(text=tags, images=image, return_tensors="pt")
-        outputs = model(**inputs)
-        target_sizes = torch.Tensor([image.size[::-1]])
-        results = processor.post_process_object_detection(outputs=outputs, threshold=0.1, target_sizes=target_sizes)
-
-
-
-
+        #inputs = processor(text=tags, images=image, return_tensors="pt")
+        #outputs = model(**inputs)
+        #target_sizes = torch.Tensor([image.size[::-1]])
+        #results = processor.post_process_object_detection(outputs=outputs, threshold=0.1, target_sizes=target_sizes)
+        predictions = detector(
+            image,
+            candidate_labels=['eggs', 'apple', 'pear']
+        )
+        #i = 0 # Retrieve predictions for the first image for the corresponding text queries
+        #text = tags[i]
+        #boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
+        st.write(predictions)
         # Print detected objects and rescaled box coordinates
-
-
-
+        #draw = ImageDraw.Draw(image)
+        #for box, score, label in zip(boxes, scores, labels):
+        #    box = [round(i, 2) for i in box.tolist()]
+        #    print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
+
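
This commit swaps the hand-rolled OwlViTProcessor / OwlViTForObjectDetection calls for the transformers zero-shot-object-detection pipeline. A minimal sketch of how the new detector behaves on its own, assuming a local test image (the filename groceries.jpg is hypothetical):

from PIL import Image
from transformers import pipeline

# Same construction as in the commit.
detector = pipeline(model="google/owlvit-base-patch32", task="zero-shot-object-detection")

image = Image.open("groceries.jpg")  # hypothetical test image
predictions = detector(image, candidate_labels=['eggs', 'apple', 'pear'])

# For a single image the pipeline returns a flat list of dicts, e.g.
# {'score': 0.28, 'label': 'apple', 'box': {'xmin': 12, 'ymin': 34, 'xmax': 156, 'ymax': 178}}
for pred in predictions:
    print(f"Detected {pred['label']} with confidence {round(pred['score'], 3)} at {pred['box']}")

The pipeline returns plain Python types and applies a score threshold of 0.1 by default, the same value the old post_process_object_detection call passed explicitly, which is why st.write(predictions) can render the result directly.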
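The box-drawing code is kept only as comments and still refers to the old results tensors; adapting it to the pipeline's box dicts would look roughly like the sketch below (draw_predictions is a hypothetical helper, not part of the commit):

from PIL import Image, ImageDraw

def draw_predictions(image: Image.Image, predictions: list, threshold: float = 0.1) -> Image.Image:
    # Draw a labelled rectangle for every prediction above the score threshold.
    draw = ImageDraw.Draw(image)
    for pred in predictions:
        if pred['score'] < threshold:
            continue
        box = pred['box']  # dict with xmin, ymin, xmax, ymax keys
        draw.rectangle([box['xmin'], box['ymin'], box['xmax'], box['ymax']], outline="red", width=2)
        draw.text((box['xmin'], box['ymin']), f"{pred['label']} {pred['score']:.2f}", fill="red")
    return image

In the app this could replace the bare st.write(predictions) with st.image(draw_predictions(image, predictions)) so the tags are shown on the uploaded photo.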