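# Streamlit demo combining three Hugging Face models:
#  - Whisper large-v3 for automatic speech recognition
#  - an image-classification pipeline for food / produce images
#  - OWL-ViT for zero-shot, text-conditioned object detection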
import torch
import streamlit as st
import numpy as np
from PIL import Image
from transformers import pipeline, OwlViTProcessor, OwlViTForObjectDetection
from tempfile import NamedTemporaryFile
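# Models are loaded once at import time; Streamlit re-runs this script on every
# interaction, so in a larger app these loads would be wrapped in st.cache_resource.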
audiopipe = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")
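# Alternative classifiers, kept commented out for reference: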
#imagepipe = pipeline("image-classification", model="Kaludi/food-category-classification-v2.0")
#imagepipe = pipeline("image-classification", model="nateraw/food")
imagepipe = pipeline("image-classification", model="flatmoon102/fruits_and_vegetables_image_classification")
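# OWL-ViT: an open-vocabulary detector that localizes whatever text queries it is given.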
processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")
st.title('Upload an audio file for speech recognition')
uploaded_audio_file = st.file_uploader("Choose an audio file (wav)", type=["wav"])
if uploaded_audio_file is not None:
    # The ASR pipeline expects a file path, so spill the upload to a temp file.
    # Note the leading dot in the suffix, needed to produce a valid .wav name.
    with NamedTemporaryFile(suffix=".wav") as temp:
        temp.write(uploaded_audio_file.getvalue())
        temp.seek(0)
        result = audiopipe(temp.name)
        st.write(result)
st.title('Upload an image file for classification (food)')
uploaded_image_file = st.file_uploader("Choose an image file")
if uploaded_image_file is not None:
    # The image-classification pipeline also accepts a file path; PIL detects
    # the image format from the file contents, so no suffix is needed here.
    with NamedTemporaryFile() as temp:
        temp.write(uploaded_image_file.getvalue())
        temp.seek(0)
        result = imagepipe(temp.name)
        st.write(result)
st.title('Upload an image file for zero-shot object detection')
uploaded_image_zero_file = st.file_uploader("Choose an image file (zero)")
texts = st.text_input('tags (comma-separated)')
if uploaded_image_zero_file is not None:
    image = Image.open(uploaded_image_zero_file)
    outputImage = np.array(image)
    st.image(outputImage)
    if st.button('apply tags'):
        # OWL-ViT takes one list of text queries per image in the batch;
        # strip whitespace so "cat,dog" and "cat, dog" both work.
        tags = [[t.strip() for t in texts.split(",")]]
        inputs = processor(text=tags, images=image, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)
        # Target size is (height, width), i.e. PIL's (width, height) reversed.
        target_sizes = torch.Tensor([image.size[::-1]])
        results = processor.post_process_object_detection(outputs=outputs, threshold=0.1, target_sizes=target_sizes)
        i = 0  # Retrieve predictions for the first (and only) image
        text = tags[i]  # the query list for this image, not the raw input string
        boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
        st.write(results)
        # Print detected objects and rescaled box coordinates to the server log
        for box, score, label in zip(boxes, scores, labels):
            box = [round(coord, 2) for coord in box.tolist()]
            print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")