sergioska committed
Commit 1098987 · 1 Parent(s): 467bcef

add a button to apply tags

Files changed (1): app.py (+21 -14)
app.py CHANGED
@@ -1,7 +1,7 @@
 import torch
 import streamlit as st
 import numpy as np
-from PIL import Image
+from PIL import Image, ImageDraw
 from transformers import pipeline
 from transformers import OwlViTProcessor, OwlViTForObjectDetection
 from tempfile import NamedTemporaryFile
@@ -10,9 +10,10 @@ audiopipe = pipeline("automatic-speech-recognition", model="openai/whisper-large
 #imagepipe = pipeline("image-classification", model="Kaludi/food-category-classification-v2.0")
 #imagepipe = pipeline("image-classification", model="nateraw/food")
 imagepipe = pipeline("image-classification", model="flatmoon102/fruits_and_vegetables_image_classification")
+detector = pipeline(model="google/owlvit-base-patch32", task="zero-shot-object-detection")
 
-processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
-model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")
+#processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
+#model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")
 
 st.title('Upload an audio file for speech recognition')
 
@@ -45,15 +46,21 @@ if uploaded_image_zero_file is not None:
 
     if st.button('apply tag'):
         tags = [['eggs', 'apple', 'pear']]
-        inputs = processor(text=tags, images=image, return_tensors="pt")
-        outputs = model(**inputs)
-        target_sizes = torch.Tensor([image.size[::-1]])
-        results = processor.post_process_object_detection(outputs=outputs, threshold=0.1, target_sizes=target_sizes)
-        i = 0 # Retrieve predictions for the first image for the corresponding text queries
-        text = tags[i]
-        boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
-        st.write(results)
+        #inputs = processor(text=tags, images=image, return_tensors="pt")
+        #outputs = model(**inputs)
+        #target_sizes = torch.Tensor([image.size[::-1]])
+        #results = processor.post_process_object_detection(outputs=outputs, threshold=0.1, target_sizes=target_sizes)
+        predictions = detector(
+            image,
+            candidate_labels=['eggs', 'apple', 'pear']
+        )
+        #i = 0 # Retrieve predictions for the first image for the corresponding text queries
+        #text = tags[i]
+        #boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
+        st.write(predictions)
         # Print detected objects and rescaled box coordinates
-        for box, score, label in zip(boxes, scores, labels):
-            box = [round(i, 2) for i in box.tolist()]
-            print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
+        #draw = ImageDraw.Draw(image)
+        #for box, score, label in zip(boxes, scores, labels):
+        #    box = [round(i, 2) for i in box.tolist()]
+        #    print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
+
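For comparison, a cleaned-up sketch of the raw processor/model path this commit comments out; it spells out the pre- and post-processing the pipeline performs internally (the `threshold=0.1` value comes from the old code, and `image` is again assumed to be the app's PIL image):

```python
# Sketch of the raw OwlViT path, equivalent to the pipeline call but with
# explicit pre- and post-processing. Assumes `image` is a PIL.Image.
import torch
from transformers import OwlViTProcessor, OwlViTForObjectDetection

processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")

tags = [["eggs", "apple", "pear"]]  # one list of text queries per image
inputs = processor(text=tags, images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Rescale the normalized boxes back to the image's (height, width)
target_sizes = torch.tensor([image.size[::-1]])
results = processor.post_process_object_detection(
    outputs=outputs, threshold=0.1, target_sizes=target_sizes
)

text = tags[0]  # queries for the first (and only) image
for box, score, label in zip(results[0]["boxes"], results[0]["scores"], results[0]["labels"]):
    box = [round(v, 2) for v in box.tolist()]
    print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
```

The pipeline route keeps app.py shorter and handles rescaling itself; the raw route is useful when you want the model outputs directly or a different threshold per call.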