|
|
import cv2 |
|
|
import inference |
|
|
from inference.core.utils.postprocess import cosine_similarity |
|
|
|
|
|
from inference.models import Clip |
|
|
# Text description that every incoming frame is scored against.
prompt = "an ace of spades playing card"

# One CLIP model instance is shared: it embeds the prompt here and is also
# handed to the stream below as the per-frame model.
clip = Clip()

# Embed the prompt once at start-up so the per-frame callback only has to
# compare embeddings instead of re-encoding the text on every frame.
text_embedding = clip.embed_text(prompt)
|
|
|
|
|
def _put_outlined_text(image, text, origin, scale, outline_thickness, fill_thickness):
    """Draw ``text`` on ``image`` twice so it appears as outlined lettering.

    A thick white pass is drawn first and a thinner coloured pass is drawn on
    top of it at the same origin, which keeps the text legible over any
    background. ``image`` is modified in place by ``cv2.putText``.
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    cv2.putText(image, text, origin, font, scale, (255, 255, 255), outline_thickness)
    cv2.putText(image, text, origin, font, scale, (206, 6, 103), fill_thickness)


def render(result, image):
    """Annotate a frame with the prompt-match score and display it.

    Computes the cosine similarity between ``result["embeddings"][0]`` and the
    first entry of the module-level ``text_embedding``, rescales it to a
    0-100 percentage, draws that percentage and the module-level ``prompt``
    onto ``image``, and shows the frame in an OpenCV window named "CLIP".

    Args:
        result: Prediction mapping; only ``result["embeddings"][0]`` is read.
        image: Frame to draw on; it is modified in place.
            NOTE(review): the pixel coordinates and font scales used below
            presumably assume a roughly 1080p frame -- confirm against the
            stream's resolution.
    """
    similarity = cosine_similarity(result["embeddings"][0], text_embedding[0])

    # Stretch the raw similarity band onto 0..1, clamp it, and express it as a
    # percentage. The bounds were previously held in a tuple named ``range``,
    # which shadowed the builtin; they are plain locals now.
    # NOTE(review): 0.15 / 0.40 look like empirically chosen limits for the
    # raw scores -- confirm before tuning.
    lower, upper = 0.15, 0.40
    similarity = (similarity - lower) / (upper - lower)
    similarity = max(min(similarity, 1), 0) * 100

    # Large percentage read-out.
    text = f"{similarity:.1f}%"
    _put_outlined_text(image, text, (10, 310), 12, 30, 16)

    # Smaller caption showing the prompt being matched.
    _put_outlined_text(image, prompt, (20, 1050), 2, 10, 5)

    cv2.imshow("CLIP", image)
    # waitKey gives the OpenCV GUI a chance to process events so the window
    # is actually repainted; 1 ms keeps the callback from stalling the stream.
    cv2.waitKey(1)
|
|
|
|
|
|
|
|
# Start the stream: each prediction produced by the CLIP model is passed to
# ``render`` for drawing and display.
stream_options = {
    "source": "webcam",
    "model": clip,
    # NOTE(review): BGR presumably chosen to match the channel order OpenCV's
    # drawing/display calls in ``render`` expect -- confirm.
    "output_channel_order": "BGR",
    # NOTE(review): presumably required so the OpenCV window calls in the
    # callback run on the main thread -- confirm against the inference docs.
    "use_main_thread": True,
    "on_prediction": render,
}
inference.Stream(**stream_options)
|
|
|