|
|
import math |
|
|
from PIL import Image |
|
|
from sentence_transformers import SentenceTransformer |
|
|
|
|
|
def resize_image_for_patch(image: Image.Image, patch_size: int = 14, max_patches: int = 400) -> Image.Image:
    """Resize *image* onto a patch grid that fits within a patch budget.

    The target size depends only on the image's aspect ratio, ``patch_size``
    and ``max_patches`` (not on the original resolution), so small images are
    scaled up and large images are scaled down. Both output sides are
    multiples of ``patch_size`` and the number of patches
    ``(width // patch_size) * (height // patch_size)`` never exceeds
    ``max_patches``. The aspect ratio is preserved only approximately because
    each side is snapped down to the patch grid.

    Args:
        image: Source image; only ``image.size`` and ``image.resize`` are used.
        patch_size: Edge length in pixels of one square patch.
        max_patches: Upper bound on the number of patches in the result.

    Returns:
        A new image resized with bicubic resampling.

    Raises:
        ValueError: If ``patch_size`` or ``max_patches`` is smaller than 1, or
            if the image has a zero-length side.
    """
    if patch_size < 1:
        raise ValueError(f"patch_size must be >= 1, got {patch_size}")
    if max_patches < 1:
        raise ValueError(f"max_patches must be >= 1, got {max_patches}")

    orig_width, orig_height = image.size
    if orig_width < 1 or orig_height < 1:
        raise ValueError(f"image has an empty dimension: {orig_width}x{orig_height}")

    aspect_ratio = orig_width / orig_height

    # Size the longer side first: with w_patches * h_patches ~= max_patches and
    # w_patches / h_patches ~= aspect_ratio, the longer side holds
    # sqrt(max_patches * ratio) patches (ratio taken as long / short).
    if aspect_ratio >= 1:
        target_width = patch_size * math.floor(math.sqrt(max_patches * aspect_ratio))
        target_height = int(target_width / aspect_ratio)
    else:
        target_height = patch_size * math.floor(math.sqrt(max_patches / aspect_ratio))
        target_width = int(target_height * aspect_ratio)

    # Snap both sides down to whole patches. For very elongated images the
    # short side would otherwise round down to zero patches, which is not a
    # valid resize target, so keep at least one patch per side.
    width_patches = max(1, target_width // patch_size)
    height_patches = max(1, target_height // patch_size)

    # Clamping the short side up (or float rounding in the sqrt) can push the
    # grid over budget; trim the longer side so the limit still holds.
    if width_patches * height_patches > max_patches:
        if width_patches >= height_patches:
            width_patches = max(1, int(max_patches // height_patches))
        else:
            height_patches = max(1, int(max_patches // width_patches))

    new_size = (width_patches * patch_size, height_patches * patch_size)
    return image.resize(new_size, Image.BICUBIC)
|
|
|
|
|
|
|
|
# Load the SentenceTransformer model from the path "./" and place it on the
# CUDA device (this script therefore needs a CUDA-capable GPU).
model = SentenceTransformer("./", device="cuda")

# Open the source picture in a context manager so its file handle is closed as
# soon as the resized copy exists. `resize` returns a new, independent image,
# so `images` stays usable after the file is closed.
with Image.open("/home/aki0421/Share/images/00085.png") as source_image:
    images = [resize_image_for_patch(source_image)]

image_embeddings = model.encode(images, convert_to_tensor=True)

# Japanese candidate descriptions (emotion, hair/eye colour, weather, location,
# and a few full-scene captions) to be scored against the image.
sentences = [
    "女の子が悲しんでいる。",
    "落ち込んでる人",
    "泣いている",
    "笑っている",
    "ピンクの髪の女の子",
    "赤い髪の女の子",
    "茶色の髪の女の子",
    "赤い目",
    "青い目",
    "曇っている",
    "雨が降っている",
    "晴れている",
    "キッチンにいます。",
    "学校にいる",
    "魔法少女のようだ",
    "戦闘しますか?",
    "男性ですか?",
    "茶色い髪の女の子が悲しんでいるシーン",
    "ピンクの髪の女の子が笑っているシーン",
]

text_embeddings = model.encode(sentences, convert_to_tensor=True)

# Score every sentence embedding against the image embedding using the model's
# own similarity function, then print the resulting tensor.
similarities = model.similarity(text_embeddings, image_embeddings)

print(similarities)