import math

from PIL import Image
from sentence_transformers import SentenceTransformer


def resize_image_for_patch(image: Image.Image, patch_size: int = 14, max_patches: int = 400) -> Image.Image:
    """Resize `image`, preserving aspect ratio, so that a grid of
    `patch_size` x `patch_size` patches covers it with at most `max_patches` patches."""
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    if aspect_ratio >= 1:
        # Landscape or square orientation
        target_width = patch_size * int(math.floor(math.sqrt(max_patches * aspect_ratio)))
        target_height = int(target_width / aspect_ratio)
    else:
        # Portrait orientation
        target_height = patch_size * int(math.floor(math.sqrt(max_patches / aspect_ratio)))
        target_width = int(target_height * aspect_ratio)

    # Ensure both dimensions are multiples of patch_size
    target_width -= target_width % patch_size
    target_height -= target_height % patch_size

    return image.resize((target_width, target_height), Image.BICUBIC)


# Init model
model = SentenceTransformer("./", device="cuda")

images = [
    resize_image_for_patch(Image.open("/home/aki0421/Share/images/00085.png"))
]
image_embeddings = model.encode(images, convert_to_tensor=True)

sentences = [
    "女の子が悲しんでいる。",                # "A girl is sad."
    "落ち込んでる人",                        # "A person who is feeling down"
    "泣いている",                            # "Crying"
    "笑っている",                            # "Smiling"
    "ピンクの髪の女の子",                    # "A girl with pink hair"
    "赤い髪の女の子",                        # "A girl with red hair"
    "茶色の髪の女の子",                      # "A girl with brown hair"
    "赤い目",                                # "Red eyes"
    "青い目",                                # "Blue eyes"
    "曇っている",                            # "It is cloudy"
    "雨が降っている",                        # "It is raining"
    "晴れている",                            # "It is sunny"
    "キッチンにいます。",                    # "In the kitchen."
    "学校にいる",                            # "At school"
    "魔法少女のようだ",                      # "Looks like a magical girl"
    "戦闘しますか?",                         # "Is there fighting?"
    "男性ですか?",                           # "Is it a man?"
    "茶色い髪の女の子が悲しんでいるシーン",  # "A scene where a brown-haired girl is sad"
    "ピンクの髪の女の子が笑っているシーン",  # "A scene where a pink-haired girl is smiling"
]
text_embeddings = model.encode(sentences, convert_to_tensor=True)

similarities = model.similarity(text_embeddings, image_embeddings)
print(similarities)
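
# Optional follow-up sketch (not part of the original script): rank the Japanese
# queries by their similarity to the single image. This assumes `similarities`
# is the (num_sentences, num_images) tensor returned by model.similarity() above.
scores = similarities[:, 0]  # similarity of every sentence to image 0
ranked = sorted(zip(sentences, scores.tolist()), key=lambda pair: pair[1], reverse=True)
for sentence, score in ranked[:5]:
    print(f"{score:.4f}  {sentence}")  # top-5 best-matching descriptions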