Update README.md
README.md (CHANGED)
````diff
@@ -52,14 +52,14 @@ import torch, open_clip
 from PIL import Image
 
 # ---------- 0. Paths ----------
-root_dir
-clip_ckpt
-reg_ts
-pp_ckpt
+root_dir = "./"
+clip_ckpt = f"{root_dir}/full_clip.pt"
+reg_ts = f"{root_dir}/reg_head.ts"
+pp_ckpt = f"{root_dir}/preprocess.pt"
 
-
-
-device
+image_paths = ["path_to_image1", "path_to_image2"]
+text_prompts = ["prompt_q+ans_1", "prompt_q+ans_2"]
+device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # ---------- 1. CLIP Backbone ----------
 clip_name = "ViT-L-14"
@@ -67,26 +67,43 @@ clip = open_clip.create_model(clip_name, device=device, pretrained=None)
 clip.load_state_dict(torch.load(clip_ckpt, map_location=device), strict=False)
 clip.eval(); clip.requires_grad_(False)
 
-# ---------- 2. Regression Head
+# ---------- 2. Regression Head ----------
 reg_head = torch.jit.load(reg_ts, map_location=device)
 reg_head.eval()
 
-# ---------- 3.
+# ---------- 3. Pre-processing ----------
 preprocess = torch.load(pp_ckpt, weights_only=False)
 
-# ---------- 4. Inference ----------
-
-
+# ---------- 4. Inference (batch size = 2) ----------
+# 4-1. Build image & text batches
+imgs = torch.stack(
+    [preprocess(Image.open(p).convert("RGB")) for p in image_paths]
+).to(device)  # (2, 3, H, W)
+
+toks = open_clip.tokenize(
+    text_prompts, context_length=clip.context_length
+).to(device)  # (2, ctx_len)
 
 with torch.no_grad():
-
-
+    # ----- Encode -----
+    img_emb = torch.nn.functional.normalize(clip.encode_image(imgs), dim=1)  # (2, D)
+    txt_emb = torch.nn.functional.normalize(clip.encode_text(toks), dim=1)   # (2, D)
+
+    # ----- Fuse image & text into one embedding per sample -----
+    # Here we simply average the two L2-normalised vectors, then renormalise.
+    fused_emb = torch.nn.functional.normalize(img_emb + txt_emb, dim=1)  # (2, D)
+
+    # ----- Similarity between the two fused samples -----
+    # fused_emb[0] · fused_emb[1] (equivalent to (fused_emb @ fused_emb.T)[0, 1])
+    pair_sim = (fused_emb[0] * fused_emb[1]).sum().item()
 
-
-
+    # ----- Regression scores (unchanged) -----
+    scores = reg_head(img_emb).squeeze(-1)  # (2,)
 
-
-print(f"
+    # ---------- 5. Output ----------
+    print(f"Cosine similarity between sample-1 and sample-2: {pair_sim:.4f}\n")
+    print(f"[Sample 1] Regression score: {scores[0]:.4f}")
+    print(f"[Sample 2] Regression score: {scores[1]:.4f}")
 ```
 
 ---
````
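The fusion comment in the new code leans on two small identities worth sanity-checking: for unit vectors, `normalize(a + b)` points in the same direction as the average `(a + b) / 2`, and an elementwise product followed by a sum is exactly the dot product, i.e. the cosine similarity. A minimal sketch that verifies both with random stand-ins for the real embeddings (the width `D = 768` is only an assumption matching ViT-L/14; any value works):

```python
# Sanity-check sketch: random unit vectors stand in for the CLIP embeddings.
# D = 768 is an assumption (ViT-L/14's joint embedding width); any D works.
import torch
import torch.nn.functional as F

D = 768
img_emb = F.normalize(torch.randn(2, D), dim=1)  # stand-in for encode_image output
txt_emb = F.normalize(torch.randn(2, D), dim=1)  # stand-in for encode_text output

# normalize(a + b) has the same direction as (a + b) / 2, so summing
# then renormalising equals the renormalised average.
fused = F.normalize(img_emb + txt_emb, dim=1)
avg = F.normalize((img_emb + txt_emb) / 2, dim=1)
assert torch.allclose(fused, avg, atol=1e-6)

# Elementwise product + sum is the dot product; for unit vectors this is
# the cosine similarity, and matches the off-diagonal of the Gram matrix.
pair_sim = (fused[0] * fused[1]).sum()
assert torch.allclose(pair_sim, (fused @ fused.T)[0, 1], atol=1e-6)
print(f"pair_sim = {pair_sim.item():.4f}")
```

Note that summing works here only because both embeddings are L2-normalised first; without that, the modality with the larger norm would dominate the fused direction.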