uf-aice-lab committed on
Commit
cb7a916
verified
1 Parent(s): 59a8686

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +35 -18
README.md CHANGED
@@ -52,14 +52,14 @@ import torch, open_clip
52
  from PIL import Image
53
 
54
  # ---------- 0. Paths ----------
55
- root_dir = "./"
56
- clip_ckpt = f"{root_dir}/full_clip.pt"
57
- reg_ts = f"{root_dir}/reg_head.ts"
58
- pp_ckpt = f"{root_dir}/preprocess.pt"
59
 
60
- image_path = "path_to_image"
61
- text_prompt= "text_prompt_of_question_and_student_answer"
62
- device = "cuda" if torch.cuda.is_available() else "cpu"
63
 
64
  # ---------- 1. CLIP Backbone ----------
65
  clip_name = "ViT-L-14"
@@ -67,26 +67,43 @@ clip = open_clip.create_model(clip_name, device=device, pretrained=None)
67
  clip.load_state_dict(torch.load(clip_ckpt, map_location=device), strict=False)
68
  clip.eval(); clip.requires_grad_(False)
69
 
70
- # ---------- 2. Regression Head (TorchScript) ----------
71
  reg_head = torch.jit.load(reg_ts, map_location=device)
72
  reg_head.eval()
73
 
74
- # ---------- 3. Preprocess ----------
75
  preprocess = torch.load(pp_ckpt, weights_only=False)
76
 
77
- # ---------- 4. Inference ----------
78
- img = preprocess(Image.open(image_path).convert("RGB")).unsqueeze(0).to(device)
79
- tok = open_clip.tokenize([text_prompt], context_length=clip.context_length).to(device)
 
 
 
 
 
 
80
 
81
  with torch.no_grad():
82
- img_emb = torch.nn.functional.normalize(clip.encode_image(img), dim=1)
83
- txt_emb = torch.nn.functional.normalize(clip.encode_text(tok), dim=1)
 
 
 
 
 
 
 
 
 
84
 
85
- cos_sim = (img_emb @ txt_emb.T).item()
86
- score = reg_head(img_emb).item()
87
 
88
- print(f"Cosine similarity: {cos_sim:.4f}")
89
- print(f"Regression score : {score:.4f}")
 
 
90
  ```
91
 
92
  ---
 
52
  from PIL import Image
53
 
54
  # ---------- 0. Paths ----------
55
+ root_dir = "./"
56
+ clip_ckpt = f"{root_dir}/full_clip.pt"
57
+ reg_ts = f"{root_dir}/reg_head.ts"
58
+ pp_ckpt = f"{root_dir}/preprocess.pt"
59
 
60
+ image_paths = ["path_to_image1", "path_to_image2"]
61
+ text_prompts = ["prompt_q+ans_1", "prompt_q+ans_2"]
62
+ device = "cuda" if torch.cuda.is_available() else "cpu"
63
 
64
  # ---------- 1. CLIP Backbone ----------
65
  clip_name = "ViT-L-14"
 
67
  clip.load_state_dict(torch.load(clip_ckpt, map_location=device), strict=False)
68
  clip.eval(); clip.requires_grad_(False)
69
 
70
+ # ---------- 2. Regression Head ----------
71
  reg_head = torch.jit.load(reg_ts, map_location=device)
72
  reg_head.eval()
73
 
74
+ # ---------- 3. Pre-processing ----------
75
  preprocess = torch.load(pp_ckpt, weights_only=False)
76
 
77
+ # ---------- 4. Inference (batch size = 2) ----------
78
+ # 4-1. Build image & text batches
79
+ imgs = torch.stack(
80
+ [preprocess(Image.open(p).convert("RGB")) for p in image_paths]
81
+ ).to(device) # (2, 3, H, W)
82
+
83
+ toks = open_clip.tokenize(
84
+ text_prompts, context_length=clip.context_length
85
+ ).to(device) # (2, ctx_len)
86
 
87
  with torch.no_grad():
88
+ #----- Encode -----
89
+ img_emb = torch.nn.functional.normalize(clip.encode_image(imgs), dim=1) # (2, D)
90
+ txt_emb = torch.nn.functional.normalize(clip.encode_text(toks), dim=1) # (2, D)
91
+
92
+ #----- Fuse image & text into one embedding per sample -----
93
+ # Here we simply average the two L2-normalised vectors, then renormalise.
94
+ fused_emb = torch.nn.functional.normalize(img_emb + txt_emb, dim=1) # (2, D)
95
+
96
+ #----- Similarity between the two fused samples -----
97
+ # fused_emb[0] · fused_emb[1] (equivalent to (fused_emb @ fused_emb.T)[0,1])
98
+ pair_sim = (fused_emb[0] * fused_emb[1]).sum().item()
99
 
100
+ #----- Regression scores (unchanged) -----
101
+ scores = reg_head(img_emb).squeeze(-1) # (2,)
102
 
103
+ # ---------- 5. Output ----------
104
+ print(f"Cosine similarity between sample-1 and sample-2: {pair_sim:.4f}\n")
105
+ print(f"[Sample 1] Regression score: {scores[0]:.4f}")
106
+ print(f"[Sample 2] Regression score: {scores[1]:.4f}")
107
  ```
108
 
109
  ---