Spaces:

lbw18601752667
/

IDMR-demo

Running

App Files Community

liubangwei committed on Mar 25

Commit

3ee7efa

1 Parent(s): d37eb96

fix retrieval

Browse files

Files changed (8) hide show

app.py +23 -16
image_embeddings.pkl +2 -2
src/__pycache__/arguments.cpython-310.pyc +0 -0
src/__pycache__/model.cpython-310.pyc +0 -0
src/vlm_backbone/intern_vl/__pycache__/modeling_internvl_chat.cpython-310.pyc +0 -0
src/vlm_backbone/intern_vl/__pycache__/processing_internvl.cpython-310.pyc +0 -0
src/vlm_backbone/llava_next/__pycache__/__init__.cpython-310.pyc +0 -0
src/vlm_backbone/llava_next/__pycache__/modeling_llava_next.cpython-310.pyc +0 -0

app.py CHANGED Viewed

@@ -22,8 +22,8 @@ def load_model():
     global IMAGE_TOKEN
     model_args = ModelArguments(
-        # model_name="/fs-computility/ai-shen/kilab-shared/liubangwei/ckpt/my_hf/IDMR-2B",
-        model_name="lbw18601752667/IDMR-2B",
         model_backbone="internvl_2_5",
     )
@@ -81,15 +81,16 @@ def get_inputs(processor, text, image_path=None, image=None):
     return inputs
 def encode_image_library(image_paths):
-    embeddings = []
     for img_path in image_paths:
         text = f"{IMAGE_TOKEN}\n Represent the given image."
         print(f"text: {text}")
         inputs = get_inputs(processor, text, image_path=img_path)
         with torch.no_grad(), torch.autocast(device_type=device, dtype=torch.bfloat16):
             output = model(tgt=inputs)
-        embeddings.append(output["tgt_reps"].float().cpu().numpy())
-    return np.stack(embeddings)
 def save_embeddings(embeddings, file_path="image_embeddings.pkl"):
     with open(file_path, "wb") as f:
@@ -115,22 +116,26 @@ def retrieve_images(query_text, query_image, top_n=TOP_N):
         image = None
     inputs = get_inputs(processor, query_text, image=image)
     print(f"inputs: {inputs}")
-    # with torch.no_grad():
     with torch.no_grad(), torch.autocast(device_type=device, dtype=torch.bfloat16):
         query_embedding = model(qry=inputs)["qry_reps"].float().cpu().numpy()
-    embeddings = load_embeddings()
     similarity = cosine_similarity(query_embedding, embeddings)
     similarity = similarity.T
     print(f"cosine_similarity: {similarity}")
     top_indices = np.argsort(-similarity).squeeze(0)[:top_n]
     print(f"top_indices: {top_indices}")
-    # similarity = model.compute_similarity(np.expand_dims(query_embedding.squeeze(0), axis=1), embeddings.squeeze(1))
-    # print(f"model.compute_similarity: {similarity}")
-    return [image_paths[i] for i in top_indices]
 def demo(query_text, query_image):
     # print(f"query_text: {query_text}, query_image: {query_image}, type(query_image): {type(query_image)}, image shape: {query_image.shape if query_image is not None else 'None'}")
@@ -157,13 +162,15 @@ def load_examples():
 iface = gr.Interface(
     fn=demo,
-    inputs=["text", "image"],
-    outputs=gr.Gallery(label=f"Retrieved Images (Top {TOP_N})"),
     examples=load_examples(),
-    title="Multimodal Retrieval Demo",
-    description="Enter a query and upload an image to retrieve relevant images from the library. You can click on the example below to use it as a query"
 )
 if not os.path.exists("image_embeddings.pkl"):
     embeddings = encode_image_library(image_paths)
     save_embeddings(embeddings)

     global IMAGE_TOKEN
     model_args = ModelArguments(
+        model_name="/fs-computility/ai-shen/kilab-shared/liubangwei/ckpt/my_hf/IDMR-2B",
+        # model_name="lbw18601752667/IDMR-2B",
         model_backbone="internvl_2_5",
     )
     return inputs
 def encode_image_library(image_paths):
+    embeddings_dict = {}
     for img_path in image_paths:
         text = f"{IMAGE_TOKEN}\n Represent the given image."
         print(f"text: {text}")
         inputs = get_inputs(processor, text, image_path=img_path)
         with torch.no_grad(), torch.autocast(device_type=device, dtype=torch.bfloat16):
             output = model(tgt=inputs)
+        img_name = os.path.basename(img_path)
+        embeddings_dict[img_name] = output["tgt_reps"].float().cpu().numpy()
+    return embeddings_dict
 def save_embeddings(embeddings, file_path="image_embeddings.pkl"):
     with open(file_path, "wb") as f:
         image = None
     inputs = get_inputs(processor, query_text, image=image)
     print(f"inputs: {inputs}")
     with torch.no_grad(), torch.autocast(device_type=device, dtype=torch.bfloat16):
         query_embedding = model(qry=inputs)["qry_reps"].float().cpu().numpy()
+    embeddings_dict = load_embeddings()
+    img_names = []
+    embeddings = []
+    for img_name in os.listdir(IMAGE_DIR):
+        if img_name in embeddings_dict:
+            img_names.append(img_name)
+            embeddings.append(embeddings_dict[img_name])
+    embeddings = np.stack(embeddings)
     similarity = cosine_similarity(query_embedding, embeddings)
     similarity = similarity.T
     print(f"cosine_similarity: {similarity}")
     top_indices = np.argsort(-similarity).squeeze(0)[:top_n]
     print(f"top_indices: {top_indices}")
+    return [os.path.join(IMAGE_DIR, img_names[i]) for i in top_indices]
 def demo(query_text, query_image):
     # print(f"query_text: {query_text}, query_image: {query_image}, type(query_image): {type(query_image)}, image shape: {query_image.shape if query_image is not None else 'None'}")
 iface = gr.Interface(
     fn=demo,
+    inputs=[
+        gr.Textbox(placeholder="Enter your query text here...", label="Query Text"),
+        gr.Image(label="Query Image", type="numpy")
+    ],
+    outputs=gr.Gallery(label=f"Retrieved Images (Top {TOP_N})", columns=3),
     examples=load_examples(),
+    title="Instance-Driven Multi-modal Retrieval (IDMR) Demo",
+    description="Enter a query text or upload an image to retrieve relevant images from the library. You can click on the examples below to try them out."
 )
 if not os.path.exists("image_embeddings.pkl"):
     embeddings = encode_image_library(image_paths)
     save_embeddings(embeddings)

image_embeddings.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b8dcedaab4e3bcc555795f56b15a7d830b74ffc707260c3b0152ba8d99a992bd
-size 409764

 version https://git-lfs.github.com/spec/v1
+oid sha256:05be7a8cbfdc77b64e473f3d2c30c47c6e522623abb11d288fb54fad07a5f8da
+size 412525

src/__pycache__/arguments.cpython-310.pyc CHANGED Viewed

Binary files a/src/__pycache__/arguments.cpython-310.pyc and b/src/__pycache__/arguments.cpython-310.pyc differ

src/__pycache__/model.cpython-310.pyc CHANGED Viewed

Binary files a/src/__pycache__/model.cpython-310.pyc and b/src/__pycache__/model.cpython-310.pyc differ

src/vlm_backbone/intern_vl/__pycache__/modeling_internvl_chat.cpython-310.pyc CHANGED Viewed

Binary files a/src/vlm_backbone/intern_vl/__pycache__/modeling_internvl_chat.cpython-310.pyc and b/src/vlm_backbone/intern_vl/__pycache__/modeling_internvl_chat.cpython-310.pyc differ

src/vlm_backbone/intern_vl/__pycache__/processing_internvl.cpython-310.pyc CHANGED Viewed

Binary files a/src/vlm_backbone/intern_vl/__pycache__/processing_internvl.cpython-310.pyc and b/src/vlm_backbone/intern_vl/__pycache__/processing_internvl.cpython-310.pyc differ

src/vlm_backbone/llava_next/__pycache__/__init__.cpython-310.pyc CHANGED Viewed

Binary files a/src/vlm_backbone/llava_next/__pycache__/__init__.cpython-310.pyc and b/src/vlm_backbone/llava_next/__pycache__/__init__.cpython-310.pyc differ

src/vlm_backbone/llava_next/__pycache__/modeling_llava_next.cpython-310.pyc CHANGED Viewed

Binary files a/src/vlm_backbone/llava_next/__pycache__/modeling_llava_next.cpython-310.pyc and b/src/vlm_backbone/llava_next/__pycache__/modeling_llava_next.cpython-310.pyc differ