Spaces:

lillab-demos
/

cogen

Sleeping

App Files Files Community

momergul committed on Sep 20, 2024

Commit

5f8e458

1 Parent(s): 18e7d92

Tweaked inference

Browse files

Files changed (1) hide show

app.py +40 -21

app.py CHANGED Viewed

@@ -23,7 +23,7 @@ css="""
 def initialize_game() -> List[List[str]]:
     context_dicts = [generate_complete_game() for _ in range(2)]
-    roles = ["speaker"] * 3 + ["listener"] * 3
     speaker_images = []
     listener_images = []
     targets = []
@@ -36,46 +36,64 @@ def initialize_game() -> List[List[str]]:
     return list(zip(speaker_images, listener_images, targets, roles))
-@spaces.GPU
 def get_model_response(
         model, adapter_name, processor, index_to_token, role: str,
         image_paths: List[str], user_message: str = "", target_image: str = ""
 ) -> str:
     model.model.set_adapter(adapter_name)
-    print(model.model.active_adapter)
     if role == "speaker":
         img_dir = "tangram_pngs"
         input_tokens, attn_mask, images, image_attn_mask, label = joint_speaker_input(
             processor, image_paths, target_image, model.get_listener().device
         )
-        print("Hi")
-        with torch.no_grad():
-            image_paths = [image_paths]
-            captions, _, _, _, _ = model.generate(
-                images, input_tokens, attn_mask, image_attn_mask, label,
-                image_paths, processor, img_dir, index_to_token,
-                max_steps=30, sampling_type="nucleus", temperature=0.7,
-                top_k=50, top_p=1, repetition_penalty=1, num_samples=5
-            )
-        print("There")
         response = captions[0]
     else:  # listener
         images, l_input_tokens, l_attn_mask, l_image_attn_mask, s_input_tokens, s_attn_mask, \
             s_image_attn_mask, s_target_mask, s_target_label = joint_listener_input(
                 processor, image_paths, user_message, model.get_listener().device
             )
-        with torch.no_grad():
-            # Forward
-            _, _, joint_log_probs = model.comprehension_side([
-                images, l_input_tokens, l_attn_mask, l_image_attn_mask, index_to_token,
-                s_input_tokens, s_attn_mask, s_image_attn_mask, s_target_mask, s_target_label,
-            ])
-            target_idx = joint_log_probs[0].argmax().item()
-            response = image_paths[target_idx]
     return response
 def interaction(model, processor, index_to_token, model_iteration: str) -> Tuple[List[str], List[str]]:
     image_role_pairs = initialize_game()
     conversation = []
@@ -195,6 +213,7 @@ def create_app():
         processor = get_processor()
         index_to_token = get_index_to_token()
         def start_interaction(model_iteration):
             if model_iteration is None:
                 return [], "Please select a model iteration.", "", "", "", gr.update(interactive=False), \

 def initialize_game() -> List[List[str]]:
     context_dicts = [generate_complete_game() for _ in range(2)]
+    roles = ["listener"] * 3 + ["speaker"] * 3
     speaker_images = []
     listener_images = []
     targets = []
     return list(zip(speaker_images, listener_images, targets, roles))
 def get_model_response(
         model, adapter_name, processor, index_to_token, role: str,
         image_paths: List[str], user_message: str = "", target_image: str = ""
 ) -> str:
     model.model.set_adapter(adapter_name)
     if role == "speaker":
         img_dir = "tangram_pngs"
+        print("Starting processing")
         input_tokens, attn_mask, images, image_attn_mask, label = joint_speaker_input(
             processor, image_paths, target_image, model.get_listener().device
         )
+        image_paths = [image_paths]
+        print("Starting inference")
+        captions = get_speaker_response(model, images, input_tokens, attn_mask, image_attn_mask, label, image_paths,
+                                        processor, img_dir, index_to_token)
+        print("Done")
         response = captions[0]
     else:  # listener
+        print("Starting processing")
         images, l_input_tokens, l_attn_mask, l_image_attn_mask, s_input_tokens, s_attn_mask, \
             s_image_attn_mask, s_target_mask, s_target_label = joint_listener_input(
                 processor, image_paths, user_message, model.get_listener().device
             )
+        print("Starting inference")
+        response = get_listener_response(
+            model, images, l_input_tokens, l_attn_mask, l_image_attn_mask, index_to_token,
+            s_input_tokens, s_attn_mask, s_image_attn_mask, s_target_mask, s_target_label, image_paths
+        )
+        print("Done")
     return response
+@spaces.GPU(duration=20)
+def get_speaker_response(model, images, input_tokens, attn_mask, image_attn_mask, label, image_paths, processor, img_dir, index_to_token):
+    model = model.cuda()
+    with torch.no_grad():
+        captions, _, _, _, _ = model.generate(
+            images.cuda(), input_tokens.cuda(), attn_mask.cuda(), image_attn_mask.cuda(), label.cuda(),
+            image_paths, processor, img_dir, index_to_token,
+            max_steps=30, sampling_type="nucleus", temperature=0.7,
+            top_k=50, top_p=1, repetition_penalty=1, num_samples=5
+        )
+    return captions
+@spaces.GPU(duration=20)
+def get_listener_response(model, images, l_input_tokens, l_attn_mask, l_image_attn_mask, index_to_token,
+                          s_input_tokens, s_attn_mask, s_image_attn_mask, s_target_mask, s_target_label, image_paths):
+    model = model.cuda()
+    with torch.no_grad():
+        _, _, joint_log_probs = model.comprehension_side([
+            images.cuda(), l_input_tokens.cuda(), l_attn_mask.cuda(), l_image_attn_mask.cuda(), index_to_token,
+            s_input_tokens.cuda(), s_attn_mask.cuda(), s_image_attn_mask.cuda(), s_target_mask.cuda(), s_target_label.cuda(),
+        ])
+        target_idx = joint_log_probs[0].argmax().item()
+        response = image_paths[target_idx]
+    return response
 def interaction(model, processor, index_to_token, model_iteration: str) -> Tuple[List[str], List[str]]:
     image_role_pairs = initialize_game()
     conversation = []
         processor = get_processor()
         index_to_token = get_index_to_token()
+        print("Heyo!")
         def start_interaction(model_iteration):
             if model_iteration is None:
                 return [], "Please select a model iteration.", "", "", "", gr.update(interactive=False), \