cp524 committed on
Commit
b49998b
·
1 Parent(s): 3e0672b

Fix device load race condition bug

Browse files
src/smc/inference.py CHANGED
@@ -24,8 +24,8 @@ MIN_GPU_DURATION = 60
24
 
25
 
26
  pipe_build_lock = threading.Lock()
27
- pipe_load_lock = threading.Lock()
28
- reward_model_load_lock = threading.Lock()
29
  lora_load_lock = threading.Lock()
30
 
31
 
@@ -83,11 +83,11 @@ def _get_pretrained_duration(config: PretrainedInferenceConfig, pipe: Pipeline,
83
  def infer_pretrained_with_pipe(config: PretrainedInferenceConfig, pipe: Pipeline, device='cpu'):
84
  if isinstance(device, str):
85
  device = torch.device(device)
86
- with pipe_load_lock:
87
  pipe = pipe.to(device)
88
  reward_bias = 5.0
89
- with reward_model_load_lock:
90
- reward_fn, reward_name = rewards.ImageReward_Fk_Steering(device=device, bias=reward_bias), "image_reward_plus_5"
91
  image_reward_fn = lambda images: reward_fn(
92
  images,
93
  [config.prompt] * len(images)
@@ -174,11 +174,11 @@ def _get_smc_grad_duration(config: SMCGradInferenceConfig, pipe: Pipeline, devic
174
  def infer_smc_grad_with_pipe(config: SMCGradInferenceConfig, pipe: Pipeline, device='cpu'):
175
  if isinstance(device, str):
176
  device = torch.device(device)
177
- with pipe_load_lock:
178
  pipe = pipe.to(device)
179
  reward_bias = 5.0
180
- with reward_model_load_lock:
181
- reward_fn, reward_name = rewards.ImageReward_Fk_Steering(device=device, bias=reward_bias), "image_reward_plus_5"
182
  image_reward_fn = lambda images: reward_fn(
183
  images,
184
  [config.prompt] * len(images)
@@ -240,13 +240,13 @@ def _get_ft_duration(config: FTInferenceConfig, pipe: Pipeline, device='cpu') ->
240
  def infer_ft_with_pipe(config: FTInferenceConfig, pipe: Pipeline, device='cpu'):
241
  if isinstance(device, str):
242
  device = torch.device(device)
243
- with pipe_load_lock:
244
  pipe = pipe.to(device)
245
  with lora_load_lock:
246
  load_lora_weights(pipe, config.ckpt_uuid)
247
  reward_bias = 5.0
248
- with reward_model_load_lock:
249
- reward_fn, reward_name = rewards.ImageReward_Fk_Steering(device=device, bias=reward_bias), "image_reward_plus_5"
250
  image_reward_fn = lambda images: reward_fn(
251
  images,
252
  [config.prompt] * len(images)
 
24
 
25
 
26
  pipe_build_lock = threading.Lock()
27
+ reward_model_build_lock = threading.Lock()
28
+ device_load_lock = threading.Lock()
29
  lora_load_lock = threading.Lock()
30
 
31
 
 
83
  def infer_pretrained_with_pipe(config: PretrainedInferenceConfig, pipe: Pipeline, device='cpu'):
84
  if isinstance(device, str):
85
  device = torch.device(device)
86
+ with device_load_lock:
87
  pipe = pipe.to(device)
88
  reward_bias = 5.0
89
+ with reward_model_build_lock:
90
+ reward_fn, reward_name = rewards.ImageReward_Fk_Steering(device=device, device_load_lock=device_load_lock, bias=reward_bias), "image_reward_plus_5"
91
  image_reward_fn = lambda images: reward_fn(
92
  images,
93
  [config.prompt] * len(images)
 
174
  def infer_smc_grad_with_pipe(config: SMCGradInferenceConfig, pipe: Pipeline, device='cpu'):
175
  if isinstance(device, str):
176
  device = torch.device(device)
177
+ with device_load_lock:
178
  pipe = pipe.to(device)
179
  reward_bias = 5.0
180
+ with reward_model_build_lock:
181
+ reward_fn, reward_name = rewards.ImageReward_Fk_Steering(device=device, device_load_lock=device_load_lock, bias=reward_bias), "image_reward_plus_5"
182
  image_reward_fn = lambda images: reward_fn(
183
  images,
184
  [config.prompt] * len(images)
 
240
  def infer_ft_with_pipe(config: FTInferenceConfig, pipe: Pipeline, device='cpu'):
241
  if isinstance(device, str):
242
  device = torch.device(device)
243
+ with device_load_lock:
244
  pipe = pipe.to(device)
245
  with lora_load_lock:
246
  load_lora_weights(pipe, config.ckpt_uuid)
247
  reward_bias = 5.0
248
+ with reward_model_build_lock:
249
+ reward_fn, reward_name = rewards.ImageReward_Fk_Steering(device=device, device_load_lock=device_load_lock, bias=reward_bias), "image_reward_plus_5"
250
  image_reward_fn = lambda images: reward_fn(
251
  images,
252
  [config.prompt] * len(images)
src/smc/rewards.py CHANGED
@@ -155,13 +155,14 @@ def ImageReward(
155
 
156
  def ImageReward_Fk_Steering(
157
  inference_dtype=None,
158
- device=None,
 
159
  return_loss=False,
160
  bias=None,
161
  ):
162
  from src.smc.scorers.image_reward_utils import rm_load
163
 
164
- scorer = rm_load("ImageReward-v1.0", device=device)
165
 
166
  if not return_loss:
167
  def _fn(images, prompts):
 
155
 
156
  def ImageReward_Fk_Steering(
157
  inference_dtype=None,
158
+ device=None,
159
+ device_load_lock=None,
160
  return_loss=False,
161
  bias=None,
162
  ):
163
  from src.smc.scorers.image_reward_utils import rm_load
164
 
165
+ scorer = rm_load("ImageReward-v1.0", device=device, device_load_lock=device_load_lock)
166
 
167
  if not return_loss:
168
  def _fn(images, prompts):
src/smc/scorers/image_reward_utils.py CHANGED
@@ -261,6 +261,7 @@ class IRSMC(nn.Module):
261
  def rm_load(
262
  name: str = "ImageReward-v1.0",
263
  device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu",
 
264
  download_root: str = None,
265
  med_config: str = None,
266
  ):
@@ -303,7 +304,12 @@ def rm_load(
303
  download_root or os.path.expanduser("~/.cache/ImageReward"),
304
  )
305
 
306
- model = IRSMC(device=device, med_config=med_config).to(device)
 
 
 
 
 
307
  msg = model.load_state_dict(state_dict, strict=False)
308
  print("checkpoint loaded")
309
  model.eval()
 
261
  def rm_load(
262
  name: str = "ImageReward-v1.0",
263
  device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu",
264
+ device_load_lock=None,
265
  download_root: str = None,
266
  med_config: str = None,
267
  ):
 
304
  download_root or os.path.expanduser("~/.cache/ImageReward"),
305
  )
306
 
307
+ model = IRSMC(device=device, med_config=med_config)
308
+ if device_load_lock is not None:
309
+ with device_load_lock:
310
+ model = model.to(device)
311
+ else:
312
+ model = model.to(device)
313
  msg = model.load_state_dict(state_dict, strict=False)
314
  print("checkpoint loaded")
315
  model.eval()