Add reward fns
- src/smc/rewards.py +241 -0
- src/smc/scorers/ImageReward_scorer.py +73 -0
- src/smc/scorers/PickScore_scorer.py +41 -0
- src/smc/scorers/__init__.py +0 -0
- src/smc/scorers/aesthetic_scorer.py +54 -0
- src/smc/scorers/clip_scorer.py +41 -0
- src/smc/scorers/hpsv2_scorer.py +56 -0
- src/smc/scorers/image_reward_utils.py +311 -0
src/smc/rewards.py
ADDED
@@ -0,0 +1,241 @@
from PIL import Image
import torch
from importlib import resources

ASSETS_PATH = resources.files("assets")


def jpeg_compressibility(inference_dtype=None, device=None):
    import io
    import numpy as np

    def loss_fn(images):
        if images.min() < 0:  # normalize unnormalized images
            images = ((images / 2) + 0.5).clamp(0, 1)
        if isinstance(images, torch.Tensor):
            images = (images * 255).round().clamp(0, 255).to(torch.uint8).cpu().numpy()
            images = images.transpose(0, 2, 3, 1)  # NCHW -> NHWC
        images = [Image.fromarray(image) for image in images]
        buffers = [io.BytesIO() for _ in images]
        for image, buffer in zip(images, buffers):
            image.save(buffer, format="JPEG", quality=95)
        sizes = [buffer.tell() / 1000 for buffer in buffers]
        loss = torch.tensor(sizes, dtype=inference_dtype, device=device)
        rewards = -1 * loss

        return loss, rewards

    return loss_fn


def clip_score(
    inference_dtype=None,
    device=None,
    return_loss=False,
):
    from src.smc.scorers.clip_scorer import CLIPScorer

    scorer = CLIPScorer(dtype=torch.float32, device=device)
    scorer.requires_grad_(False)

    if not return_loss:
        def _fn(images, prompts):
            if images.min() < 0:  # normalize unnormalized images
                images = ((images / 2) + 0.5).clamp(0, 1)
            scores = scorer(images, prompts)
            return scores

        return _fn

    else:
        def loss_fn(images, prompts):
            if images.min() < 0:  # normalize unnormalized images
                images = ((images / 2) + 0.5).clamp(0, 1)
            scores = scorer(images, prompts)

            loss = -scores
            return loss, scores

        return loss_fn


def aesthetic_score(
    torch_dtype=None,
    aesthetic_target=None,
    grad_scale=0,
    device=None,
    return_loss=False,
):
    from src.smc.scorers.aesthetic_scorer import AestheticScorer

    scorer = AestheticScorer(dtype=torch.float32, device=device)
    scorer.requires_grad_(False)

    if not return_loss:
        def _fn(images, prompts):
            if images.min() < 0:  # normalize unnormalized images
                images = ((images / 2) + 0.5).clamp(0, 1)
            scores = scorer(images)
            return scores

        return _fn

    else:
        def loss_fn(images, prompts):
            if images.min() < 0:  # normalize unnormalized images
                images = ((images / 2) + 0.5).clamp(0, 1)
            scores = scorer(images)

            if aesthetic_target is None:  # default maximization
                loss = -1 * scores
            else:
                # using L1 to keep on same scale
                loss = abs(scores - aesthetic_target)
            return loss * grad_scale, scores

        return loss_fn


def hps_score(
    inference_dtype=None,
    device=None,
    return_loss=False,
):
    from src.smc.scorers.hpsv2_scorer import HPSv2Scorer

    scorer = HPSv2Scorer(dtype=torch.float32, device=device)
    scorer.requires_grad_(False)

    if not return_loss:
        def _fn(images, prompts):
            if images.min() < 0:  # normalize unnormalized images
                images = ((images / 2) + 0.5).clamp(0, 1)
            scores = scorer(images, prompts)
            return scores

        return _fn

    else:
        def loss_fn(images, prompts):
            if images.min() < 0:  # normalize unnormalized images
                images = ((images / 2) + 0.5).clamp(0, 1)
            scores = scorer(images, prompts)

            loss = 1.0 - scores
            return loss, scores

        return loss_fn


def ImageReward(
    inference_dtype=None,
    device=None,
    return_loss=False,
):
    from src.smc.scorers.ImageReward_scorer import ImageRewardScorer

    scorer = ImageRewardScorer(dtype=torch.float32, device=device)
    scorer.requires_grad_(False)

    if not return_loss:
        def _fn(images, prompts):
            if images.min() < 0:  # normalize unnormalized images
                images = ((images / 2) + 0.5).clamp(0, 1)
            scores = scorer(images, prompts)
            return scores

        return _fn

    else:
        def loss_fn(images, prompts):
            if images.min() < 0:  # normalize unnormalized images
                images = ((images / 2) + 0.5).clamp(0, 1)
            scores = scorer(images, prompts)

            loss = -scores
            return loss, scores

        return loss_fn


def ImageReward_Fk_Steering(
    inference_dtype=None,
    device=None,
    return_loss=False,
    bias=None,
):
    from src.smc.scorers.image_reward_utils import rm_load

    scorer = rm_load("ImageReward-v1.0")

    if not return_loss:
        def _fn(images, prompts):
            if images.min() < 0:  # normalize unnormalized images
                images = ((images / 2) + 0.5).clamp(0, 1)
            scores = scorer.score_batched(prompts, images)
            if bias:
                scores += bias
            return scores

        return _fn

    else:
        def loss_fn(images, prompts):
            if images.min() < 0:  # normalize unnormalized images
                images = ((images / 2) + 0.5).clamp(0, 1)
            scores = scorer.score_batched(prompts, images)

            loss = -scores
            return loss, scores

        return loss_fn


def PickScore(
    inference_dtype=None,
    device=None,
    return_loss=False,
):
    from src.smc.scorers.PickScore_scorer import PickScoreScorer

    scorer = PickScoreScorer(dtype=torch.float32, device=device)
    scorer.requires_grad_(False)

    if not return_loss:
        def _fn(images, prompts):
            # from src.plot_utils import save_batch_images
            # save_batch_images(images, "output_SMC")
            if images.min() < 0:  # normalize unnormalized images
                images = ((images / 2) + 0.5).clamp(0, 1)
            scores = scorer(images, prompts)
            return scores

        return _fn

    else:
        def loss_fn(images, prompts):
            if images.min() < 0:  # normalize unnormalized images
                images = ((images / 2) + 0.5).clamp(0, 1)
            scores = scorer(images, prompts)

            loss = -scores
            return loss, scores

        return loss_fn


def color_match_reward(x: torch.Tensor, target_color: torch.Tensor) -> torch.Tensor:
    """
    Reward images whose *mean* RGB comes close to a given target color.

    Args:
        x            : [B, 3, H, W] float images (e.g. in [0,1] or [0,255])
        target_color : [3] float tensor with your desired RGB mean

    Returns:
        reward : [B] higher when image mean-color ≈ target_color
    """
    B, C, H, W = x.shape
    # compute per-image mean color vector [B, 3]
    mean_color = x.view(B, C, -1).mean(dim=2)

    # squared distance in RGB space
    dist2 = (mean_color - target_color[None, :].to(x.device)).pow(2).sum(dim=1)

    # negative distance = higher reward for closer color
    return -dist2
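Reviewer note, not part of the commit: a minimal usage sketch for these factories. The device, image sizes, and prompts below are illustrative; it assumes the src package is importable and the scorer checkpoints can be downloaded. The prompt-conditioned factories return a plain score function, or a (loss, scores) function when return_loss=True; jpeg_compressibility always returns (loss, rewards).

# usage sketch (illustrative inputs, assumes model downloads succeed)
import torch
from src.smc.rewards import jpeg_compressibility, clip_score

device = "cuda" if torch.cuda.is_available() else "cpu"

# fake batch: 2 RGB images in [-1, 1], as a diffusion decoder would emit
images = torch.rand(2, 3, 512, 512, device=device) * 2 - 1
prompts = ["a photo of a corgi", "a watercolor of a lighthouse"]

jpeg_fn = jpeg_compressibility(inference_dtype=torch.float32, device=device)
jpeg_loss, jpeg_rewards = jpeg_fn(images)        # both shape [2]

clip_fn = clip_score(device=device, return_loss=False)
clip_scores = clip_fn(images, prompts)           # shape [2]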
src/smc/scorers/ImageReward_scorer.py
ADDED
@@ -0,0 +1,73 @@
import os
import torch
import torch.nn as nn
from transformers import CLIPProcessor
from ImageReward.models.BLIP.blip_pretrain import BLIP_Pretrain
from ImageReward import ImageReward_download


class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(768, 1024),
            nn.Dropout(0.2),
            nn.Linear(1024, 128),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.Dropout(0.1),
            nn.Linear(64, 16),
            nn.Linear(16, 1),
        )

    @torch.no_grad()
    def forward(self, embed):
        return self.layers(embed)


class ImageRewardScorer(nn.Module):
    def __init__(self, dtype, device):
        super().__init__()
        self.dtype = dtype
        self.device = device

        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

        download_root = "/vol/bitbucket/cp524/cache/ImageReward"
        config_path = ImageReward_download("https://huggingface.co/THUDM/ImageReward/blob/main/med_config.json", download_root)
        model_path = ImageReward_download("https://huggingface.co/THUDM/ImageReward/blob/main/ImageReward.pt", download_root)
        # config_path = os.path.join(download_root, "med_config.json")
        # model_path = os.path.join(download_root, "ImageReward.pt")

        self.blip = BLIP_Pretrain(image_size=224, vit='large', med_config=config_path).to(self.device, dtype=self.dtype)
        self.mlp = MLP().to(self.device, dtype=self.dtype)

        state_dict = torch.load(model_path, map_location=self.device)
        self.load_state_dict(state_dict, strict=False)
        self.eval()

    @torch.no_grad()
    def __call__(self, images, prompts):
        images = (images * 255).round().clamp(0, 255).to(torch.uint8)
        inputs = self.processor(images=images, return_tensors="pt")
        inputs = {k: v.to(self.dtype).to(self.device) for k, v in inputs.items()}["pixel_values"]
        image_embeds = self.blip.visual_encoder(inputs)
        image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(self.device)
        text_input = self.blip.tokenizer(
            prompts,
            padding='max_length',
            truncation=True,
            max_length=35,
            return_tensors="pt"
        ).to(self.device)
        text_output = self.blip.text_encoder(
            text_input.input_ids,
            attention_mask=text_input.attention_mask,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_atts,
            return_dict=True,
        )
        txt_features = text_output.last_hidden_state[:, 0, :].to(dtype=self.dtype)
        scores = self.mlp(txt_features).squeeze(1)

        return scores
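Reviewer note, not part of the commit: download_root above is hardcoded to a cluster-specific path, so other machines would need to change it or fall back to the commented-out os.path.join lines. A minimal call sketch, with illustrative inputs:

import torch
from src.smc.scorers.ImageReward_scorer import ImageRewardScorer

device = "cuda" if torch.cuda.is_available() else "cpu"
scorer = ImageRewardScorer(dtype=torch.float32, device=device)

images = torch.rand(2, 3, 512, 512, device=device)   # expected in [0, 1]
prompts = ["a red bicycle leaning on a wall", "a bowl of ramen"]
scores = scorer(images, prompts)                      # shape [2], higher = better prompt alignment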
src/smc/scorers/PickScore_scorer.py
ADDED
@@ -0,0 +1,41 @@
import os
import torch
from transformers import AutoModel, CLIPProcessor
import torchvision


class PickScoreScorer(torch.nn.Module):
    def __init__(self, dtype, device):
        super().__init__()
        self.dtype = dtype
        self.device = device

        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

        checkpoint_path = "yuvalkirstain/PickScore_v1"
        # checkpoint_path = f"{os.path.expanduser('~')}/.cache/PickScore_v1"
        self.model = AutoModel.from_pretrained(checkpoint_path).eval().to(self.device, dtype=self.dtype)

        self.target_size = 224
        self.normalize = torchvision.transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                                                          std=[0.26862954, 0.26130258, 0.27577711])

    def __call__(self, images, prompts):
        text_inputs = self.processor(
            text=prompts,
            padding=True,
            truncation=True,
            max_length=77,
            return_tensors="pt",
        ).to(self.device)
        text_embeds = self.model.get_text_features(**text_inputs)
        text_embeds = text_embeds / torch.norm(text_embeds, dim=-1, keepdim=True)

        inputs = torchvision.transforms.Resize(self.target_size)(images)
        inputs = self.normalize(inputs).to(self.dtype)
        image_embeds = self.model.get_image_features(pixel_values=inputs)
        image_embeds = image_embeds / torch.norm(image_embeds, dim=-1, keepdim=True)
        logits_per_image = image_embeds @ text_embeds.T
        scores = torch.diagonal(logits_per_image)

        return scores
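Reviewer note, not part of the commit: the returned score is the cosine similarity between L2-normalized image and text embeddings; the upstream PickScore example additionally multiplies by the model's logit scale, which this scorer omits, so absolute values differ from that repo while per-prompt rankings are unchanged. A minimal call sketch, with illustrative inputs:

import torch
from src.smc.scorers.PickScore_scorer import PickScoreScorer

scorer = PickScoreScorer(dtype=torch.float32, device="cuda")
images = torch.rand(2, 3, 512, 512, device="cuda")    # expected in [0, 1]
prompts = ["an astronaut riding a horse", "a bowl of soup"]
scores = scorer(images, prompts)                       # shape [2]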
src/smc/scorers/__init__.py
ADDED
File without changes
src/smc/scorers/aesthetic_scorer.py
ADDED
@@ -0,0 +1,54 @@
from importlib import resources
import torch
import torch.nn as nn
from transformers import CLIPModel, CLIPProcessor
import torchvision

ASSETS_PATH = resources.files("assets")


class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(768, 1024),
            nn.Dropout(0.2),
            nn.Linear(1024, 128),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.Dropout(0.1),
            nn.Linear(64, 16),
            nn.Linear(16, 1),
        )

    def forward(self, embed):
        return self.layers(embed)


class AestheticScorer(nn.Module):
    def __init__(self, dtype, device):
        super().__init__()
        self.dtype = dtype
        self.device = device

        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

        self.clip = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(self.device, dtype=self.dtype)
        self.mlp = MLP().to(self.device, dtype=self.dtype)

        state_dict = torch.load(ASSETS_PATH.joinpath("sac+logos+ava1-l14-linearMSE.pth"), map_location=self.device)
        self.mlp.load_state_dict(state_dict)

        self.target_size = 224
        self.normalize = torchvision.transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                                                          std=[0.26862954, 0.26130258, 0.27577711])

        self.eval()

    def __call__(self, images):
        inputs = torchvision.transforms.Resize(self.target_size)(images)
        inputs = self.normalize(inputs).to(self.dtype)
        embed = self.clip.get_image_features(pixel_values=inputs)
        embed = embed / torch.linalg.vector_norm(embed, dim=-1, keepdim=True)

        return self.mlp(embed).squeeze(1)
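Reviewer note, not part of the commit: this scorer expects the LAION aesthetic-predictor weights (sac+logos+ava1-l14-linearMSE.pth) to ship inside the assets package. A minimal call sketch, with an illustrative batch:

import torch
from src.smc.scorers.aesthetic_scorer import AestheticScorer

scorer = AestheticScorer(dtype=torch.float32, device="cuda")
images = torch.rand(4, 3, 512, 512, device="cuda")   # expected in [0, 1]
scores = scorer(images)                               # shape [4], roughly a 1-10 aesthetic rating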
src/smc/scorers/clip_scorer.py
ADDED
@@ -0,0 +1,41 @@
import os
import torch
from transformers import CLIPProcessor, CLIPModel
import torchvision


class CLIPScorer(torch.nn.Module):
    def __init__(self, dtype, device):
        super().__init__()
        self.dtype = dtype
        self.device = device

        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

        self.model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(self.device, dtype=self.dtype)

        self.target_size = 224
        self.normalize = torchvision.transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                                                          std=[0.26862954, 0.26130258, 0.27577711])

    def __call__(self, images, prompts):
        text_inputs = self.processor(
            text=prompts,
            padding=True,
            truncation=True,
            max_length=77,
            return_tensors="pt",
        ).to(self.device)
        text_embeds = self.model.get_text_features(**text_inputs)
        text_embeds = text_embeds / torch.norm(text_embeds, dim=-1, keepdim=True)

        inputs = torchvision.transforms.Resize(self.target_size)(images)
        inputs = self.normalize(inputs).to(self.dtype)

        image_embeds = self.model.get_image_features(pixel_values=inputs)
        image_embeds = image_embeds / torch.norm(image_embeds, dim=-1, keepdim=True)
        logits_per_image = image_embeds @ text_embeds.T
        scores = torch.diagonal(logits_per_image)

        return scores
src/smc/scorers/hpsv2_scorer.py
ADDED
@@ -0,0 +1,56 @@
import os
import torch
from transformers import CLIPProcessor
import hpsv2
from hpsv2.src.open_clip import create_model_and_transforms, get_tokenizer


class HPSv2Scorer(torch.nn.Module):
    def __init__(self, dtype, device):
        super().__init__()
        self.dtype = dtype
        self.device = device

        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

        self.model, _, _ = create_model_and_transforms(
            'ViT-H-14',
            'laion2B-s32B-b79K',
            precision=self.dtype,
            device=self.device,
            jit=False,
            force_quick_gelu=False,
            force_custom_text=False,
            force_patch_dropout=False,
            force_image_size=None,
            pretrained_image=False,
            image_mean=None,
            image_std=None,
            light_augmentation=True,
            aug_cfg={},
            output_dict=True,
            with_score_predictor=False,
            with_region_predictor=False
        )

        checkpoint_path = f"{os.path.expanduser('~')}/.cache/huggingface/hub/models--xswu--HPSv2/snapshots/697403c78157020a1ae59d23f111aa58ced35b0a/HPS_v2_compressed.pt"
        # force download of model via score
        hpsv2.score([], "")
        checkpoint = torch.load(checkpoint_path, map_location=self.device)
        self.model.load_state_dict(checkpoint['state_dict'])
        self.tokenizer = get_tokenizer('ViT-H-14')
        self.model = self.model.to(self.device, dtype=self.dtype)
        self.model.eval()

    @torch.no_grad()
    def __call__(self, images, prompts):
        images = (images * 255).round().clamp(0, 255).to(torch.uint8)
        inputs = self.processor(images=images, return_tensors="pt")
        inputs = {k: v.to(self.dtype).to(self.device) for k, v in inputs.items()}["pixel_values"]
        text = self.tokenizer(prompts).to(self.device)
        outputs = self.model(inputs, text)
        image_features, text_features = outputs["image_features"], outputs["text_features"]
        logits_per_image = image_features @ text_features.T
        scores = torch.diagonal(logits_per_image)

        return scores
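Reviewer note, not part of the commit: hpsv2.score([], "") is invoked only to force the HPS_v2_compressed.pt download, and checkpoint_path is pinned to one specific hub snapshot hash, so a different cache layout or revision would require editing that path. A minimal call sketch, with illustrative inputs:

import torch
from src.smc.scorers.hpsv2_scorer import HPSv2Scorer

scorer = HPSv2Scorer(dtype=torch.float32, device="cuda")
images = torch.rand(2, 3, 512, 512, device="cuda")    # expected in [0, 1]
prompts = ["a cabin in a snowy forest", "a macro photo of a bee"]
scores = scorer(images, prompts)                       # shape [2]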
src/smc/scorers/image_reward_utils.py
ADDED
@@ -0,0 +1,311 @@
from typing import Union
import os
import torch

from PIL import Image
import ImageReward as RM


'''
@File       : ImageReward.py
@Time       : 2023/01/28 19:53:00
@Author     : Jiazheng Xu
@Contact    : xjz22@mails.tsinghua.edu.cn
@Description: ImageReward reward model.
* Based on CLIP code base and improved-aesthetic-predictor code base
* https://github.com/openai/CLIP
* https://github.com/christophschuhmann/improved-aesthetic-predictor
'''

import os
import torch
import torch.nn as nn
from PIL import Image
from ImageReward.models.BLIP.blip_pretrain import BLIP_Pretrain
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize

from torchvision.transforms.functional import pil_to_tensor

try:
    from torchvision.transforms import InterpolationMode

    BICUBIC = InterpolationMode.BICUBIC
except ImportError:
    BICUBIC = Image.BICUBIC


def _convert_image_to_rgb(image):
    return image.convert("RGB")


def _transform(n_px):
    return Compose(
        [
            Resize(n_px, interpolation=BICUBIC),
            CenterCrop(n_px),
            # _convert_image_to_rgb,
            # ToTensor(),
            Normalize(
                (0.48145466, 0.4578275, 0.40821073),
                (0.26862954, 0.26130258, 0.27577711),
            ),
        ]
    )


class MLP(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.input_size = input_size

        self.layers = nn.Sequential(
            nn.Linear(self.input_size, 1024),
            # nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(1024, 128),
            # nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            # nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, 16),
            # nn.ReLU(),
            nn.Linear(16, 1),
        )

        # initialize MLP parameters
        for name, param in self.layers.named_parameters():
            if 'weight' in name:
                nn.init.normal_(param, mean=0.0, std=1.0 / (self.input_size + 1))
            if 'bias' in name:
                nn.init.constant_(param, val=0)

    def forward(self, input):
        return self.layers(input)


class IRSMC(nn.Module):
    def __init__(self, med_config, device='cpu'):
        super().__init__()
        self.device = device

        self.blip = BLIP_Pretrain(image_size=224, vit='large', med_config=med_config)
        self.preprocess = _transform(224)
        self.mlp = MLP(768)

        self.mean = 0.16717362830052426
        self.std = 1.0333394966054072

    def score_batched_old(self, prompts, images):
        # score each (prompt, image) pair one at a time
        results = []
        for i, prompt in enumerate(prompts):
            results.append(self.score(prompt, images[i]))

        return results

    def score_gard(self, prompt_ids, prompt_attention_mask, image):
        image_embeds = self.blip.visual_encoder(image)
        # text encode cross attention with image
        image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(
            self.device
        )
        text_output = self.blip.text_encoder(
            prompt_ids,
            attention_mask=prompt_attention_mask,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_atts,
            return_dict=True,
        )

        txt_features = text_output.last_hidden_state[:, 0, :]  # (feature_dim)
        rewards = self.mlp(txt_features)
        rewards = (rewards - self.mean) / self.std

        return rewards

    def score(self, prompt, image):
        if type(image).__name__ == 'list':
            _, rewards = self.inference_rank(prompt, image)
            return rewards

        # text encode
        text_input = self.blip.tokenizer(
            prompt,
            padding='max_length',
            truncation=True,
            max_length=35,
            return_tensors="pt",
        ).to(self.device)

        # image encode
        if isinstance(image, Image.Image):
            pil_image = image
        elif isinstance(image, str) and os.path.isfile(image):
            pil_image = Image.open(image)
        else:
            raise TypeError(
                r'This image parameter type has not been supported yet. Please pass PIL.Image or file path str.'
            )

        image = self.preprocess(pil_image).unsqueeze(0).to(self.device)
        image_embeds = self.blip.visual_encoder(image)

        # text encode cross attention with image
        image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(
            self.device
        )
        text_output = self.blip.text_encoder(
            text_input.input_ids,
            attention_mask=text_input.attention_mask,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_atts,
            return_dict=True,
        )

        txt_features = text_output.last_hidden_state[:, 0, :].float()  # (feature_dim)
        rewards = self.mlp(txt_features)
        rewards = (rewards - self.mean) / self.std

        return rewards.detach().cpu().numpy().item()

    def score_batched(self, prompts, images):
        assert isinstance(prompts, list)
        assert isinstance(images, list) or isinstance(images, torch.Tensor)

        # text encode
        text_input = self.blip.tokenizer(
            prompts,
            padding='max_length',
            truncation=True,
            max_length=35,
            return_tensors="pt",
        ).to(self.device)

        # image encode
        images = [
            self.preprocess(image).unsqueeze(0).to(self.device) for image in images
        ]
        images = torch.cat(images, 0).to(torch.float32).to(self.device)

        image_embeds = self.blip.visual_encoder(images)

        # text encode cross attention with image
        image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(
            self.device
        )
        text_output = self.blip.text_encoder(
            text_input.input_ids,
            attention_mask=text_input.attention_mask,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_atts,
            return_dict=True,
        )

        txt_features = text_output.last_hidden_state[:, 0, :].float()  # (feature_dim)
        rewards = self.mlp(txt_features)
        rewards = (rewards - self.mean) / self.std

        return rewards.view(txt_features.shape[0])

    def inference_rank(self, prompt, generations_list):
        text_input = self.blip.tokenizer(
            prompt,
            padding='max_length',
            truncation=True,
            max_length=35,
            return_tensors="pt",
        ).to(self.device)
        txt_set = []
        for generation in generations_list:
            # image encode
            if isinstance(generation, Image.Image):
                pil_image = generation
            elif isinstance(generation, str):
                if os.path.isfile(generation):
                    pil_image = Image.open(generation)
            else:
                raise TypeError(
                    r'This image parameter type has not been supported yet. Please pass PIL.Image or file path str.'
                )
            image = self.preprocess(pil_image).unsqueeze(0).to(self.device)
            image_embeds = self.blip.visual_encoder(image)

            # text encode cross attention with image
            image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(
                self.device
            )
            text_output = self.blip.text_encoder(
                text_input.input_ids,
                attention_mask=text_input.attention_mask,
                encoder_hidden_states=image_embeds,
                encoder_attention_mask=image_atts,
                return_dict=True,
            )
            txt_set.append(text_output.last_hidden_state[:, 0, :])

        txt_features = torch.cat(txt_set, 0).float()  # [image_num, feature_dim]
        rewards = self.mlp(txt_features)  # [image_num, 1]
        rewards = (rewards - self.mean) / self.std
        rewards = torch.squeeze(rewards)
        _, rank = torch.sort(rewards, dim=0, descending=True)
        _, indices = torch.sort(rank, dim=0)
        indices = indices + 1

        return (
            indices.detach().cpu().numpy().tolist(),
            rewards.detach().cpu().numpy().tolist(),
        )


def rm_load(
    name: str = "ImageReward-v1.0",
    device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu",
    download_root: str = None,
    med_config: str = None,
):
    """Load an ImageReward model.

    Parameters
    ----------
    name : str
        A model name listed by `ImageReward.available_models()`, or the path to a model checkpoint containing the state_dict

    device : Union[str, torch.device]
        The device to put the loaded model on

    download_root : str
        Path to download the model files to; by default, "~/.cache/ImageReward" is used

    Returns
    -------
    model : torch.nn.Module
        The ImageReward model
    """
    if name in RM.utils._MODELS:
        model_path = RM.ImageReward_download(
            RM.utils._MODELS[name],
            download_root or os.path.expanduser("~/.cache/ImageReward"),
        )
    elif os.path.isfile(name):
        model_path = name
    else:
        raise RuntimeError(f"Model {name} not found")

    print('load checkpoint from %s' % model_path)
    state_dict = torch.load(model_path, map_location='cpu')
    # state_dict = torch.load(model_path, map_location=device)

    # med_config
    if med_config is None:
        med_config = RM.ImageReward_download(
            "https://huggingface.co/THUDM/ImageReward/blob/main/med_config.json",
            download_root or os.path.expanduser("~/.cache/ImageReward"),
        )

    model = IRSMC(device=device, med_config=med_config).to(device)
    msg = model.load_state_dict(state_dict, strict=False)
    print("checkpoint loaded")
    model.eval()
    # import pdb; pdb.set_trace()
    return model
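Reviewer note, not part of the commit: rm_load follows the loading logic of the upstream ImageReward package but returns the IRSMC wrapper above, whose score_batched accepts a batch of image tensors rather than PIL images or file paths. A minimal call sketch, with illustrative inputs:

import torch
from src.smc.scorers.image_reward_utils import rm_load

model = rm_load("ImageReward-v1.0", device="cuda")
images = torch.rand(2, 3, 512, 512, device="cuda")   # in [0, 1]; _transform only resizes/crops/normalizes
prompts = ["a plate of sushi", "a foggy mountain road at dawn"]
rewards = model.score_batched(prompts, images)        # shape [2], normalized by the stored mean/std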