from hpsv2.src.open_clip import create_model_and_transforms, get_tokenizer
import torch
from torchvision import transforms
from PIL import Image
import os
from tqdm import tqdm

def initialize_model():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_dict = {}
    model, preprocess_train, preprocess_val = create_model_and_transforms(
        'ViT-H-14',
        '/mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/ckpt/CLIP-ViT-H-14-laion2B-s32B-b79K/pytorch_model.bin',
        precision='amp',
        device=device,
        jit=False,
        force_quick_gelu=False,
        force_custom_text=False,
        force_patch_dropout=False,
        force_image_size=None,
        pretrained_image=False,
        image_mean=None,
        image_std=None,
        light_augmentation=True,
        aug_cfg={},
        output_dict=True,
        with_score_predictor=False,
        with_region_predictor=False
    )
    model_dict['model'] = model
    model_dict['preprocess_val'] = preprocess_val
    return model_dict, device

def load_images_from_folder(folder):
    images = []
    filenames = []
    for filename in os.listdir(folder):
        if filename.endswith(".png"):
            img_path = os.path.join(folder, filename)
            image = Image.open(img_path).convert("RGB")
            images.append(image)
            filenames.append(filename)
    return images, filenames

def main():
    model_dict, device = initialize_model()
    model = model_dict['model']
    preprocess_val = model_dict['preprocess_val']

    cp = "/mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/ckpt/hps/HPS_v2.1_compressed.pt"
    checkpoint = torch.load(cp, map_location=device)
    model.load_state_dict(checkpoint['state_dict'])
    tokenizer = get_tokenizer('ViT-H-14')
    reward_model = model.to(device)
    reward_model.eval()

    img_folder = "IMAGE_SAVE_FOLDER"
    images, filenames = load_images_from_folder(img_folder)

    eval_rewards = []
    with torch.no_grad():
        for image_pil, filename in tqdm(zip(images, filenames), total=400):

            image = preprocess_val(image_pil).unsqueeze(0).to(device=device, non_blocking=True)
            prompt = os.path.splitext(filename)[0]  # 剔除文件扩展名
            text = tokenizer([prompt]).to(device=device, non_blocking=True)
            outputs = reward_model(image, text)
            image_features, text_features = outputs["image_features"], outputs["text_features"]
            logits_per_image = image_features @ text_features.T
            hps_score = torch.diagonal(logits_per_image).item()  # 转换为 Python 数值
            eval_rewards.append(hps_score)

    avg_reward = sum(eval_rewards) / len(eval_rewards) if eval_rewards else 0
    print(f"Average HPS score: {avg_reward:.4f}")

if __name__ == "__main__":
    main()