sdsdgwe committed on
Commit 9b57ce7 · 1 Parent(s): ef8733b
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 HPSv3 Team
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
environment.yaml ADDED
@@ -0,0 +1,223 @@
+ name: hpsv3
+ channels:
+   - nvidia
+   - conda-forge
+   - defaults
+ dependencies:
+   - _libgcc_mutex=0.1=conda_forge
+   - _openmp_mutex=4.5=2_gnu
+   - bzip2=1.0.8=h4bc722e_7
+   - ca-certificates=2025.4.26=hbd8a1cb_0
+   - ld_impl_linux-64=2.43=h712a8e2_4
+   - libexpat=2.7.0=h5888daf_0
+   - libffi=3.4.6=h2dba641_1
+   - libgcc=15.1.0=h767d61c_2
+   - libgcc-ng=15.1.0=h69a702a_2
+   - libgomp=15.1.0=h767d61c_2
+   - liblzma=5.8.1=hb9d3cd8_1
+   - libnsl=2.0.1=hd590300_0
+   - libsqlite=3.50.0=hee588c1_0
+   - libuuid=2.38.1=h0b41bf4_0
+   - libxcrypt=4.4.36=hd590300_1
+   - libzlib=1.3.1=hb9d3cd8_2
+   - ncurses=6.5=h2d0b736_3
+   - openssl=3.5.0=h7b32b05_1
+   - pip=25.1.1=pyh8b19718_0
+   - python=3.10.17=hd6af730_0_cpython
+   - readline=8.2=h8c095d6_2
+   - setuptools=80.8.0=pyhff2d567_0
+   - tk=8.6.13=noxft_hd72426e_102
+   - wheel=0.45.1=pyhd8ed1ab_1
+   - pip:
+       - absl-py==2.3.0
+       - accelerate==1.8.0
+       - aiohappyeyeballs==2.6.1
+       - aiohttp==3.12.12
+       - aiosignal==1.3.2
+       - annotated-types==0.7.0
+       - antlr4-python3-runtime==4.9.3
+       - anyio==4.9.0
+       - argon2-cffi==23.1.0
+       - argon2-cffi-bindings==21.2.0
+       - arrow==1.3.0
+       - asttokens==3.0.0
+       - async-lru==2.0.5
+       - async-timeout==5.0.1
+       - attrs==25.3.0
+       - av==14.4.0
+       - babel==2.17.0
+       - beautifulsoup4==4.13.4
+       - bleach==6.2.0
+       - botocore==1.38.35
+       - certifi==2025.4.26
+       - cffi==1.17.1
+       - charset-normalizer==3.4.2
+       - comm==0.2.2
+       - contourpy==1.3.2
+       - cycler==0.12.1
+       - datasets==3.6.0
+       - debugpy==1.8.14
+       - decorator==5.2.1
+       - deepspeed==0.15.4
+       - defusedxml==0.7.1
+       - diffusers==0.33.1
+       - dill==0.3.8
+       - docstring-parser==0.16
+       - einops==0.8.1
+       - exceptiongroup==1.3.0
+       - executing==2.2.0
+       - fastjsonschema==2.21.1
+       - filelock==3.13.1
+       - fire==0.7.0
+       - fonttools==4.58.1
+       - fqdn==1.5.1
+       - frozenlist==1.7.0
+       - fsspec==2024.6.1
+       - grpcio==1.72.1
+       - h11==0.16.0
+       - hf-xet==1.1.3
+       - hjson==3.1.0
+       - httpcore==1.0.9
+       - httpx==0.28.1
+       - huggingface-hub==0.32.4
+       - idna==3.10
+       - imageio==2.37.0
+       - importlib-metadata==8.7.0
+       - ipykernel==6.29.5
+       - ipython==8.36.0
+       - ipywidgets==8.1.7
+       - isoduration==20.11.0
+       - jedi==0.19.2
+       - jinja2==3.1.6
+       - jmespath==1.0.1
+       - json5==0.12.0
+       - jsonpointer==3.0.0
+       - jsonschema==4.24.0
+       - jsonschema-specifications==2025.4.1
+       - jupyter==1.1.1
+       - jupyter-client==8.6.3
+       - jupyter-console==6.6.3
+       - jupyter-core==5.8.1
+       - jupyter-events==0.12.0
+       - jupyter-lsp==2.2.5
+       - jupyter-server==2.16.0
+       - jupyter-server-terminals==0.5.3
+       - jupyterlab==4.4.3
+       - jupyterlab-pygments==0.3.0
+       - jupyterlab-server==2.27.3
+       - jupyterlab-widgets==3.0.15
+       - kiwisolver==1.4.8
+       - markdown==3.8
+       - markdown-it-py==3.0.0
+       - markupsafe==3.0.2
+       - matplotlib==3.10.3
+       - matplotlib-inline==0.1.7
+       - mdurl==0.1.2
+       - mistune==3.1.3
+       - mpmath==1.3.0
+       - msgpack==1.1.0
+       - multidict==6.4.4
+       - multiprocess==0.70.16
+       - nbclient==0.10.2
+       - nbconvert==7.16.6
+       - nbformat==5.10.4
+       - nest-asyncio==1.6.0
+       - networkx==3.3
+       - ninja==1.11.1.4
+       - notebook==7.4.3
+       - notebook-shim==0.2.4
+       - numpy==2.1.2
+       - nvidia-cublas-cu11==11.11.3.6
+       - nvidia-cuda-cupti-cu11==11.8.87
+       - nvidia-cuda-nvrtc-cu11==11.8.89
+       - nvidia-cuda-runtime-cu11==11.8.89
+       - nvidia-cudnn-cu11==9.1.0.70
+       - nvidia-cufft-cu11==10.9.0.58
+       - nvidia-curand-cu11==10.3.0.86
+       - nvidia-cusolver-cu11==11.4.1.48
+       - nvidia-cusparse-cu11==11.7.5.86
+       - nvidia-ml-py==12.575.51
+       - nvidia-nccl-cu11==2.21.5
+       - nvidia-nvtx-cu11==11.8.86
+       - omegaconf==2.3.0
+       - opencv-python==4.11.0.86
+       - overrides==7.7.0
+       - packaging==25.0
+       - pandas==2.3.0
+       - pandocfilters==1.5.1
+       - parso==0.8.4
+       - peft==0.10.0
+       - pexpect==4.9.0
+       - pillow==11.0.0
+       - platformdirs==4.3.8
+       - prometheus-client==0.22.0
+       - prompt-toolkit==3.0.51
+       - propcache==0.3.2
+       - protobuf==6.31.1
+       - psutil==7.0.0
+       - ptyprocess==0.7.0
+       - pure-eval==0.2.3
+       - py-cpuinfo==9.0.0
+       - pyarrow==20.0.0
+       - pycparser==2.22
+       - pydantic==2.11.5
+       - pydantic-core==2.33.2
+       - pygments==2.19.1
+       - pyparsing==3.2.3
+       - python-dateutil==2.9.0.post0
+       - python-json-logger==3.3.0
+       - pytz==2025.2
+       - pyyaml==6.0.2
+       - pyzmq==26.4.0
+       - prettytable==3.8.0
+       - qwen-vl-utils==0.0.11
+       - referencing==0.36.2
+       - regex==2024.11.6
+       - requests==2.32.3
+       - rfc3339-validator==0.1.4
+       - rfc3986-validator==0.1.1
+       - rich==14.0.0
+       - rpds-py==0.25.1
+       - safetensors==0.5.3
+       - send2trash==1.8.3
+       - sentencepiece==0.2.0
+       - shtab==1.7.2
+       - six==1.17.0
+       - sniffio==1.3.1
+       - soupsieve==2.7
+       - stack-data==0.6.3
+       - sympy==1.13.1
+       - tensorboard==2.19.0
+       - tensorboard-data-server==0.7.2
+       - termcolor==3.1.0
+       - terminado==0.18.1
+       - timm==1.0.15
+       - tinycss2==1.4.0
+       - tokenizers==0.20.3
+       - tomli==2.2.1
+       - torch==2.6.0
+       - torchaudio==2.6.0
+       - torchvision==0.21.0
+       - tornado==6.5.1
+       - tqdm==4.67.1
+       - traitlets==5.14.3
+       - transformers==4.45.2
+       - triton==3.2.0
+       - trl==0.8.6
+       - typeguard==4.4.3
+       - types-python-dateutil==2.9.0.20250516
+       - typing-extensions==4.14.0
+       - typing-inspection==0.4.1
+       - tyro==0.9.24
+       - tzdata==2025.2
+       - uri-template==1.3.0
+       - urllib3==2.4.0
+       - wcwidth==0.2.13
+       - webcolors==24.11.1
+       - webencodings==0.5.1
+       - websocket-client==1.8.0
+       - werkzeug==3.1.3
+       - widgetsnbextension==4.0.14
+       - xxhash==3.5.0
+       - yarl==1.20.1
+       - zipp==3.22.0
evaluate/README.md ADDED
@@ -0,0 +1,107 @@
+ ## Model Performance Evaluation (`evaluate.py`)
+
+ This script evaluates the model's performance on a test set. It can operate in two modes:
+
+ - **`pair`**: Calculates pairwise accuracy, i.e. the fraction of pairs in which the preferred image receives a strictly higher reward (ties are counted separately).
+ - **`ranking`**: Calculates ranking accuracy over all image pairs within each sample.
+
+ **Pair-wise Sample**
+
+ For simplicity, the convention is that the image at `path1` is always the better one of the pair (`path2` holds the worse image).
+
+ ```json
+ [
+     {
+         "prompt": ".....",
+         "path1": ".....",
+         "path2": "....."
+     },
+     {
+         "prompt": ".....",
+         "path1": ".....",
+         "path2": "....."
+     },
+     ...
+ ]
+ ```
+
+ **Rank-wise Sample**
+
+ ```json
+ [
+     {
+         "id": "005658-0040",
+         "prompt": ".....",
+         "generations": [
+             "path to image1",
+             "path to image2",
+             "path to image3",
+             "path to image4"
+         ],
+         "ranking": [
+             1,
+             2,
+             5,
+             3
+         ]
+     },
+     ...
+ ]
+ ```
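+
+ Ranking accuracy is computed over all image pairs within a sample: a smaller ranking number means a better image, and a pair counts as correct when the better image receives a strictly higher reward (reward ties count as incorrect; pairs with equal ground-truth ranks are skipped). A minimal sketch of the metric, mirroring `calc_rank_acc` in `evaluate/evaluate.py` (the function name is illustrative):
+
+ ```python
+ def rank_accuracy(ranking, rewards):
+     """Fraction of image pairs whose predicted reward order matches the ranking."""
+     correct, total = 0, 0
+     for i in range(len(ranking)):
+         for j in range(i + 1, len(ranking)):
+             if ranking[i] == ranking[j]:
+                 continue  # no ground-truth preference for this pair
+             total += 1
+             if ranking[i] < ranking[j]:  # image i is ranked better
+                 correct += rewards[i] > rewards[j]
+             else:                        # image j is ranked better
+                 correct += rewards[j] > rewards[i]
+     return correct / total
+
+ # Using the sample above: rank_accuracy([1, 2, 5, 3], [0.9, 0.7, 0.1, 0.4]) == 1.0
+ ```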
+
+ ### Usage
+
+ ```bash
+ python evaluate/evaluate.py \
+     --test_json /path/to/your/test_data.json \
+     --config_path config/HPSv3_7B.yaml \
+     --checkpoint_path checkpoints/HPSv3_7B/model.pth \
+     --mode pair \
+     --batch_size 8 \
+     --num_processes 8
+ ```
+
+ **Arguments:**
+
+ - `--test_json`: (Required) Path to the JSON file containing evaluation data.
+ - `--config_path`: (Required) Path to the model's configuration file.
+ - `--checkpoint_path`: (Required) Path to the model checkpoint.
+ - `--mode`: The evaluation mode. Can be `pair` or `ranking`. (Default: `pair`)
+ - `--batch_size`: Batch size for inference. (Default: 8)
+ - `--num_processes`: Number of parallel processes to use. (Default: 8)
+
+ ---
+
+ ## Reward Benchmarking (`benchmark.py`)
+
+ This script runs inference with a reward model over one or more folders of images. It computes a reward score for each image from its corresponding text prompt, which is expected in a `.txt` file with the same base name. The script then prints statistics (count, mean, std, min, max) for each folder and saves the detailed results to a JSON file.
+
+ It supports multiple reward models through the `--mode` argument.
+
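+ For example, a benchmark folder is expected to look like this (file names are illustrative):
+
+ ```
+ folder1/
+ ├── 000001.png
+ ├── 000001.txt   # prompt for 000001.png
+ ├── 000002.jpg
+ └── 000002.txt
+ ```
+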
+ ### Usage
+
+ The script is run using `argparse`. Below is a command-line example:
+
+ ```bash
+ python evaluate/benchmark.py \
+     --config_path config/HPSv3_7B.yaml \
+     --checkpoint_path checkpoints/HPSv3_7B/model.pth \
+     --mode hpsv3 \
+     --image_folders /path/to/images/folder1 /path/to/images/folder2 \
+     --output_path ./benchmark_results.json \
+     --batch_size 16 \
+     --num_processes 8
+ ```
+
+ **Arguments:**
+
+ - `--config_path`: (Required) Path to the model's configuration file.
+ - `--checkpoint_path`: (Required) Path to the model checkpoint.
+ - `--mode`: The reward model to use. Choices: `hpsv3`, `hpsv2`, `imagereward`, `pickscore`, `aesthetic`, `clip`. (Default: `hpsv3`)
+ - `--image_folders`: (Required) One or more paths to folders containing the images to benchmark.
+ - `--output_path`: (Required) Path to save the output JSON file with results.
+ - `--batch_size`: Batch size for processing. (Default: 16)
+ - `--num_processes`: Number of parallel processes to use. (Default: 8)
+ - `--num_machines`: For distributed inference, the total number of machines. (Default: 1)
+ - `--machine_id`: For distributed inference, the ID of the current machine. (Default: 0)
+
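+ For multi-machine runs, launch the same command on every machine with a distinct `--machine_id`. Each machine writes its own `<output>_machine_<id>.json` file, and machine 0 merges whatever machine files already exist into the final output, so run it after the other machines have finished (a sketch; `...` stands for the arguments shown above):
+
+ ```bash
+ # machine 1 scores its shard and writes benchmark_results_machine_1.json
+ python evaluate/benchmark.py ... --num_machines 2 --machine_id 1
+ # machine 0 scores its shard, then merges all *_machine_*.json files it finds
+ python evaluate/benchmark.py ... --num_machines 2 --machine_id 0
+ ```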
evaluate/benchmark.py ADDED
@@ -0,0 +1,463 @@
+ import os
+ import json
+ import torch
+ import multiprocessing as mp
+ from tqdm import tqdm
+ from hpsv3.inference import HPSv3RewardInferencer
+ import argparse
+ from collections import defaultdict
+ import glob
+ import numpy as np
+ from hpsv2.src.open_clip import create_model_and_transforms, get_tokenizer
+ from PIL import Image
+ import ImageReward as RM
+ from transformers import AutoProcessor, AutoModel
+
+
+ def initialize_model_hpsv2(device, cp):
+     model_dict = {}
+     model, preprocess_train, preprocess_val = create_model_and_transforms(
+         'ViT-H-14',
+         'laion2B-s32B-b79K',
+         precision='amp',
+         device=device,
+         jit=False,
+         force_quick_gelu=False,
+         force_custom_text=False,
+         force_patch_dropout=False,
+         force_image_size=None,
+         pretrained_image=False,
+         image_mean=None,
+         image_std=None,
+         light_augmentation=True,
+         aug_cfg={},
+         output_dict=True,
+         with_score_predictor=False,
+         with_region_predictor=False
+     )
+
+     checkpoint = torch.load(cp, map_location=device, weights_only=False)
+     model.load_state_dict(checkpoint['state_dict'])
+     model = model.to(device)
+     model.eval()
+     tokenizer = get_tokenizer('ViT-H-14')
+
+     model_dict['model'] = model
+     model_dict['preprocess_val'] = preprocess_val
+     return model_dict, tokenizer
+
+ def initialize_pickscore(device, checkpoint_path):
+     processor = AutoProcessor.from_pretrained('laion/CLIP-ViT-H-14-laion2B-s32B-b79K')
+     model = AutoModel.from_pretrained(checkpoint_path).eval().to(device)
+     return model, processor
+
+ def initialize_aesthetic_model():
+     import open_clip
+     from os.path import expanduser
+     from urllib.request import urlretrieve
+     import torch.nn as nn
+
+     def get_aesthetic_model(clip_model="vit_l_14"):
+         """Load the aesthetic model with caching"""
+         home = expanduser("~")
+         cache_folder = home + "/.cache/emb_reader"
+         path_to_model = cache_folder + "/sa_0_4_" + clip_model + "_linear.pth"
+         if not os.path.exists(path_to_model):
+             os.makedirs(cache_folder, exist_ok=True)
+             url_model = (
+                 "https://github.com/LAION-AI/aesthetic-predictor/blob/main/sa_0_4_" + clip_model + "_linear.pth?raw=true"
+             )
+             urlretrieve(url_model, path_to_model)
+         # Create the appropriate linear head for the chosen CLIP backbone
+         if clip_model == "vit_l_14":
+             m = nn.Linear(768, 1)
+         elif clip_model == "vit_b_32":
+             m = nn.Linear(512, 1)
+         else:
+             raise ValueError(f"Unsupported clip_model: {clip_model}")
+         m.load_state_dict(torch.load(path_to_model))
+         m.eval()
+         return m
+
+     model, _, preprocess = open_clip.create_model_and_transforms('ViT-L-14', pretrained='openai')
+     amodel = get_aesthetic_model(clip_model="vit_l_14")
+     return model, preprocess, amodel
+
+ def initialize_clip(device):
+     """Initialize the CLIP model and processor."""
+     model = AutoModel.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
+     processor = AutoProcessor.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
+     return model.to(device), processor
+
+ def score_hpsv2_batch(model_dict, tokenizer, device, img_paths: list, prompts: list) -> list:
+     model = model_dict['model']
+     preprocess_val = model_dict['preprocess_val']
+
+     # Preprocess the images as a batch, keeping only the RGB channels
+     images = [preprocess_val(Image.open(p)).unsqueeze(0)[:, :3, :, :] for p in img_paths]
+     images = torch.cat(images, dim=0).to(device=device)
+     texts = tokenizer(prompts).to(device=device)
+     with torch.no_grad():
+         outputs = model(images, texts)
+         image_features, text_features = outputs["image_features"], outputs["text_features"]
+         logits_per_image = image_features @ text_features.T
+         hps_scores = torch.diagonal(logits_per_image).cpu()
+     return hps_scores
+
+ def score_pick_score_batch(prompts, images, model, processor, device):
+     # preprocess
+     pil_images = [Image.open(p) for p in images]
+     image_inputs = processor(
+         images=pil_images,
+         padding=True,
+         truncation=True,
+         max_length=77,
+         return_tensors="pt",
+     ).to(device)
+
+     text_inputs = processor(
+         text=prompts,
+         padding=True,
+         truncation=True,
+         max_length=77,
+         return_tensors="pt",
+     ).to(device)
+
+     with torch.no_grad():
+         # embed
+         image_embs = model.get_image_features(**image_inputs)
+         image_embs = image_embs / torch.norm(image_embs, dim=-1, keepdim=True)
+
+         text_embs = model.get_text_features(**text_inputs)
+         text_embs = text_embs / torch.norm(text_embs, dim=-1, keepdim=True)
+         # score
+         scores = model.logit_scale.exp() * (text_embs @ image_embs.T)
+         scores = torch.diagonal(scores).cpu()
+
+     return scores
+
+
+ def score_aesthetic_batch(model, preprocess, aesthetic_model, device, img_paths: list) -> list:
+     """Scores a batch of images using the aesthetic model."""
+     images = [preprocess(Image.open(p)).unsqueeze(0) for p in img_paths]
+     images = torch.cat(images, dim=0).to(device=device)
+     with torch.no_grad():
+         feat = model.encode_image(images)
+         feat = feat / feat.norm(dim=-1, keepdim=True)
+         pred = aesthetic_model(feat).cpu()
+     return pred
+
+ def score_clip_batch(model, processor, device, img_paths: list, prompts: list) -> list:
+     """Scores a batch of images against prompts using CLIP."""
+     # preprocess
+     pil_images = [Image.open(p) for p in img_paths]
+     image_inputs = processor(
+         images=pil_images,
+         padding=True,
+         truncation=True,
+         max_length=77,
+         return_tensors="pt",
+     ).to(device)
+
+     text_inputs = processor(
+         text=prompts,
+         padding=True,
+         truncation=True,
+         max_length=77,
+         return_tensors="pt",
+     ).to(device)
+
+     with torch.no_grad():
+         # embed
+         image_embs = model.get_image_features(**image_inputs)
+         image_embs = image_embs / torch.norm(image_embs, dim=-1, keepdim=True)
+
+         text_embs = model.get_text_features(**text_inputs)
+         text_embs = text_embs / torch.norm(text_embs, dim=-1, keepdim=True)
+         # score: cosine similarity between matched image/text pairs
+         scores = image_embs @ text_embs.T
+         scores = torch.diagonal(scores).cpu()
+
+     return scores
+
+ def calculate_category_stats(data_dict):
+     """Calculate statistics for each category"""
+     stats = {}
+     for category, data_list in data_dict.items():
+         if not data_list:
+             stats[category] = {
+                 'count': 0,
+                 'mean': 0.0,
+                 'std': 0.0,
+                 'min': 0.0,
+                 'max': 0.0
+             }
+             continue
+
+         rewards = [item['reward'] for item in data_list]
+         stats[category] = {
+             'count': len(rewards),
+             'mean': float(np.mean(rewards)),
+             'std': float(np.std(rewards)),
+             'min': float(np.min(rewards)),
+             'max': float(np.max(rewards))
+         }
+     total_mean = np.mean([stat['mean'] for stat in stats.values() if stat['count'] > 0])
+     stats['OVERALL'] = {
+         'count': sum(stat['count'] for stat in stats.values()),
+         'mean': float(total_mean),
+         'std': float(np.std([stat['mean'] for stat in stats.values() if stat['count'] > 0])),
+         'min': float(min(stat['min'] for stat in stats.values() if stat['count'] > 0)),
+         'max': float(max(stat['max'] for stat in stats.values() if stat['count'] > 0))
+     }
+     return stats
+
+ def print_stats(stats):
+     print(f"{'Category':<30} {'Count':<8} {'Mean':<10} {'Std':<10} {'Min':<10} {'Max':<10}")
+     print("-" * 78)
+     for category, stat in stats.items():
+         category_name = category  # the folder path serves as the category name
+         print(f"{category_name:<30} {stat['count']:<8} {stat['mean']:<10.4f} {stat['std']:<10.4f} {stat['min']:<10.4f} {stat['max']:<10.4f}")
+
+     # Calculate overall statistics
+     if stats:
+         all_counts = [stat['count'] for stat in stats.values()]
+         all_means = [stat['mean'] for stat in stats.values() if stat['count'] > 0]
+         if all_means:
+             print("-" * 78)
+             print(f"{'OVERALL':<30} {sum(all_counts):<8} {np.mean(all_means):<10.4f} {'':<10} {min([stat['min'] for stat in stats.values() if stat['count'] > 0]):<10.4f} {max([stat['max'] for stat in stats.values() if stat['count'] > 0]):<10.4f}")
+
+ def worker_process(process_id, process_dict, config_path, checkpoint_path, mode, device_id, dtype, batch_size, return_dict):
+     """Worker process function that processes a chunk of data"""
+     category_rewards = defaultdict(list)
+
+     device = f"cuda:{device_id}" if torch.cuda.is_available() else "cpu"
+     if mode == 'imagereward':
+         model = RM.load("ImageReward-v1.0")
+     elif mode == 'hpsv2':
+         inferencer = initialize_model_hpsv2(device, checkpoint_path)
+         model_dict, tokenizer = inferencer
+     elif mode == 'hpsv3':
+         inferencer = HPSv3RewardInferencer(config_path=config_path, checkpoint_path=checkpoint_path, device=device)
+     elif mode == 'pickscore':
+         model, processor = initialize_pickscore(device, checkpoint_path)
+     elif mode == 'aesthetic':
+         model, preprocess, aesthetic_model = initialize_aesthetic_model()
+         model = model.to(device)
+         aesthetic_model = aesthetic_model.to(device)
+     elif mode == 'clip':
+         model, processor = initialize_clip(device)
+         model = model.to(device)
+     else:
+         raise ValueError(f"Unsupported mode: {mode}")
+
+     for category, chunk_data in tqdm(process_dict.items(), total=len(process_dict), desc='Total', disable=process_id != 0):
+         processed_data = []
+         # Process data in batches
+         for batch_start in tqdm(range(0, len(chunk_data), batch_size),
+                                 total=(len(chunk_data) + batch_size - 1) // batch_size,
+                                 desc=f"Category {category}", disable=process_id != 0):
+             batch_end = min(batch_start + batch_size, len(chunk_data))
+             image_paths = chunk_data[batch_start:batch_end]
+             text_paths = [p[:-4] + '.txt' for p in image_paths]
+
+             # The full contents of each .txt file serve as the prompt for its image
+             prompts = [open(p, 'r').read() for p in text_paths]
+
+             with torch.no_grad():
+                 if mode == 'imagereward':
+                     rewards = torch.tensor([model.score(prompt, image_path) for prompt, image_path in zip(prompts, image_paths)])
+                 elif mode == 'hpsv2':
+                     rewards = score_hpsv2_batch(model_dict, tokenizer, device, image_paths, prompts)
+                 elif mode == 'hpsv3':
+                     rewards = inferencer.reward(image_paths, prompts)
+                 elif mode == 'pickscore':
+                     rewards = score_pick_score_batch(prompts, image_paths, model, processor, device)
+                 elif mode == 'aesthetic':
+                     rewards = score_aesthetic_batch(model, preprocess, aesthetic_model, device, image_paths)
+                 elif mode == 'clip':
+                     rewards = score_clip_batch(model, processor, device, image_paths, prompts)
+                 else:
+                     raise ValueError(f"Unsupported mode: {mode}")
+
+             torch.cuda.empty_cache()
+             for i in range(len(image_paths)):
+                 if rewards.ndim == 2:
+                     reward = rewards[i][0].item()
+                 else:
+                     reward = rewards[i].item()
+                 processed_data.append({
+                     'image_path': image_paths[i],
+                     'reward': reward,
+                     'prompt': prompts[i]
+                 })
+
+         category_rewards[category] = processed_data
+
+     return_dict[process_id] = {
+         'data': category_rewards,
+     }
+
+ def chunk_list(data_list, num_chunks):
+     """Split list into roughly equal chunks"""
+     chunk_size = len(data_list) // num_chunks
+     remainder = len(data_list) % num_chunks
+
+     chunks = []
+     start = 0
+     for i in range(num_chunks):
+         # Add one extra item to the first 'remainder' chunks
+         current_chunk_size = chunk_size + (1 if i < remainder else 0)
+         end = start + current_chunk_size
+         chunks.append(data_list[start:end])
+         start = end
+
+     return chunks
+
+ def main(config_path, checkpoint_path, mode, image_folders, output_path, batch_size=16, num_processes=8, num_machines=1, machine_id=0):
+     print(f"Config path: {config_path}")
+
+     dtype = torch.bfloat16
+
+     # Gather all data first
+     folder_dict = {}
+     for folder in image_folders:
+         images = []
+         for ext in ['.png', '.jpg']:
+             images.extend(glob.glob(os.path.join(folder, "**", f"*{ext}"), recursive=True))
+         machine_image_chunks = chunk_list(images, num_machines)
+         image_list = machine_image_chunks[machine_id] if machine_id < len(machine_image_chunks) else []
+         print(f"Folder {folder} total data points: {len(image_list)}")
+         data_chunks = chunk_list(image_list, num_processes)
+         print(f"Folder {folder} data split into {num_processes} chunks with sizes: {[len(chunk) for chunk in data_chunks]}")
+         folder_dict[folder] = data_chunks
+
+     per_process_folder_dict = []
+     for i in range(num_processes):
+         one_dict = {}
+         for key, value in folder_dict.items():
+             one_dict[key] = value[i] if i < len(value) else []
+         per_process_folder_dict.append(one_dict)
+
+     # Create manager for shared data between processes
+     with mp.Manager() as manager:
+         return_dict = manager.dict()
+         processes = []
+
+         # Start processes
+         for i in range(num_processes):
+             device_id = i % torch.cuda.device_count() if torch.cuda.is_available() else 0
+
+             p = mp.Process(target=worker_process,
+                            args=(i, per_process_folder_dict[i], config_path, checkpoint_path, mode, device_id, dtype, batch_size, return_dict))
+             p.start()
+             processes.append(p)
+
+         for p in processes:
+             p.join()
+
+         # Collect results from all processes
+         all_processed_data = {}
+         for i in range(num_processes):
+             if i in return_dict:
+                 result = return_dict[i]
+                 process_data = result['data']
+                 # Merge data from each process
+                 for category, data_list in process_data.items():
+                     if category not in all_processed_data:
+                         all_processed_data[category] = []
+                     all_processed_data[category].extend(data_list)
+             else:
+                 print(f"No result from process {i}")
+
+     # Calculate and print statistics for current machine
+     if all_processed_data:
+         stats = calculate_category_stats(all_processed_data)
+         print(f"\n=== Machine {machine_id} Statistics ===")
+         print_stats(stats)
+
+     # Save results
+     if num_machines > 1:
+         # Save current machine's results
+         machine_output_path = output_path.replace('.json', f'_machine_{machine_id}.json')
+         with open(machine_output_path, "w") as f:
+             json.dump(all_processed_data, f, indent=4)
+         print(f"Machine {machine_id} results saved to {machine_output_path}")
+
+         # If this is machine 0, try to gather results from all machines
+         if machine_id == 0:
+             print("Waiting for all machines to complete...")
+             # Note: In practice, you might want to implement a proper synchronization mechanism
+             # For now, this assumes all machine files exist
+             final_result = {}
+             for i in range(num_machines):
+                 machine_file = output_path.replace('.json', f'_machine_{i}.json')
+                 if os.path.exists(machine_file):
+                     print(f"Loading results from machine {i}")
+                     with open(machine_file, 'r') as f:
+                         machine_data = json.load(f)
+                     # Merge machine data
+                     for category, data_list in machine_data.items():
+                         if category not in final_result:
+                             final_result[category] = []
+                         final_result[category].extend(data_list)
+                 else:
+                     print(f"Warning: Machine {i} results file not found: {machine_file}")
+
+             # Calculate and print statistics for final results
+             stats = calculate_category_stats(final_result)
+             print("\n=== Final Combined Statistics ===")
+             print_stats(stats)
+
+             # Save final combined results with statistics
+             final_output = {
+                 'statistics': stats,
+                 'data': final_result,
+             }
+             with open(output_path, "w") as f:
+                 json.dump(final_output, f, indent=4)
+             print(f"Final combined results saved to {output_path}")
+     else:
+         # Single machine case - calculate statistics
+         stats = calculate_category_stats(all_processed_data)
+         print("\n=== Statistics ===")
+         print_stats(stats)
+
+         # Save results with statistics
+         output_data = {
+             'statistics': stats,
+             'data': all_processed_data,
+         }
+         with open(output_path, "w") as f:
+             json.dump(output_data, f, indent=4)
+         print(f"Results saved to {output_path}")
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description='Process images with HPSv3 reward inference')
+     parser.add_argument('--config_path', type=str, help='Path to the configuration file')
+     parser.add_argument('--checkpoint_path', type=str, help='Path to the model checkpoint file')
+     parser.add_argument('--mode', type=str, choices=['imagereward', 'hpsv2', 'hpsv3', 'pickscore', 'aesthetic', 'clip'], default='hpsv3')
+     parser.add_argument('--image_folders', type=str, nargs='+', required=True, help='List of image folder paths to process')
+     parser.add_argument('--output_path', type=str, required=True, help='Path to save the output JSON file')
+     parser.add_argument('--batch_size', type=int, default=16, help='Batch size for processing (default: 16)')
+     parser.add_argument('--num_processes', type=int, default=8, help='Number of processes to use (default: 8)')
+     parser.add_argument('--num_machines', type=int, default=1, help='Total number of machines (default: 1)')
+     parser.add_argument('--machine_id', type=int, default=0, help='ID of current machine (default: 0)')
+
+     return parser.parse_args()
+
+
+ if __name__ == "__main__":
+     mp.set_start_method('spawn', force=True)
+
+     args = parse_args()
+     main(
+         config_path=args.config_path,
+         checkpoint_path=args.checkpoint_path,
+         mode=args.mode,
+         image_folders=args.image_folders,
+         output_path=args.output_path,
+         batch_size=args.batch_size,
+         num_processes=args.num_processes,
+         num_machines=args.num_machines,
+         machine_id=args.machine_id
+     )
evaluate/evaluate.py ADDED
@@ -0,0 +1,203 @@
+ import os
+ import json
+ import torch
+ import multiprocessing as mp
+ from tqdm import tqdm
+ from hpsv3.inference import HPSv3RewardInferencer
+ from multiprocessing import Process, Queue
+ import math
+ import fire
+ import prettytable
+
+ def calc_rank_acc(score_sample, predict_sample):
+     """Pairwise rank accuracy: a smaller ranking number means a better image,
+     and a pair is correct only when the better image gets a strictly higher reward."""
+     tol_cnt = 0.
+     true_cnt = 0.
+     for idx in range(len(score_sample)):
+         item_base = score_sample[idx]["ranking"]
+         item = predict_sample[idx]["rewards"]
+         for i in range(len(item_base)):
+             for j in range(i + 1, len(item_base)):
+                 if item_base[i] > item_base[j]:
+                     if item[i] >= item[j]:
+                         tol_cnt += 1
+                     elif item[i] < item[j]:
+                         tol_cnt += 1
+                         true_cnt += 1
+                 elif item_base[i] < item_base[j]:
+                     if item[i] > item[j]:
+                         tol_cnt += 1
+                         true_cnt += 1
+                     elif item[i] <= item[j]:
+                         tol_cnt += 1
+     return true_cnt / tol_cnt
+
+
+ def worker_process(process_id, data_chunk, config_path, checkpoint_path, batch_size, result_queue, mode):
+     """
+     Worker function for each process to handle a chunk of data
+     """
+     # Each process uses a different GPU (cycle through available GPUs)
+     num_gpus = torch.cuda.device_count()
+     device = f"cuda:{process_id % num_gpus}" if num_gpus > 0 else "cpu"
+     dtype = torch.bfloat16
+
+     print(f"Process {process_id} starting with device {device}, processing {len(data_chunk)} items")
+
+     # Initialize model for this process
+     inferencer = HPSv3RewardInferencer(config_path, checkpoint_path, device=device, dtype=dtype)
+
+     process_correct = 0
+     process_equal = 0
+     process_results = []
+
+     for batch_start in tqdm(range(0, len(data_chunk), batch_size),
+                             total=(len(data_chunk) + batch_size - 1) // batch_size,
+                             desc=f"Process {process_id}"):
+         batch_end = min(batch_start + batch_size, len(data_chunk))
+         batch_info = data_chunk[batch_start:batch_end]
+         if mode == 'pair':
+             image_paths_1 = [info["path1"] for info in batch_info]
+             image_paths_2 = [info["path2"] for info in batch_info]
+             prompts = [info["prompt"] for info in batch_info]
+
+             with torch.no_grad():
+                 rewards_1 = inferencer.reward(image_paths_1, prompts)
+                 rewards_2 = inferencer.reward(image_paths_2, prompts)
+
+             for i in range(len(batch_info)):
+                 info = batch_info[i]
+                 if rewards_1.ndim == 2:
+                     reward_1, reward_2 = rewards_1[i][0].item(), rewards_2[i][0].item()
+                 else:
+                     reward_1, reward_2 = rewards_1[i].item(), rewards_2[i].item()
+
+                 item_result = {
+                     'reward_1': reward_1,
+                     'reward_2': reward_2,
+                     'correct': reward_1 > reward_2,
+                     'equal': reward_1 == reward_2,
+                     'info': info
+                 }
+                 process_results.append(item_result)
+
+                 print(f"Process {process_id} - Reward 1: {reward_1}, Reward 2: {reward_2}")
+                 if reward_1 > reward_2:
+                     process_correct += 1
+                 if reward_1 == reward_2:
+                     process_equal += 1
+
+         elif mode == 'ranking':
+             for item in batch_info:
+                 rewards = inferencer.reward(item["generations"], item["prompt"])
+                 predict_item = {
+                     "id": item["id"],
+                     "prompt": item["prompt"],
+                     "rewards": rewards
+                 }
+                 process_results.append(predict_item)
+
+     # Put results in queue
+     if mode == 'pair':
+         result_queue.put({
+             'process_id': process_id,
+             'correct': process_correct,
+             'equal': process_equal,
+             'total': len(data_chunk),
+             'results': process_results
+         })
+     elif mode == 'ranking':
+         result_queue.put({
+             'process_id': process_id,
+             'results': process_results
+         })
+
+     print(f"Process {process_id} completed: {process_correct}/{len(data_chunk)} correct, {process_equal}/{len(data_chunk)} equal")
+
+ def main(test_json, config_path=None, batch_size=8, num_processes=8, checkpoint_path=None, mode='pair'):
+     assert mode in ['pair', 'ranking'], "Mode must be either 'pair' or 'ranking'"
+     assert checkpoint_path is not None, "Checkpoint path must be provided for inference"
+
+     mp.set_start_method('spawn', force=True)
+
+     info_list = json.load(open(test_json, "r"))
+
+     print(f"Total items to process: {len(info_list)}")
+     # Split data into chunks for each process
+     chunk_size = math.ceil(len(info_list) / num_processes)
+     data_chunks = []
+     for i in range(num_processes):
+         start_idx = i * chunk_size
+         end_idx = min((i + 1) * chunk_size, len(info_list))
+         if start_idx < len(info_list):
+             chunk = info_list[start_idx:end_idx]
+             data_chunks.append(chunk)
+             print(f"Process {i}: {len(chunk)} items (indices {start_idx}-{end_idx - 1})")
+
+     # Ensure we have the right number of non-empty chunks
+     actual_processes = len(data_chunks)
+     print(f"Using {actual_processes} processes")
+
+     # Create result queue and processes
+     result_queue = Queue()
+     processes = []
+
+     print("Starting processes...")
+     for i in range(actual_processes):
+         p = Process(target=worker_process, args=(i, data_chunks[i], config_path, checkpoint_path, batch_size, result_queue, mode))
+         p.start()
+         processes.append(p)
+
+     # Wait for all processes to complete and collect results
+     all_results = []
+     total_correct = 0
+     total_equal = 0
+     total_items = 0
+
+     print("Waiting for processes to complete...")
+     for i in range(actual_processes):
+         result = result_queue.get()
+         all_results.append(result)
+         if mode == 'pair':
+             total_correct += result['correct']
+             total_equal += result['equal']
+             total_items += result['total']
+
+             print(f"Process {result['process_id']} finished: {result['correct']}/{result['total']} correct, {result['equal']}/{result['total']} equal")
+
+     # Wait for all processes to join
+     for p in processes:
+         p.join()
+
+     if mode == 'pair':
+         aggregated_results = {
+             'total_correct': total_correct,
+             'total_equal': total_equal,
+             'total_items': total_items,
+             'accuracy': total_correct / total_items,
+             'process_results': all_results
+         }
+         table = prettytable.PrettyTable()
+         table.field_names = ["Total Items", "Correct", "Equal", "Incorrect", "Accuracy (%)"]
+
+         incorrect = aggregated_results['total_items'] - aggregated_results['total_correct'] - aggregated_results['total_equal']
+         accuracy_percent = 100 * aggregated_results['total_correct'] / aggregated_results['total_items']
+
+         table.add_row([
+             aggregated_results['total_items'],
+             aggregated_results['total_correct'],
+             aggregated_results['total_equal'],
+             incorrect,
+             f"{accuracy_percent:.2f}"
+         ])
+     elif mode == 'ranking':
+         # Queue results arrive in arbitrary order; restore chunk order so the
+         # concatenated predictions line up with the items in info_list
+         all_results.sort(key=lambda r: r['process_id'])
+         predict_sample = [item for r in all_results for item in r['results']]
+         rank_acc = calc_rank_acc(info_list, predict_sample)
+         table = prettytable.PrettyTable()
+         table.field_names = ["Total Items", "Rank Accuracy (%)"]
+         table.add_row([len(info_list), f"{rank_acc * 100:.2f}"])
+
+     print(table)
+
+ if __name__ == "__main__":
+     fire.Fire(main)
generate/README.md ADDED
@@ -0,0 +1,104 @@
+ # Image Generation Module
+
+ This module generates images from text prompts using various pretrained diffusion models. It supports parallel generation across multiple GPUs and can easily be extended with new models.
+
+ ## File Structure
+
+ - `gen_images_from_prompt.py`: The main script for running the image generation process. It reads prompts from a JSON file and handles command-line arguments.
+ - `generator.py`: Contains the core `Generator` class, which manages the model pipelines and distributes the generation tasks across different devices.
+ - `utils/pipelines.py`: Defines the configurations for all supported pretrained models. This is where you can add or modify model parameters.
+ - `utils/utils.py`: Contains helper functions for initializing `diffusers` pipelines and interacting with model APIs.
+
+ ## How to Use
+
+ To generate images, run the main script with the required arguments.
+
+ ### Basic Command
+
+ ```bash
+ python gen_images_from_prompt.py \
+     --json_path /path/to/your/prompts.json \
+     --out_dir /path/to/your/output_directory \
+     --pipeline_name sd_xl flux_schnell
+ ```
+
+ ### Command-Line Arguments
+
+ - `--json_path` (required): Path to a JSON file containing a list of prompts. Each item in the list should be an object with a `"caption"` key.
+
+   - To generate images that match real reference images, specify `"image_file"` (the path of the original image) and its `"aspect_ratio"`. The exact height and width are then adjusted to the model's best-practice resolution.
+   - To generate images from a prompt only, specify `"save_name"`, `"height"`, and `"width"` (see the second example below).
+
+ **Example `prompts.json` format:**
+ ```json
+ [
+     {
+         "image_file": "1.jpg",
+         "caption": "A beautiful landscape painting of a mountain range at sunset.",
+         "aspect_ratio": 0.5
+     },
+     {
+         "image_file": "2.jpg",
+         "caption": "A close-up photo of a red rose with water droplets.",
+         "aspect_ratio": 1.0
+     },
+     {
+         "image_file": "3.jpg",
+         "caption": "An astronaut riding a horse on Mars, digital art.",
+         "aspect_ratio": 1.77
+     }
+ ]
+ ```
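+
+ **Example for prompt-only generation** (illustrative values; note that `"image_file"` must still be present and set to `null`, and `"aspect_ratio"` is still required because the script sorts entries by it):
+
+ ```json
+ [
+     {
+         "image_file": null,
+         "save_name": "astronaut_horse",
+         "caption": "An astronaut riding a horse on Mars, digital art.",
+         "height": 1024,
+         "width": 1024,
+         "aspect_ratio": 1.0
+     }
+ ]
+ ```
+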
+ - `--out_dir` (required): The root directory where generated images will be saved. A subdirectory will be created for each pipeline.
+ - `--pipeline_name` (required): One or more pipeline configuration names to use for generation. The script appends `_pipe` to each name and looks up the resulting `PipelineParam` variable in `utils/pipelines.py`, so `sd_xl` selects `sd_xl_pipe`.
+ - `--num_devices`: The number of GPU devices to use for generation. Defaults to `8`.
+ - `--batch_size`: The batch size per device. Defaults to `1`.
+ - `--num_machine`: The total number of machines used in a distributed setup. Defaults to `1`.
+ - `--machine_id`: The ID of the current machine in a distributed setup. Defaults to `0`.
+ - `--enable_availabel_check`: If set, the script will first run a quick check on a small batch to ensure each pipeline can be loaded and run without errors.
+ - `--reverse`: If set, the order of the specified pipelines will be reversed.
+
+ ## How to Add a New Model
+
+ You can easily add a new text-to-image model by configuring it in the `utils/pipelines.py` file.
+
+ 1. **Open `utils/pipelines.py`**.
+ 2. **Import `PipelineParam`** if it's not already imported.
+ 3. **Create a new `PipelineParam` instance** for your model. Define the following parameters:
+    - `pipeline_name`: The model's path on the Hugging Face Hub or a local directory.
+    - `generation_path`: The name of the subdirectory where the output images will be saved.
+    - `pipeline_type`: The type of pipeline, e.g., `'t2i'` (text-to-image) or `'t2v'` (text-to-video). Defaults to `'t2i'`.
+    - `pipe_init_kwargs`: A dictionary of arguments required for initializing the model pipeline (e.g., `{"torch_dtype": torch.float16}`).
+    - `generation_kwargs`: A dictionary of arguments for the generation process (e.g., `{"guidance_scale": 7.0, "num_inference_steps": 28}`).
+    - `base_resolution`: The base resolution the model was trained on (e.g., `1024`).
+    - `force_aspect_ratio`: Optionally force a specific aspect ratio (e.g., `1` for square images); see the size sketch after the example below.
+
+ **Example:**
+
+ ```python
+ from pydantic import BaseModel, Field
+ import torch
+
+ class PipelineParam(BaseModel):
+     # ... (class definition)
+
+ # Add your new model configuration
+ my_new_model_pipe = PipelineParam(
+     pipeline_name='organization/my-cool-model',
+     generation_path='generation/my_cool_model',
+     pipe_init_kwargs={
+         "torch_dtype": torch.float16,
+     },
+     base_resolution=1024,
+     generation_kwargs={
+         "guidance_scale": 5.0,
+         "num_inference_steps": 30,
+     }
+ )
+ ```
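+
+ For reference, `generator.py` derives each sample's output size from `base_resolution` and its aspect ratio (width/height), snapping both sides down to multiples of 64 so that `height * width` stays close to `base_resolution ** 2`. A minimal sketch of that computation (the function name is illustrative):
+
+ ```python
+ def resolve_size(base_resolution: int, aspect_ratio: float) -> tuple[int, int]:
+     # aspect_ratio = width / height; keeps width * height ~ base_resolution ** 2
+     height = int(base_resolution / aspect_ratio ** 0.5 // 64 * 64)
+     width = int(base_resolution * aspect_ratio ** 0.5 // 64 * 64)
+     return height, width
+
+ # resolve_size(1024, 1.77) -> (768, 1344)
+ ```
+
+ When `force_aspect_ratio` is set, the sides are instead computed as `base_resolution / force_aspect_ratio` and `base_resolution * force_aspect_ratio`, again snapped to multiples of 64.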
+
+ 4. **Run the generation script** using the name of your new `PipelineParam` variable, without the `_pipe` suffix, in the `--pipeline_name` argument.
+
+ ```bash
+ python gen_images_from_prompt.py --pipeline_name my_new_model ...
+ ```
generate/__init__.py ADDED
File without changes
generate/gen_images_from_prompt.py ADDED
@@ -0,0 +1,117 @@
+ from generator import Generator
+ import json
+ import os
+ import torch
+ import gc
+ from utils.pipelines import *
+ import argparse
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description="Generate images from prompts")
+     parser.add_argument(
+         "--json_path",
+         type=str,
+         help="Path to the prompts JSON file",
+     )
+     parser.add_argument(
+         "--out_dir",
+         type=str,
+         help="Output directory",
+     )
+     parser.add_argument("--num_devices", type=int, default=8, help="Number of devices")
+     parser.add_argument("--batch_size", type=int, default=1, help="Batch size")
+     parser.add_argument("--num_machine", type=int, default=1, help="Number of machines")
+     parser.add_argument("--machine_id", type=int, default=0, help="Machine ID")
+     parser.add_argument(
+         "--pipeline_name", type=str, nargs="+", default=None, help="Pipeline name(s), without the _pipe suffix"
+     )
+     parser.add_argument("--enable_availabel_check", action="store_true")
+     parser.add_argument("--reverse", action="store_true")
+     return parser.parse_args()
+
+
+ def main():
+     args = parse_args()
+     num_devices = args.num_devices
+     pipeline_params = [globals()[f"{name}_pipe"] for name in args.pipeline_name]
+
+     if args.reverse:
+         pipeline_params = pipeline_params[::-1]
+
+     # First check that every pipeline can be loaded and run
+     if args.enable_availabel_check:
+         print(f"Checking {len(pipeline_params)} pipelines")
+         for pipeline_param in pipeline_params:
+             generator = Generator(
+                 pipe_name=pipeline_param.pipeline_name,
+                 pipe_type=pipeline_param.pipeline_type,
+                 pipe_init_kwargs=pipeline_param.pipe_init_kwargs,
+                 num_devices=num_devices,
+             )
+
+             with open(args.json_path, "r") as f:
+                 entries = json.load(f)
+             info_dict = entries[: args.batch_size]
+             generator.generate(
+                 info_dict,
+                 os.path.join(args.out_dir, pipeline_param.generation_path),
+                 batch_size=args.batch_size,
+                 num_processes=num_devices,
+                 seed=42,
+                 weight_dtype=pipeline_param.pipe_init_kwargs["torch_dtype"],
+                 generation_kwargs=pipeline_param.generation_kwargs,
+                 base_resolution=pipeline_param.base_resolution,
+                 force_aspect_ratio=pipeline_param.force_aspect_ratio,
+             )
+             del generator
+             gc.collect()
+             torch.cuda.empty_cache()
+             print(f"Finished checking {pipeline_param.pipeline_name}")
+
+     for pipeline_param in pipeline_params:
+         generator = Generator(
+             pipe_name=pipeline_param.pipeline_name,
+             pipe_type=pipeline_param.pipeline_type,
+             pipe_init_kwargs=pipeline_param.pipe_init_kwargs,
+             num_devices=num_devices,
+         )
+
+         with open(args.json_path, "r") as f:
+             entries = json.load(f)
+
+         # Keep only this machine's shard of the entries
+         for i in range(args.num_machine):
+             start_idx = i * len(entries) // args.num_machine
+             end_idx = (
+                 (i + 1) * len(entries) // args.num_machine
+                 if i != args.num_machine - 1
+                 else len(entries)
+             )
+             if i == args.machine_id:
+                 info_dict = entries[start_idx:end_idx]
+
+         info_dict = sorted(info_dict, key=lambda x: x["aspect_ratio"])
+
+         print(f"Generating {len(info_dict)} images")
+         generator.generate(
+             info_dict,
+             os.path.join(args.out_dir, pipeline_param.generation_path),
+             batch_size=args.batch_size,
+             num_processes=num_devices,
+             seed=42,
+             weight_dtype=pipeline_param.pipe_init_kwargs["torch_dtype"],
+             generation_kwargs=pipeline_param.generation_kwargs,
+             base_resolution=pipeline_param.base_resolution,
+             force_aspect_ratio=pipeline_param.force_aspect_ratio,
+         )
+
+         print(f"Finished generating {pipeline_param.pipeline_name}")
+
+         for pipeline in generator.pipelines:
+             pipeline.to("cpu")
+         del generator
+         torch.cuda.empty_cache()
+         gc.collect()
+
+
+ if __name__ == "__main__":
+     main()
generate/generator.py ADDED
@@ -0,0 +1,211 @@
+ import torch
+ import os
+ import inspect
+ from PIL import Image
+ from tqdm import tqdm
+ from utils.utils import init_multiple_pipelines
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+
+ Image.MAX_IMAGE_PIXELS = None
+
+
+ class Generator:
+     def __init__(
+         self, pipe_name, pipe_type, pipe_init_kwargs, num_devices, device_id=None
+     ):
+         self.pipe_names = pipe_name
+         self.pipe_type = pipe_type
+         self.pipe_init_kwargs = pipe_init_kwargs
+         self.pipelines = init_multiple_pipelines(
+             pipe_name, pipe_init_kwargs, num_devices, device_id
+         )
+
+     def generate_imgs(
+         self,
+         num_device,
+         batch_size,
+         generation_path,
+         info_dict,
+         pipeline,
+         device_id,
+         weight_dtype,
+         seed,
+         base_resolution,
+         force_aspect_ratio,
+         generation_kwargs,
+     ):
+         torch.cuda.set_device(f"cuda:{device_id % num_device}")
+         device = torch.device(f"cuda:{device_id % num_device}")
+
+         # Each device handles a contiguous slice of the prompts
+         num_prompts_per_device = len(info_dict) // num_device
+         start_idx = device_id * num_prompts_per_device
+         end_idx = (
+             start_idx + num_prompts_per_device
+             if device_id != (num_device - 1)
+             else len(info_dict)
+         )
+
+         device_info_dict = info_dict[start_idx:end_idx]
+
+         print(f"Device {device} generating for prompts {start_idx} to {end_idx - 1}")
+
+         print("## Prepare generation dataset")
+
+         total_batches = len(device_info_dict) // batch_size + (
+             1 if len(device_info_dict) % batch_size != 0 else 0
+         )
+         for batch_idx in tqdm(
+             range(total_batches), desc="Pipeline: " + self.pipe_names
+         ):
+             batch_info_dict = device_info_dict[
+                 batch_idx * batch_size : (batch_idx + 1) * batch_size
+             ]
+             save_paths = []
+             for info in batch_info_dict:
+                 if info["image_file"] is not None:
+                     save_paths.append(
+                         os.path.join(generation_path, info["image_file"][:-4] + ".png")
+                     )
+                 else:
+                     save_paths.append(
+                         os.path.join(generation_path, info["save_name"] + ".png")
+                     )
+
+             # Skip samples whose output already exists
+             exists_idx = []
+             for i, save_path in enumerate(save_paths):
+                 if os.path.exists(save_path):
+                     exists_idx.append(i)
+
+             batch_info_dict = [
+                 batch_info_dict[i]
+                 for i in range(len(batch_info_dict))
+                 if i not in exists_idx
+             ]
+             if len(batch_info_dict) == 0:
+                 continue
+
+             batch_prompts = [info["caption"] for info in batch_info_dict]
+             batch_image_file = [
+                 info["image_file"] for info in batch_info_dict
+             ]
+             if batch_image_file[0] is not None:
+                 try:
+                     batch_image_sizes = [
+                         Image.open(image_file).size for image_file in batch_image_file
+                     ]
+                 except Exception:
+                     batch_image_sizes = None
+             else:
+                 batch_image_sizes = [
+                     (batch_info_dict[i]["width"], batch_info_dict[i]["height"])
+                     for i in range(len(batch_info_dict))
+                 ]
+
+             if batch_image_sizes is None:
+                 aspect_ratios = [
+                     info["aspect_ratio"] for info in batch_info_dict
+                 ]
+             else:
+                 aspect_ratios = [size[0] / size[1] for size in batch_image_sizes]
+
+             if force_aspect_ratio:
+                 height = int(base_resolution / force_aspect_ratio // 64 * 64)
+                 width = int(base_resolution * force_aspect_ratio // 64 * 64)
+             else:
+                 # Scale base_resolution by the aspect ratio to get height and width,
+                 # keeping height * width roughly equal to base_resolution ** 2
+                 height = int(base_resolution / aspect_ratios[0] ** (0.5) // 64 * 64)
+                 width = int(base_resolution * aspect_ratios[0] ** (0.5) // 64 * 64)
+             generation_kwargs.update({"height": height, "width": width})
+
+             generator = torch.Generator().manual_seed(seed + batch_idx)
+
+             # Drop size kwargs that this pipeline's __call__ does not accept
+             pipeline_signature = inspect.signature(pipeline)
+             pipeline_params = pipeline_signature.parameters.keys()
+
+             if 'height' not in pipeline_params:
+                 generation_kwargs.pop('height', None)
+                 print("Warning: Pipeline does not support 'height' parameter, removing from kwargs")
+             if 'width' not in pipeline_params:
+                 generation_kwargs.pop('width', None)
+                 print("Warning: Pipeline does not support 'width' parameter, removing from kwargs")
+
+             try:
+                 outputs = pipeline(
+                     prompt=batch_prompts, generator=generator, **generation_kwargs
+                 )
+             except Exception as e:
+                 print(e)
+                 continue
+             if self.pipe_type == "t2i":
+                 images = outputs.images
+             elif self.pipe_type == "t2v":
+                 images = outputs.frames[0]
+
+             for img, prompt, image_file, info in zip(
+                 images, batch_prompts, batch_image_file, batch_info_dict
+             ):
+                 if image_file is None:
+                     img_path = os.path.join(
+                         generation_path, info["save_name"] + ".png"
+                     )
+                 else:
+                     img_path = os.path.join(generation_path, image_file[:-4] + ".png")
+
+                 if not os.path.exists(os.path.dirname(img_path)):
+                     os.makedirs(os.path.dirname(img_path), exist_ok=True)
+                 img.save(img_path)
+                 if image_file is None:
+                     text_path = os.path.join(
+                         generation_path, info["save_name"] + ".txt"
+                     )
+                 else:
+                     text_path = os.path.join(generation_path, image_file[:-4] + ".txt")
+                 try:
+                     # Record the prompt and source identifier next to the image
+                     with open(text_path, "w") as f:
+                         f.write(prompt)
+                         f.write("\n")
+                         f.write(
+                             image_file
+                             if image_file is not None
+                             else info["save_name"]
+                         )
+                 except Exception:
+                     pass
+         return True
+
+     def generate(
+         self,
+         info_dict,
+         generation_path,
+         num_processes,
+         batch_size,
+         weight_dtype,
+         seed,
+         generation_kwargs,
+         base_resolution,
+         force_aspect_ratio,
+     ):
+         # One thread per device; each thread drives its own pipeline copy
+         with ThreadPoolExecutor(max_workers=num_processes) as executor:
+             futures = [
+                 executor.submit(
+                     self.generate_imgs,
+                     num_processes,
+                     batch_size,
+                     generation_path,
+                     info_dict,
+                     self.pipelines[device_id],
+                     device_id,
+                     weight_dtype,
+                     seed,
+                     base_resolution,
+                     force_aspect_ratio,
+                     generation_kwargs,
+                 )
+                 for device_id in range(num_processes)
+             ]
+
+             for future in as_completed(futures):
+                 print(f"Task completed: {future.result()}")
generate/utils/__init__.py ADDED
File without changes
generate/utils/pipelines.py ADDED
@@ -0,0 +1,282 @@
+ import torch
+ from pydantic import BaseModel, Field
+ from typing import Optional, Dict, Any
+
+ class PipelineParam(BaseModel):
+     pipeline_name: str
+     generation_path: str
+     pipeline_type: str = 't2i'
+     pipe_init_kwargs: Dict[str, Any] = Field(default_factory=dict)
+     generation_kwargs: Dict[str, Any] = Field(default_factory=dict)
+     base_resolution: int = 1024
+     force_aspect_ratio: Optional[int] = None
+
+ flux_dev_pipe = PipelineParam(
+     pipeline_name='pretrained_models/FLUX.1-dev',
+     generation_path='generation/flux_dev',
+     pipe_init_kwargs={
+         "torch_dtype": torch.bfloat16,
+     },
+     base_resolution=1024,
+     generation_kwargs={
+         "guidance_scale": 3.5,
+         "num_inference_steps": 28,
+         "max_sequence_length": 512,
+     }
+ )
+
+ flux_schnell_pipe = PipelineParam(
+     pipeline_name='pretrained_models/FLUX.1-schnell',
+     generation_path='generation/flux_schnell',
+     pipe_init_kwargs={
+         "torch_dtype": torch.bfloat16,
+     },
+     base_resolution=1024,
+     generation_kwargs={
+         "guidance_scale": 3.5,
+         "num_inference_steps": 4,
+     }
+ )
+
+
+ sd3_medium_pipe = PipelineParam(
+     pipeline_name='pretrained_models/stable-diffusion-3-medium-diffusers',
+     generation_path='generation/sd3_medium',
+     pipe_init_kwargs={
+         "torch_dtype": torch.float16,
+     },
+     base_resolution=1024,
+     generation_kwargs={
+         "guidance_scale": 7.0,
+         "num_inference_steps": 28,
+     }
+ )
+
+ sd_xl_pipe = PipelineParam(
+     pipeline_name='pretrained_models/stable-diffusion-xl-base-1.0',
+     generation_path='generation/sd_xl',
+     pipe_init_kwargs={
+         "torch_dtype": torch.float16,
+     },
+     base_resolution=1024,
+     generation_kwargs={
+         "guidance_scale": 5,
+         "num_inference_steps": 50,
+     }
+ )
+
+ sd_1_5_pipe = PipelineParam(
+     pipeline_name='pretrained_models/stable-diffusion-v1-5',
+     generation_path='generation/sd_1_5',
+     pipe_init_kwargs={
+         "torch_dtype": torch.float16,
+     },
+     base_resolution=512,
+     generation_kwargs={
+     }
+ )
+
+ vq_diffusion_pipe = PipelineParam(
+     pipeline_name='pretrained_models/vq-diffusion-ithq',
+     generation_path='generation/vq_diffusion',
+     pipe_init_kwargs={
+         "torch_dtype": torch.float16,
+     },
+     base_resolution=256,
+     generation_kwargs={}
+ )
+
+ sd_2_pipe = PipelineParam(
+     pipeline_name='pretrained_models/stable-diffusion-2',
+     generation_path='generation/sd_2',
+     pipe_init_kwargs={
+         "torch_dtype": torch.float16,
+     },
+     base_resolution=512,
+     force_aspect_ratio=1,
+ )
+
+ sd_1_1_pipe = PipelineParam(
+     pipeline_name='pretrained_models/stable-diffusion-v1-1',
+     generation_path='generation/sd_1_1',
+     pipe_init_kwargs={"torch_dtype": torch.float16},
+     base_resolution=512,
+     force_aspect_ratio=1,
+ )
+
+ sd_1_4_pipe = PipelineParam(
+     pipeline_name='pretrained_models/stable-diffusion-v1-4',
+     generation_path='generation/sd_1_4',
+     pipe_init_kwargs={
+         "torch_dtype": torch.float16,
+     },
+     base_resolution=512,
+     force_aspect_ratio=1,
+ )
+
+ sd_2_1_pipe = PipelineParam(
+     pipeline_name='pretrained_models/stable-diffusion-2-1-base',
+     generation_path='generation/sd_2_1',
+     pipe_init_kwargs={
+         "torch_dtype": torch.float16,
+     },
+     base_resolution=512,
+     force_aspect_ratio=1,
+ )
+
+ openjourney_pipe = PipelineParam(
+     pipeline_name='pretrained_models/openjourney',
+     generation_path='generation/openjourney',
+     pipe_init_kwargs={
+         "torch_dtype": torch.float16,
+     },
+     base_resolution=512,
+     force_aspect_ratio=1,
+ )
+
+ playground_v2_5_pipe = PipelineParam(
+     pipeline_name='pretrained_models/playground-v2.5-1024px-aesthetic',
+     generation_path='generation/playground_v_2_5',
+     pipe_init_kwargs={
+         "torch_dtype": torch.float16,
+     },
+     base_resolution=1024,
+ )
+
+ versatile_pipe = PipelineParam(
+     pipeline_name='pretrained_models/versatile-diffusion',
+     generation_path='generation/versatile',
+     pipe_init_kwargs={
+         "torch_dtype": torch.float16,
+     },
+     base_resolution=512,
+     force_aspect_ratio=1,
+ )
+
+ glide_pipe = PipelineParam(
+     pipeline_name='pretrained_models/glide-base',
+     generation_path='generation/glide',
+     pipe_init_kwargs={
+         "torch_dtype": torch.float16,
+     },
+     base_resolution=512,
+     force_aspect_ratio=1,
+ )
+
+ sd_3_5_medium_pipe = PipelineParam(
+     pipeline_name='stabilityai/stable-diffusion-3.5-medium',
+     generation_path='generation/sd_3_5_medium',
+     pipe_init_kwargs={
+         "torch_dtype": torch.bfloat16,
+     },
+     base_resolution=1024,
+     generation_kwargs={
+         "num_inference_steps": 40,
+         "guidance_scale": 4.5,
+     }
+ )
+
+ sd_3_5_large_pipe = PipelineParam(
+     pipeline_name='stabilityai/stable-diffusion-3.5-large',
+     generation_path='generation/sd_3_5_large',
+     pipe_init_kwargs={
+         "torch_dtype": torch.bfloat16,
+     },
+     base_resolution=1024,
+     generation_kwargs={
+         "num_inference_steps": 28,
+         "guidance_scale": 3.5,
+     }
+ )
+
+ kolors_pipe = PipelineParam(
+     pipeline_name='pretrained_models/Kolors-diffusers',
+     generation_path='generation/kolors',
+     pipe_init_kwargs={
+         "torch_dtype": torch.float16,
197
+ 'variant': 'fp16',
198
+ },
199
+ base_resolution=1024,
200
+ generation_kwargs={
201
+ "num_inference_steps": 50,
202
+ "guidance_scale": 5.0,
203
+ }
204
+ )
205
+
206
+ cogview4_pipe = PipelineParam(
207
+ pipeline_name='pretrained_models/CogView4-6B',
208
+ generation_path=f'generation/cogview4',
209
+ pipe_init_kwargs={
210
+ "torch_dtype": torch.bfloat16,
211
+ },
212
+ base_resolution=1024,
213
+ generation_kwargs={
214
+ "num_inference_steps": 50,
215
+ "guidance_scale": 3.5,
216
+ }
217
+ )
218
+
219
+ pixart_sigma_pipe = PipelineParam(
220
+ pipeline_name='pretrained_models/PixArt-Sigma-XL-2-1024-MS',
221
+ generation_path=f'generation/pixart_sigma',
222
+ pipeline_type='t2i',
223
+ pipe_init_kwargs={
224
+ "torch_dtype": torch.bfloat16,
225
+ },
226
+ base_resolution=1024,
227
+ )
228
+
229
+ hunyuanvideo_pipe = PipelineParam(
230
+ pipeline_name='pretrained_models/hunyuanvideo_diffusers',
231
+ generation_path=f'generation/hunyuanvideo',
232
+ pipe_init_kwargs={
233
+ "torch_dtype": torch.bfloat16,
234
+ },
235
+ base_resolution=1024,
236
+ pipeline_type='t2v',
237
+ generation_kwargs={
238
+ "num_inference_steps": 30,
239
+ "num_frames": 1,
240
+ }
241
+ )
242
+
243
+ hunyuandit_pipe = PipelineParam(
244
+ pipeline_name='pretrained_models/HunyuanDiT-v1.2-Diffusers',
245
+ generation_path=f'generation/hunyuandit',
246
+ pipe_init_kwargs={
247
+ "torch_dtype": torch.float16,
248
+ },
249
+ base_resolution=1024,
250
+ pipeline_type='t2i',
251
+ generation_kwargs={
252
+ }
253
+ )
254
+
255
+ # API models
256
+ # Fal.ai
257
+ flux_pro_v1_1_ultr_pipe = PipelineParam(
258
+ pipeline_name='fal-ai/flux-pro/v1.1-ultra',
259
+ generation_path=f'generation/flux_pro_v1_1_ultra',
260
+ base_resolution=1024,
261
+ generation_kwargs={
262
+ "enable_safety_checker": False,
263
+ "num_images": 1,
264
+ # "aspect_ratio": "1:1",
265
+ "output_format": "jpeg",
266
+ "safety_tolerance": 5,
267
+ }
268
+ )
269
+
270
+ recraftv3_pipe = PipelineParam(
271
+ pipeline_name='fal-ai/recraft-v3',
272
+ generation_path=f'generation/recraftv3',
273
+ base_resolution=1024,
274
+ generation_kwargs={
275
+ "enable_safety_checker": False,
276
+ "num_images": 1,
277
+ # "aspect_ratio": "1:1",
278
+ "output_format": "jpeg",
279
+ "safety_tolerance": 5,
280
+ }
281
+ )
282
+
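Note: downstream code consumes these entries field by field. A minimal usage sketch, assuming the repo root is on PYTHONPATH (init_multiple_pipelines is defined in generate/utils/utils.py just below):

    # Hypothetical standalone use of a PipelineParam entry.
    from generate.utils.pipelines import sd_xl_pipe
    from generate.utils.utils import init_multiple_pipelines

    # One pipeline instance per CUDA device.
    pipes = init_multiple_pipelines(sd_xl_pipe.pipeline_name, sd_xl_pipe.pipe_init_kwargs, num_devices=2)
    images = pipes[0](
        prompt="a photo of an astronaut riding a horse",
        **sd_xl_pipe.generation_kwargs,  # guidance_scale=5, num_inference_steps=50
    ).images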
generate/utils/utils.py ADDED
@@ -0,0 +1,58 @@
+ import torch
+ try:
+     import fal_client
+ except ImportError:
+     fal_client = None
+
+ from diffusers import AutoPipelineForText2Image, HunyuanVideoPipeline, DiffusionPipeline
+ import json
+ import diffusers
+ import os
+ # export FAL_KEY="YOUR_API_KEY"
+ os.environ['FAL_KEY'] = 'YOUR_API_KEY'
+
+ def init_multiple_pipelines(pipe_name, pipe_init_kwargs, num_devices, device_id=None):
+     pipelines = []
+
+     if device_id is not None:
+         assert num_devices == 1
+
+     for i in range(num_devices):
+         actual_device_id = device_id if device_id is not None else i
+         try:
+             pipeline = AutoPipelineForText2Image.from_pretrained(pipe_name, **pipe_init_kwargs).to(f'cuda:{actual_device_id}')
+         except Exception:
+             # Fall back to the concrete pipeline class named in model_index.json
+             config = json.load(open(os.path.join(pipe_name, 'model_index.json')))
+             class_name_str = config['_class_name']
+             pipeline_class = getattr(diffusers, class_name_str)
+             pipeline = pipeline_class.from_pretrained(pipe_name, **pipe_init_kwargs).to(f'cuda:{actual_device_id}')
+         pipelines.append(pipeline)
+     return pipelines
+
+
+ def init_pipeline_from_names(pipe_names, weight_dtype):
+     pipelines_dict = {}
+     for name in pipe_names:
+         pipeline = AutoPipelineForText2Image.from_pretrained(name, torch_dtype=weight_dtype)
+         pipelines_dict[name] = pipeline
+     return pipelines_dict
+
+
+ def on_queue_update(update):
+     if isinstance(update, fal_client.InProgress):
+         for log in update.logs:
+             print(log["message"])
+
+ def gen_with_api(pipe_names, generation_kwargs):
+     result = fal_client.subscribe(
+         pipe_names,
+         arguments=generation_kwargs,
+         with_logs=True,
+         on_queue_update=on_queue_update,
+     )
+     return result
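For the Fal.ai entries at the bottom of pipelines.py, generation goes through gen_with_api instead of a local pipeline. A sketch, assuming a real key is exported as FAL_KEY rather than the placeholder above:

    # Hypothetical API-backed generation via fal_client.
    from generate.utils.utils import gen_with_api

    result = gen_with_api(
        'fal-ai/recraft-v3',
        {"prompt": "a watercolor fox in a snowy forest", "num_images": 1, "output_format": "jpeg"},
    )
    print(result)  # a dict describing the generated image(s); the exact shape depends on the endpoint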
hpsv3/__init__.py ADDED
@@ -0,0 +1 @@
+ from .inference import HPSv3RewardInferencer
hpsv3/cohp/__init__.py ADDED
File without changes
hpsv3/cohp/cohp_all.py ADDED
@@ -0,0 +1,290 @@
+ import os
+ import json
+ import gc
+ import random
+ import argparse
+ import torch
+ from generator import Generator
+ from utils_cohp.pipelines import *
+ from utils_cohp.image2image_pipeline import Image2ImagePipeline
+ from hpsv3.inference import HPSv3RewardInferencer
+ from hpsv2.src.open_clip import create_model_and_transforms, get_tokenizer
+ import ImageReward as RM
+ from PIL import Image
+ from transformers import AutoProcessor, AutoModel
+
+ def initialize_model(device, cp):
+     model_dict = {}
+     model, preprocess_train, preprocess_val = create_model_and_transforms(
+         'ViT-H-14',
+         'laion2B-s32B-b79K',
+         precision='amp',
+         device=device,
+         jit=False,
+         force_quick_gelu=False,
+         force_custom_text=False,
+         force_patch_dropout=False,
+         force_image_size=None,
+         pretrained_image=False,
+         image_mean=None,
+         image_std=None,
+         light_augmentation=True,
+         aug_cfg={},
+         output_dict=True,
+         with_score_predictor=False,
+         with_region_predictor=False
+     )
+
+     checkpoint = torch.load(cp, map_location=device, weights_only=False)
+     model.load_state_dict(checkpoint['state_dict'])
+     model = model.to(device)
+     model.eval()
+     tokenizer = get_tokenizer('ViT-H-14')
+
+     model_dict['model'] = model
+     model_dict['preprocess_val'] = preprocess_val
+     return model_dict, tokenizer
+
+ def score_hpsv2_batch(model_dict, tokenizer, device, img_paths: list, prompts: list) -> list:
+     model = model_dict['model']
+     preprocess_val = model_dict['preprocess_val']
+     # Preprocess the images as a batch
+     images = [preprocess_val(Image.open(p)).unsqueeze(0) for p in img_paths]
+     images = torch.cat(images, dim=0).to(device=device)
+     texts = tokenizer(prompts).to(device=device)
+     with torch.no_grad():
+         outputs = model(images, texts)
+         image_features, text_features = outputs["image_features"], outputs["text_features"]
+         logits_per_image = image_features @ text_features.T
+         hps_scores = torch.diagonal(logits_per_image).cpu()
+     return hps_scores
+
+ def pickscore_calc_probs(model, processor_pickscore, prompt, images, device):
+     # preprocess
+     image_inputs = processor_pickscore(
+         images=images,
+         padding=True,
+         truncation=True,
+         max_length=77,
+         return_tensors="pt",
+     ).to(device)
+
+     text_inputs = processor_pickscore(
+         text=prompt,
+         padding=True,
+         truncation=True,
+         max_length=77,
+         return_tensors="pt",
+     ).to(device)
+
+     with torch.no_grad():
+         # embed
+         image_embs = model.get_image_features(**image_inputs)
+         image_embs = image_embs / torch.norm(image_embs, dim=-1, keepdim=True)
+
+         text_embs = model.get_text_features(**text_inputs)
+         text_embs = text_embs / torch.norm(text_embs, dim=-1, keepdim=True)
+
+         # score
+         scores = text_embs @ image_embs.T
+
+     return scores
+
+ def generate_images(reward_type, prompt, index, pipeline_params, di_pipeline, inferencer, out_dir='cohp_output', num_rounds=5, strength=0.8, device='cuda:1'):
+     os.makedirs(out_dir, exist_ok=True)
+     os.makedirs(os.path.join(out_dir, 'result_json'), exist_ok=True)
+     batch_size = 2  # images generated per pipeline per round
+
+     results = []  # final result for each prompt
+
+     info_dict = {
+         'caption': prompt,
+         'width': 1024,
+         'height': 1024,
+         'aspect_ratio': 1,
+         'save_name': f"{index}_origin",
+     }
+     di_score_pipelines = {}  # average score per pipeline
+
+     # Intermediate records: image paths and scores for every round
+     intermediate_results_sample_preference = []
+     intermediate_results_model_preference = []
+
+     # Iterate over the candidate pipelines
+     for pipeline_param in pipeline_params:
+         name = di_pipeline[pipeline_param]
+         generator = Generator(
+             device=device,
+             pipe_name=pipeline_param.pipeline_name,
+             pipe_type=pipeline_param.pipeline_type,
+             pipe_init_kwargs=pipeline_param.pipe_init_kwargs,
+         )
+         image_paths = generator.generate_imgs(
+             info_dict=info_dict,
+             generation_path=os.path.join(out_dir, pipeline_param.generation_path),
+             batch_size=batch_size,
+             device=device,
+             seed=random.randint(0, 75859066837),
+             weight_dtype=pipeline_param.pipe_init_kwargs["torch_dtype"],
+             generation_kwargs=pipeline_param.generation_kwargs,
+         )
+
+         # Score the generated images
+         score_list = []
+         for image_path in image_paths:
+             if reward_type == 'hpsv2':
+                 score = score_hpsv2_batch(model_dict, tokenizer, device, [image_path], [prompt])
+                 score = score.item()
+             elif reward_type == 'hpsv3':
+                 score = inferencer.reward([image_path], [prompt]).cpu().detach()
+                 score = score[0][0].item()
+             elif reward_type == 'imagereward':
+                 score = inferencer.score(prompt, [image_path])
+             elif reward_type == 'pickscore':
+                 score = pickscore_calc_probs(inferencer, processor_pickscore, prompt, [Image.open(image_path)], device)[0][0].item()
+                 print(f"PickScore for {image_path}: {score}")
+             else:
+                 raise ValueError("Unsupported reward type. Choose 'hpsv2', 'hpsv3', 'pickscore', or 'imagereward'.")
+             score_list.append(score)
+
+         average = sum(score_list) / len(score_list)
+         di_score_pipelines[name] = average
+         # Record this pipeline's image paths and scores
+         intermediate_results_model_preference.append({
+             'pipeline': name,
+             'image_paths': image_paths,  # all generated image paths
+             'scores': score_list,  # scores for this round
+             'max_image_path': image_paths[score_list.index(max(score_list))],  # best image this round
+             'max_score': max(score_list)  # best score this round
+         })
+
+         # Release generator resources
+         generator.pipelines.to("cpu")
+         del generator
+         torch.cuda.empty_cache()
+         gc.collect()
+
+     # Pick the highest-scoring pipeline and its best image
+     max_key = max(di_score_pipelines, key=di_score_pipelines.get)
+     max_index = score_list.index(max(score_list))
+     image_path_chosen = image_paths[max_index]  # best image from the first stage
+
+     # Multi-round refinement loop
+     for round_num in range(num_rounds):
+         if round_num in (3, 4):
+             strength = 0.5
+         i2ipipeline = Image2ImagePipeline(max_key)
+         images = i2ipipeline.generate_image(
+             prompt=prompt,
+             image_path=image_path_chosen,
+             strength=strength,
+             batch_size=4,
+             save_prefix=f'{index}_{max_key}_image2image_round{round_num + 1}',
+             output_dir=out_dir
+         )
+
+         score_list = []
+         for image_path in images:
+             if reward_type == 'hpsv2':
+                 score = score_hpsv2_batch(model_dict, tokenizer, device, [image_path], [prompt])
+                 score = score.item()
+             elif reward_type == 'hpsv3':
+                 score = inferencer.reward([image_path], [prompt]).cpu().detach()
+                 score = score[0][0].item()
+             elif reward_type == 'imagereward':
+                 score = inferencer.score(prompt, [image_path])
+             elif reward_type == 'pickscore':
+                 score = pickscore_calc_probs(inferencer, processor_pickscore, prompt, [Image.open(image_path)], device)[0][0].item()
+                 print(f"PickScore for {image_path}: {score}")
+             else:
+                 raise ValueError("Unsupported reward type. Choose 'hpsv2', 'hpsv3', 'pickscore', or 'imagereward'.")
+             score_list.append(score)
+
+         intermediate_results_sample_preference.append({
+             'round': round_num + 1,
+             'image_paths': images,  # all generated image paths
+             'scores': score_list,  # scores for this round
+             'max_image_path': images[score_list.index(max(score_list))],  # best image this round
+             'max_score': max(score_list)  # best score this round
+         })
+
+         # Update the chosen image
+         max_index = score_list.index(max(score_list))
+         image_path_chosen = images[max_index]
+
+     # Save the final results
+     results.append({
+         'prompt': prompt,
+         'model_preference_image_chosen': image_path_chosen,
+         'model_preference_info': intermediate_results_model_preference,  # all intermediate results
+         'best_image_path': image_path_chosen,
+         'best_model': max_key,
+         'score': max(score_list),
+         'sample_preference_intermediate_results': intermediate_results_sample_preference,  # all intermediate results
+     })
+     with open(os.path.join(out_dir, 'result_json', f'{index}.json'), 'w', encoding='utf-8') as f:
+         json.dump(results, f, ensure_ascii=False, indent=4)
+
+     return results
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Image Generation Script")
+     parser.add_argument('--prompt', type=str, required=True, help='The prompt for image generation')
+     parser.add_argument('--index', type=str, required=True, help='Index for saving results')
+     parser.add_argument('--device', type=str, default='cuda:1', help='Device to run the model on')
+     parser.add_argument('--reward_model', type=str, default='hpsv3', help='Reward model to use (hpsv2, hpsv3, pickscore, or imagereward)')
+
+     args = parser.parse_args()
+     output_dir = f"cohp_output_{args.reward_model}"
+
+     os.makedirs(output_dir, exist_ok=True)
+     if args.reward_model == 'hpsv2':
+         model_dict, tokenizer = initialize_model(args.device, 'pretrained_models/HPS_v2.1_compressed.pt')
+         inferencer = model_dict  # the hpsv2 path reads model_dict/tokenizer directly
+     elif args.reward_model == 'hpsv3':
+         dtype = torch.bfloat16
+         inferencer = HPSv3RewardInferencer(device=args.device, dtype=dtype)
+     elif args.reward_model == 'imagereward':
+         inferencer = RM.load("ImageReward-v1.0").to(args.device)
+     elif args.reward_model == 'pickscore':
+         processor_name_or_path = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
+         model_pretrained_name_or_path = "yuvalkirstain/PickScore_v1"
+         processor_pickscore = AutoProcessor.from_pretrained(processor_name_or_path)
+         inferencer = AutoModel.from_pretrained(model_pretrained_name_or_path).eval().to(args.device)
+     else:
+         raise ValueError("Unsupported reward model. Choose 'hpsv2', 'hpsv3', 'pickscore', or 'imagereward'.")
+     pipeline_params = [
+         flux_dev_pipe,
+         kolors_pipe,
+         sd3_medium_pipe,
+         playground_v2_5_pipe
+     ]
+
+     di_pipeline = {
+         flux_dev_pipe: 'flux',
+         kolors_pipe: 'kolors',
+         sd3_medium_pipe: 'sd3',
+         playground_v2_5_pipe: 'playground_v2_5'
+     }
+
+     results = generate_images(
+         args.reward_model,
+         args.prompt,
+         args.index,
+         pipeline_params,
+         di_pipeline,
+         inferencer,
+         out_dir=output_dir,
+         num_rounds=4,
+         strength=0.8,
+         device=args.device)
hpsv3/cohp/generator.py ADDED
@@ -0,0 +1,64 @@
+ import torch
+ import os
+ import inspect
+ from PIL import Image
+ from utils_cohp.utils import init_pipelines
+
+ Image.MAX_IMAGE_PIXELS = None
+
+
+ class Generator:
+     def __init__(
+         self, pipe_name, pipe_type, pipe_init_kwargs, device=None
+     ):
+         self.pipe_names = pipe_name
+         self.pipe_type = pipe_type
+         self.pipe_init_kwargs = pipe_init_kwargs
+         self.pipelines = init_pipelines(
+             pipe_name, pipe_init_kwargs, device
+         )
+
+     def generate_imgs(
+         self,
+         batch_size,
+         generation_path,
+         info_dict,
+         device,
+         weight_dtype,
+         seed,
+         generation_kwargs,
+     ):
+         torch.cuda.set_device(device)
+         device = torch.device(device)
+         generator = torch.Generator().manual_seed(seed)
+
+         # Drop kwargs the pipeline's __call__ signature does not accept
+         pipeline_signature = inspect.signature(self.pipelines)
+         pipeline_params = pipeline_signature.parameters.keys()
+
+         if 'height' not in pipeline_params:
+             generation_kwargs.pop('height', None)
+             print("Warning: Pipeline does not support 'height' parameter, removing from kwargs")
+         if 'width' not in pipeline_params:
+             generation_kwargs.pop('width', None)
+             print("Warning: Pipeline does not support 'width' parameter, removing from kwargs")
+
+         outputs = self.pipelines(
+             prompt=info_dict['caption'], generator=generator, num_images_per_prompt=batch_size, **generation_kwargs
+         )
+         if self.pipe_type == "t2i":
+             images = outputs.images
+         elif self.pipe_type == "t2v":
+             images = outputs.frames[0]
+         image_paths = []
+         for idx, image in enumerate(images):
+             img_path = os.path.join(
+                 generation_path, info_dict["save_name"] + f"_{idx}.png"
+             )
+             os.makedirs(generation_path, exist_ok=True)
+             image.save(img_path)
+             image_paths.append(img_path)
+         return image_paths
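A sketch of driving this Generator directly, outside the CoHP loop (pipeline entries come from utils_cohp/pipelines.py; the paths are illustrative):

    import torch
    from utils_cohp.pipelines import sd3_medium_pipe

    gen = Generator(
        pipe_name=sd3_medium_pipe.pipeline_name,
        pipe_type=sd3_medium_pipe.pipeline_type,
        pipe_init_kwargs=sd3_medium_pipe.pipe_init_kwargs,
        device='cuda:0',
    )
    paths = gen.generate_imgs(
        batch_size=2,
        generation_path='cohp_output/generation/sd3_medium',
        info_dict={'caption': 'a red bicycle leaning on a brick wall', 'save_name': '0_origin'},
        device='cuda:0',
        weight_dtype=torch.float16,
        seed=42,
        generation_kwargs=sd3_medium_pipe.generation_kwargs,
    )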
hpsv3/cohp/run_cohp.py ADDED
@@ -0,0 +1,244 @@
+ import os
+ import json
+ import random
+ import gc
+ import argparse
+ import torch
+ from PIL import Image
+ from transformers import AutoProcessor, AutoModel
+
+ from generator import Generator
+ from hpsv3.inference import HPSv3RewardInferencer
+ from hpsv3.cohp.utils_cohp.pipelines import *
+ from hpsv3.cohp.utils_cohp.image2image_pipeline import Image2ImagePipeline
+
+ try:
+     from hpsv2.src.open_clip import create_model_and_transforms, get_tokenizer
+ except ImportError:
+     print("HPSv2 model not found, skipping HPSv2 related imports.")
+
+ try:
+     import ImageReward as RM
+ except ImportError:
+     print("ImageReward module not found, skipping ImageReward related imports.")
+
+
+ def initialize_hpsv2_model(device, checkpoint_path):
+     model_dict = {}
+     model, _, preprocess_val = create_model_and_transforms(
+         'ViT-H-14',
+         'laion2B-s32B-b79K',
+         device=device,
+         precision='amp',
+         pretrained_image=False,
+         output_dict=True,
+     )
+     checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
+     model.load_state_dict(checkpoint['state_dict'])
+     model = model.to(device).eval()
+     tokenizer = get_tokenizer('ViT-H-14')
+
+     model_dict['model'] = model
+     model_dict['preprocess_val'] = preprocess_val
+     return model_dict, tokenizer
+
+
+ def score_hpsv2(model_dict, tokenizer, device, img_paths, prompts):
+     model = model_dict['model']
+     preprocess_val = model_dict['preprocess_val']
+     images = [preprocess_val(Image.open(p)).unsqueeze(0) for p in img_paths]
+     images = torch.cat(images, dim=0).to(device)
+     texts = tokenizer(prompts).to(device)
+
+     with torch.no_grad():
+         outputs = model(images, texts)
+         image_features, text_features = outputs["image_features"], outputs["text_features"]
+         logits_per_image = image_features @ text_features.T
+         hps_scores = torch.diagonal(logits_per_image).cpu()
+     return hps_scores
+
+
+ def calculate_pickscore_probs(model, processor, prompt, images, device):
+     image_inputs = processor(images=images, padding=True, return_tensors="pt").to(device)
+     text_inputs = processor(text=prompt, padding=True, return_tensors="pt").to(device)
+
+     with torch.no_grad():
+         image_embs = model.get_image_features(**image_inputs)
+         image_embs /= torch.norm(image_embs, dim=-1, keepdim=True)
+
+         text_embs = model.get_text_features(**text_inputs)
+         text_embs /= torch.norm(text_embs, dim=-1, keepdim=True)
+
+         scores = text_embs @ image_embs.T
+     return scores
+
+
+ def generate_images(
+     reward_type, prompt, index, pipeline_params, pipelines_mapping, inferencer,
+     output_dir='cohp_output', num_rounds=5, strength=0.8, device='cuda:1'
+ ):
+     os.makedirs(output_dir, exist_ok=True)
+     result_json_dir = os.path.join(output_dir, 'result_json')
+     os.makedirs(result_json_dir, exist_ok=True)
+
+     info_dict = {
+         'caption': prompt,
+         'width': 1024,
+         'height': 1024,
+         'aspect_ratio': 1,
+         'save_name': f"{index}_origin",
+     }
+     di_score_pipelines = {}
+     intermediate_results_model_pref = {}
+     intermediate_results_sample_pref = {}
+     max_final_score = 0
+
+     for pipeline_param in pipeline_params:
+         generator = Generator(
+             device=device,
+             pipe_name=pipeline_param.pipeline_name,
+             pipe_type=pipeline_param.pipeline_type,
+             pipe_init_kwargs=pipeline_param.pipe_init_kwargs,
+         )
+         image_paths = generator.generate_imgs(
+             info_dict=info_dict,
+             generation_path=os.path.join(output_dir, pipeline_param.generation_path),
+             batch_size=2,
+             device=device,
+             seed=random.randint(0, 75859066837),
+             weight_dtype=pipeline_param.pipe_init_kwargs["torch_dtype"],
+             generation_kwargs=pipeline_param.generation_kwargs,
+         )
+
+         score_list = []
+         for image_path in image_paths:
+             if reward_type == 'hpsv2':
+                 score = score_hpsv2(model_dict, tokenizer, device, [image_path], [prompt]).item()
+             elif reward_type == 'hpsv3':
+                 score = inferencer.reward([image_path], [prompt]).cpu().detach()[0][0].item()
+             elif reward_type == 'imagereward':
+                 score = inferencer.score(prompt, [image_path])
+             elif reward_type == 'pickscore':
+                 score = calculate_pickscore_probs(inferencer, processor_pickscore, prompt, [Image.open(image_path)], device)[0][0].item()
+             else:
+                 raise ValueError(f"Unsupported reward type: {reward_type}")
+             score_list.append(score)
+
+         average_score = sum(score_list) / len(score_list)
+         pipeline_name = pipelines_mapping[pipeline_param]
+         di_score_pipelines[pipeline_name] = average_score
+
+         intermediate_results_model_pref[pipeline_name] = {
+             'image_paths': image_paths,
+             'scores': score_list,
+             'max_image_path': image_paths[score_list.index(max(score_list))],
+             'max_score': max(score_list),
+         }
+         generator.pipelines.to("cpu")
+         del generator
+         torch.cuda.empty_cache()
+         gc.collect()
+
+     # Select the best pipeline based on scores
+     best_pipeline = max(di_score_pipelines, key=di_score_pipelines.get)
+     best_pipeline_results = intermediate_results_model_pref[best_pipeline]
+     chosen_image_path = best_pipeline_results['max_image_path']
+
+     # Refinement with Image2ImagePipeline
+     i2ipipeline = Image2ImagePipeline(best_pipeline)
+     for round_num in range(num_rounds):
+         if round_num in [3, 4]:
+             strength = 0.5
+         images = i2ipipeline.generate_image(
+             prompt=prompt,
+             image_path=chosen_image_path,
+             strength=strength,
+             batch_size=4,
+             save_prefix=f'{index}_{best_pipeline}_image2image_round{round_num + 1}',
+             output_dir=output_dir,
+         )
+
+         score_list = []
+         for image_path in images:
+             if reward_type == 'hpsv2':
+                 score = score_hpsv2(model_dict, tokenizer, device, [image_path], [prompt]).item()
+             elif reward_type == 'hpsv3':
+                 score = inferencer.reward([image_path], [prompt]).cpu().detach()[0][0].item()
+             elif reward_type == 'imagereward':
+                 score = inferencer.score(prompt, [image_path])
+             elif reward_type == 'pickscore':
+                 score = calculate_pickscore_probs(inferencer, processor_pickscore, prompt, [Image.open(image_path)], device)[0][0].item()
+             else:
+                 raise ValueError(f"Unsupported reward type: {reward_type}")
+             score_list.append(score)
+
+         # Update intermediate results
+         intermediate_results_sample_pref[round_num + 1] = {
+             'image_paths': images,
+             'scores': score_list,
+             'max_image_path': images[score_list.index(max(score_list))],
+             'max_score': max(score_list),
+         }
+
+         # Determine best image during refinement
+         if max(score_list) > max_final_score:
+             max_final_score = max(score_list)
+             chosen_image_path = images[score_list.index(max(score_list))]
+
+     # Save final results
+     results = {
+         'prompt': prompt,
+         'best_model': best_pipeline,
+         'final_image_path': chosen_image_path,
+         'model_preference_info': intermediate_results_model_pref,
+         'sample_preference_intermediate_results': intermediate_results_sample_pref,
+     }
+     with open(os.path.join(result_json_dir, f'{index}.json'), 'w', encoding='utf-8') as file:
+         json.dump(results, file, ensure_ascii=False, indent=4)
+     return results
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Image Generation Script")
+     parser.add_argument('--prompt', type=str, required=True, help='The prompt for image generation')
+     parser.add_argument('--index', type=str, required=True, help='Index for saving results')
+     parser.add_argument('--device', type=str, default='cuda:1', help='Device to run the model on')
+     parser.add_argument('--reward_model', type=str, default='hpsv3', help='Reward model to use (hpsv2, hpsv3, pickscore, or imagereward)')
+     args = parser.parse_args()
+
+     # Initialize models and pipelines
+     output_dir = f"cohp_output_{args.reward_model}"
+     if args.reward_model == 'hpsv2':
+         model_dict, tokenizer = initialize_hpsv2_model(args.device, 'pretrained_models/HPS_v2.1_compressed.pt')
+         inferencer = model_dict
+     elif args.reward_model == 'hpsv3':
+         inferencer = HPSv3RewardInferencer(device=args.device)
+     elif args.reward_model == 'imagereward':
+         inferencer = RM.load("ImageReward-v1.0").to(args.device)
+     elif args.reward_model == 'pickscore':
+         processor_pickscore = AutoProcessor.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
+         inferencer = AutoModel.from_pretrained("yuvalkirstain/PickScore_v1").eval().to(args.device)
+     else:
+         raise ValueError("Unsupported reward model.")
+
+     # Define pipelines
+     pipeline_params = [kolors_pipe, sd3_medium_pipe, playground_v2_5_pipe, flux_dev_pipe]
+     pipelines_mapping = {
+         flux_dev_pipe: 'flux',
+         kolors_pipe: 'kolors',
+         sd3_medium_pipe: 'sd3',
+         playground_v2_5_pipe: 'playground_v2_5',
+     }
+
+     # Generate images
+     results = generate_images(
+         reward_type=args.reward_model,
+         prompt=args.prompt,
+         index=args.index,
+         pipeline_params=pipeline_params,
+         pipelines_mapping=pipelines_mapping,
+         inferencer=inferencer,
+         output_dir=output_dir,
+         num_rounds=4,
+     )
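Typical invocation, assuming the checkpoints referenced in utils_cohp/pipelines.py have been downloaded under pretrained_models/:

    python hpsv3/cohp/run_cohp.py --prompt "a cozy cabin under the northern lights" --index 0 --device cuda:0 --reward_model hpsv3

The per-prompt summary is then written to cohp_output_hpsv3/result_json/0.json.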
hpsv3/cohp/utils_cohp/__init__.py ADDED
File without changes
hpsv3/cohp/utils_cohp/image2image_pipeline.py ADDED
@@ -0,0 +1,65 @@
+ from diffusers import FluxImg2ImgPipeline, KolorsImg2ImgPipeline, StableDiffusion3Img2ImgPipeline, StableDiffusionXLImg2ImgPipeline
+ from diffusers.utils import load_image
+ import torch
+ import os
+
+
+ class Image2ImagePipeline:
+     def __init__(
+         self, pipe_name, device='cuda'
+     ):
+         self.pipe_name = pipe_name
+         if self.pipe_name == 'flux':
+             self.pipeline = FluxImg2ImgPipeline.from_pretrained("pretrained_models/FLUX.1-dev", torch_dtype=torch.bfloat16).to(device)
+             self.generation_path = 'generation/flux_dev'
+         elif self.pipe_name == 'kolors':
+             self.pipeline = KolorsImg2ImgPipeline.from_pretrained("pretrained_models/kolors", torch_dtype=torch.bfloat16).to(device)
+             self.generation_path = 'generation/kolors'
+         elif self.pipe_name == 'sd3':
+             self.pipeline = StableDiffusion3Img2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-3.5-medium", torch_dtype=torch.bfloat16).to(device)
+             self.generation_path = 'generation/sd3_medium'
+         elif self.pipe_name == 'playground_v2_5':
+             self.pipeline = StableDiffusionXLImg2ImgPipeline.from_pretrained("pretrained_models/playground-v2.5-1024px-aesthetic", torch_dtype=torch.bfloat16).to(device)
+             self.generation_path = 'generation/playground_v_2_5'
+         self.pipeline = self.pipeline.to(torch.bfloat16)
+
+     def generate_image(
+         self,
+         prompt,
+         image_path,
+         strength,
+         batch_size,
+         save_prefix,
+         output_dir
+     ):
+         image_load = load_image(image_path)
+         if self.pipe_name == 'flux':
+             # The Flux branch omits the negative prompt
+             images = self.pipeline(
+                 prompt=prompt,
+                 image=image_load,
+                 num_images_per_prompt=batch_size,
+                 strength=strength).images
+         else:
+             images = self.pipeline(
+                 prompt=prompt,
+                 negative_prompt='',
+                 image=image_load,
+                 num_images_per_prompt=batch_size,
+                 strength=strength).images
+         image_list = []
+         save_dir = os.path.join(output_dir, self.generation_path)
+         os.makedirs(save_dir, exist_ok=True)
+         for ind, img in enumerate(images):
+             save_path = os.path.join(save_dir, save_prefix + f'_{ind}.png')
+             image_list.append(save_path)
+             img.save(save_path)
+         return image_list
hpsv3/cohp/utils_cohp/pipelines.py ADDED
@@ -0,0 +1,290 @@
+ import torch
+
+
+ class PipelineParam:
+     pipeline_name: str
+     pipeline_type: str
+     generation_path: str
+     pipe_init_kwargs: dict
+     generation_kwargs: dict
+     base_resolution: int
+     force_aspect_ratio: int
+
+     def __init__(self, pipeline_name: str, generation_path: str, pipeline_type='t2i',
+                  pipe_init_kwargs: dict = None, generation_kwargs: dict = None,
+                  base_resolution: int = 1024, force_aspect_ratio: int = None):
+         self.pipeline_name = pipeline_name
+         self.pipeline_type = pipeline_type
+         self.generation_path = generation_path
+         self.pipe_init_kwargs = pipe_init_kwargs if pipe_init_kwargs is not None else {}
+         self.generation_kwargs = generation_kwargs if generation_kwargs is not None else {}
+         self.base_resolution = base_resolution
+         self.force_aspect_ratio = force_aspect_ratio
+
+ flux_dev_pipe = PipelineParam(
+     pipeline_name='pretrained_models/FLUX.1-dev',
+     generation_path='generation/flux_dev',
+     pipe_init_kwargs={
+         "torch_dtype": torch.bfloat16,
+     },
+     base_resolution=1024,
+     generation_kwargs={
+         "guidance_scale": 3.5,
+         "num_inference_steps": 28,
+         "max_sequence_length": 512,
+     }
+ )
+
+ flux_schnell_pipe = PipelineParam(
+     pipeline_name='pretrained_models/FLUX.1-schnell',
+     generation_path='generation/flux_schnell',
+     pipe_init_kwargs={
+         "torch_dtype": torch.bfloat16,
+     },
+     base_resolution=1024,
+     generation_kwargs={
+         "guidance_scale": 3.5,
+         "num_inference_steps": 4,
+     }
+ )
+
+ sd3_medium_pipe = PipelineParam(
+     pipeline_name='pretrained_models/stable-diffusion-3-medium-diffusers',
+     generation_path='generation/sd3_medium',
+     pipe_init_kwargs={
+         "torch_dtype": torch.float16,
+     },
+     base_resolution=1024,
+     generation_kwargs={
+         "guidance_scale": 7.0,
+         "num_inference_steps": 28,
+     }
+ )
+
+ sd_xl_pipe = PipelineParam(
+     pipeline_name='pretrained_models/stable-diffusion-xl-base-1.0',
+     generation_path='generation/sd_xl',
+     pipe_init_kwargs={
+         "torch_dtype": torch.float16,
+     },
+     base_resolution=1024,
+     generation_kwargs={
+         "guidance_scale": 5,
+         "num_inference_steps": 50,
+     }
+ )
+
+ sd_1_5_pipe = PipelineParam(
+     pipeline_name='pretrained_models/stable-diffusion-v1-5',
+     generation_path='generation/sd_1_5',
+     pipe_init_kwargs={
+         "torch_dtype": torch.float16,
+     },
+     base_resolution=512,
+     generation_kwargs={}
+ )
+
+ vq_diffusion_pipe = PipelineParam(
+     pipeline_name='pretrained_models/vq-diffusion-ithq',
+     generation_path='generation/vq_diffusion',
+     pipe_init_kwargs={
+         "torch_dtype": torch.float16,
+     },
+     base_resolution=256,
+     generation_kwargs={}
+ )
+
+ sd_2_pipe = PipelineParam(
+     pipeline_name='pretrained_models/stable-diffusion-2',
+     generation_path='generation/sd_2',
+     pipe_init_kwargs={
+         "torch_dtype": torch.float16,
+     },
+     base_resolution=512,
+     force_aspect_ratio=1,
+ )
+
+ sd_1_1_pipe = PipelineParam(
+     pipeline_name='pretrained_models/stable-diffusion-v1-1',
+     generation_path='generation/sd_1_1',
+     pipe_init_kwargs={"torch_dtype": torch.float16},
+     base_resolution=512,
+     force_aspect_ratio=1,
+ )
+
+ sd_1_4_pipe = PipelineParam(
+     pipeline_name='pretrained_models/stable-diffusion-v1-4',
+     generation_path='generation/sd_1_4',
+     pipe_init_kwargs={
+         "torch_dtype": torch.float16,
+     },
+     base_resolution=512,
+     force_aspect_ratio=1,
+ )
+
+ sd_2_1_pipe = PipelineParam(
+     pipeline_name='pretrained_models/stable-diffusion-2-1-base',
+     generation_path='generation/sd_2_1',
+     pipe_init_kwargs={
+         "torch_dtype": torch.float16,
+     },
+     base_resolution=512,
+     force_aspect_ratio=1,
+ )
+
+ openjourney_pipe = PipelineParam(
+     pipeline_name='pretrained_models/openjourney',
+     generation_path='generation/openjourney',
+     pipe_init_kwargs={
+         "torch_dtype": torch.float16,
+     },
+     base_resolution=512,
+     force_aspect_ratio=1,
+ )
+
+ playground_v2_5_pipe = PipelineParam(
+     pipeline_name='pretrained_models/playground-v2.5-1024px-aesthetic',
+     generation_path='generation/playground_v_2_5',
+     pipe_init_kwargs={
+         "torch_dtype": torch.float16,
+     },
+     base_resolution=1024,
+ )
+
+ versatile_pipe = PipelineParam(
+     pipeline_name='pretrained_models/versatile-diffusion',
+     generation_path='generation/versatile',
+     pipe_init_kwargs={
+         "torch_dtype": torch.float16,
+     },
+     base_resolution=512,
+     force_aspect_ratio=1,
+ )
+
+ glide_pipe = PipelineParam(
+     pipeline_name='pretrained_models/glide-base',
+     generation_path='generation/glide',
+     pipe_init_kwargs={
+         "torch_dtype": torch.float16,
+     },
+     base_resolution=512,
+     force_aspect_ratio=1,
+ )
+
+ sd_3_5_medium_pipe = PipelineParam(
+     pipeline_name='stabilityai/stable-diffusion-3.5-medium',
+     generation_path='generation/sd_3_5_medium',
+     pipe_init_kwargs={
+         "torch_dtype": torch.bfloat16,
+     },
+     base_resolution=1024,
+     generation_kwargs={
+         "num_inference_steps": 40,
+         "guidance_scale": 4.5,
+     }
+ )
+
+ sd_3_5_large_pipe = PipelineParam(
+     pipeline_name='stabilityai/stable-diffusion-3.5-large',
+     generation_path='generation/sd_3_5_large',
+     pipe_init_kwargs={
+         "torch_dtype": torch.bfloat16,
+     },
+     base_resolution=1024,
+     generation_kwargs={
+         "num_inference_steps": 28,
+         "guidance_scale": 3.5,
+     }
+ )
+
+ kolors_pipe = PipelineParam(
+     pipeline_name='pretrained_models/Kolors-diffusers',
+     generation_path='generation/kolors',
+     pipe_init_kwargs={
+         "torch_dtype": torch.float16,
+         "variant": "fp16",
+     },
+     base_resolution=1024,
+     generation_kwargs={
+         "num_inference_steps": 50,
+         "guidance_scale": 5.0,
+     }
+ )
+
+ cogview4_pipe = PipelineParam(
+     pipeline_name='pretrained_models/CogView4-6B',
+     generation_path='generation/cogview4',
+     pipe_init_kwargs={
+         "torch_dtype": torch.bfloat16,
+     },
+     base_resolution=1024,
+     generation_kwargs={
+         "num_inference_steps": 50,
+         "guidance_scale": 3.5,
+     }
+ )
+
+ pixart_sigma_pipe = PipelineParam(
+     pipeline_name='pretrained_models/PixArt-Sigma-XL-2-1024-MS',
+     generation_path='generation/pixart_sigma',
+     pipeline_type='t2i',
+     pipe_init_kwargs={
+         "torch_dtype": torch.bfloat16,
+     },
+     base_resolution=1024,
+ )
+
+ hunyuanvideo_pipe = PipelineParam(
+     pipeline_name='pretrained_models/hunyuanvideo_diffusers',
+     generation_path='generation/hunyuanvideo',
+     pipe_init_kwargs={
+         "torch_dtype": torch.bfloat16,
+     },
+     base_resolution=1024,
+     pipeline_type='t2v',
+     generation_kwargs={
+         "num_inference_steps": 30,
+         "num_frames": 1,
+     }
+ )
+
+ hunyuandit_pipe = PipelineParam(
+     pipeline_name='pretrained_models/HunyuanDiT-v1.2-Diffusers',
+     generation_path='generation/hunyuandit',
+     pipe_init_kwargs={
+         "torch_dtype": torch.float16,
+     },
+     base_resolution=1024,
+     pipeline_type='t2i',
+     generation_kwargs={}
+ )
+
+ # API models
+ # Fal.ai
+ flux_pro_v1_1_ultr_pipe = PipelineParam(
+     pipeline_name='fal-ai/flux-pro/v1.1-ultra',
+     generation_path='generation/flux_pro_v1_1_ultra',
+     base_resolution=1024,
+     generation_kwargs={
+         "enable_safety_checker": False,
+         "num_images": 1,
+         # "aspect_ratio": "1:1",
+         "output_format": "jpeg",
+         "safety_tolerance": 5,
+     }
+ )
+
+ recraftv3_pipe = PipelineParam(
+     pipeline_name='fal-ai/recraft-v3',
+     generation_path='generation/recraftv3',
+     base_resolution=1024,
+     generation_kwargs={
+         "enable_safety_checker": False,
+         "num_images": 1,
+         # "aspect_ratio": "1:1",
+         "output_format": "jpeg",
+         "safety_tolerance": 5,
+     }
+ )
hpsv3/cohp/utils_cohp/utils.py ADDED
@@ -0,0 +1,53 @@
+ import torch
+ try:
+     import fal_client
+ except ImportError:
+     fal_client = None
+ try:
+     from diffusers import AutoPipelineForText2Image, DiffusionPipeline
+ except ImportError:
+     AutoPipelineForText2Image = None
+     DiffusionPipeline = None
+
+ import json
+ import diffusers
+ import os
+ # export FAL_KEY="YOUR_API_KEY"
+ os.environ['FAL_KEY'] = 'YOUR_API_KEY'
+
+ def init_pipelines(pipe_name, pipe_init_kwargs, device=None):
+     try:
+         pipeline = AutoPipelineForText2Image.from_pretrained(pipe_name, **pipe_init_kwargs).to(device)
+     except Exception:
+         # Fall back to the concrete pipeline class named in model_index.json
+         config = json.load(open(os.path.join(pipe_name, 'model_index.json')))
+         class_name_str = config['_class_name']
+         pipeline_class = getattr(diffusers, class_name_str)
+         pipeline = pipeline_class.from_pretrained(pipe_name, **pipe_init_kwargs).to(device)
+
+     return pipeline
+
+
+ def init_pipeline_from_names(pipe_names, weight_dtype):
+     pipelines_dict = {}
+     for name in pipe_names:
+         pipeline = AutoPipelineForText2Image.from_pretrained(name, torch_dtype=weight_dtype)
+         pipelines_dict[name] = pipeline
+     return pipelines_dict
+
+
+ def on_queue_update(update):
+     if isinstance(update, fal_client.InProgress):
+         for log in update.logs:
+             print(log["message"])
+
+ def gen_with_api(pipe_names, generation_kwargs):
+     result = fal_client.subscribe(
+         pipe_names,
+         arguments=generation_kwargs,
+         with_logs=True,
+         on_queue_update=on_queue_update,
+     )
+     return result
hpsv3/config/HPSv3_7B.yaml ADDED
@@ -0,0 +1,60 @@
+ # Model Configuration
+ rm_head_type: "ranknet"
+ lora_enable: False
+ vision_lora: False
+ freeze_vision_tower: False
+ freeze_llm: False
+ tune_merger: True
+ model_name_or_path: "Qwen/Qwen2-VL-7B-Instruct"
+ num_lora_modules: -1
+ lora_r: 512
+ lora_alpha: 1024
+ lora_namespan_exclude: ['lm_head', 'rm_head', 'embed_tokens']
+
+ # Data Configuration
+ confidence_threshold: 0.95
+ tied_threshold: null
+ max_pixels: 200704  # 256 * 28 * 28
+ min_pixels: 200704
+ with_instruction: true
+
+ train_json_list:
+   - example_train.json
+ test_json_list:
+   - ["Valid Set 1", ["example_set_1_part1.json", "example_set_1_part2.json"]]
+   - ["Valid Set 2", ["example_set_2_part1.json"]]
+
+ soft_label: False
+ output_dir: output_models
+ use_special_tokens: true
+ reward_token: "special"
+ output_dim: 2
+ loss_type: "uncertainty"
+
+ # Training Configuration
+ disable_flash_attn2: False
+ per_device_train_batch_size: 2
+ per_device_eval_batch_size: 8
+ gradient_accumulation_steps: 4
+ num_train_epochs: 10
+ learning_rate: 2.0e-6
+ special_token_lr: 2.0e-6
+ warmup_ratio: 0.05
+ lr_scheduler_type: "constant_with_warmup"
+ gradient_checkpointing: True
+ gradient_checkpointing_kwargs: {"use_reentrant": False}
+
+ # Evaluation and Logging
+ eval_strategy: "steps"
+ logging_epochs: 0.01
+ eval_epochs: 0.1
+ save_epochs: 0.1
+ report_to: tensorboard
+
+ # System Configuration
+ bf16: True
+ torch_dtype: "bfloat16"
+ deepspeed: hpsv3/config/ds_config/zero2.json
+ save_only_model: True
+ save_full_model: True
+ dataloader_num_workers: 8
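The training entry point itself is outside this diff, so treat the exact consumer as an assumption; the YAML maps directly onto keyword arguments and can be sanity-checked with a minimal loader:

    import yaml

    # Keys mirror HuggingFace TrainingArguments plus HPSv3-specific fields
    # such as rm_head_type, confidence_threshold, and reward_token.
    with open("hpsv3/config/HPSv3_7B.yaml") as f:
        cfg = yaml.safe_load(f)

    print(cfg["model_name_or_path"])  # Qwen/Qwen2-VL-7B-Instruct
    print(cfg["max_pixels"])          # 200704 == 256 * 28 * 28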
hpsv3/config/ds_config/zero0.json ADDED
@@ -0,0 +1,19 @@
+ {
+     "fp16": {
+         "enabled": "auto",
+         "loss_scale": 0,
+         "loss_scale_window": 1000,
+         "initial_scale_power": 16,
+         "hysteresis": 2,
+         "min_loss_scale": 1
+     },
+     "bf16": {
+         "enabled": "auto"
+     },
+     "train_micro_batch_size_per_gpu": "auto",
+     "train_batch_size": "auto",
+     "gradient_accumulation_steps": "auto",
+     "zero_optimization": {
+         "stage": 0
+     }
+ }
hpsv3/config/ds_config/zero2.json ADDED
@@ -0,0 +1,23 @@
+ {
+     "fp16": {
+         "enabled": "auto",
+         "loss_scale": 0,
+         "loss_scale_window": 1000,
+         "initial_scale_power": 16,
+         "hysteresis": 2,
+         "min_loss_scale": 1
+     },
+     "bf16": {
+         "enabled": "auto"
+     },
+     "train_micro_batch_size_per_gpu": "auto",
+     "train_batch_size": "auto",
+     "gradient_accumulation_steps": "auto",
+     "zero_optimization": {
+         "stage": 2,
+         "overlap_comm": true,
+         "contiguous_gradients": true,
+         "sub_group_size": 1e9,
+         "reduce_bucket_size": "auto"
+     }
+ }
hpsv3/config/ds_config/zero3.json ADDED
@@ -0,0 +1,28 @@
+ {
+     "fp16": {
+         "enabled": "auto",
+         "loss_scale": 0,
+         "loss_scale_window": 1000,
+         "initial_scale_power": 16,
+         "hysteresis": 2,
+         "min_loss_scale": 1
+     },
+     "bf16": {
+         "enabled": "auto"
+     },
+     "train_micro_batch_size_per_gpu": "auto",
+     "train_batch_size": "auto",
+     "gradient_accumulation_steps": "auto",
+     "zero_optimization": {
+         "stage": 3,
+         "overlap_comm": true,
+         "contiguous_gradients": true,
+         "sub_group_size": 1e9,
+         "reduce_bucket_size": "auto",
+         "stage3_prefetch_bucket_size": "auto",
+         "stage3_param_persistence_threshold": "auto",
+         "stage3_max_live_parameters": 1e9,
+         "stage3_max_reuse_distance": 1e9,
+         "stage3_gather_16bit_weights_on_model_save": true
+     }
+ }
hpsv3/dataset/data_collator_qwen.py ADDED
@@ -0,0 +1,205 @@
+ import numpy as np
+ import torch
+ from hpsv3.dataset.utils import process_vision_info
+
+ INSTRUCTION = """
+ You are tasked with evaluating a generated image based on Visual Quality and Text Alignment and giving an overall score to estimate the human preference. Please provide a rating from 0 to 10, with 0 being the worst and 10 being the best.
+
+ **Visual Quality:**
+ Evaluate the overall visual quality of the image. The following sub-dimensions should be considered:
+ - **Reasonableness:** The image should not contain any significant biological or logical errors, such as abnormal body structures or nonsensical environmental setups.
+ - **Clarity:** Evaluate the sharpness and visibility of the image. The image should be clear and easy to interpret, with no blurring or indistinct areas.
+ - **Detail Richness:** Consider the level of detail in textures, materials, lighting, and other visual elements (e.g., hair, clothing, shadows).
+ - **Aesthetic and Creativity:** Assess the artistic aspects of the image, including the color scheme, composition, atmosphere, depth of field, and the overall creative appeal. The scene should convey a sense of harmony and balance.
+ - **Safety:** The image should not contain harmful or inappropriate content, such as political, violent, or adult material. If such content is present, the image quality and satisfaction score should be the lowest possible.
+
+ **Text Alignment:**
+ Assess how well the image matches the textual prompt across the following sub-dimensions:
+ - **Subject Relevance:** Evaluate how accurately the subject(s) in the image (e.g., person, animal, object) align with the textual description. The subject should match the description in terms of number, appearance, and behavior.
+ - **Style Relevance:** If the prompt specifies a particular artistic or stylistic style, evaluate how well the image adheres to this style.
+ - **Contextual Consistency:** Assess whether the background, setting, and surrounding elements in the image logically fit the scenario described in the prompt. The environment should support and enhance the subject without contradictions.
+ - **Attribute Fidelity:** Check if specific attributes mentioned in the prompt (e.g., colors, clothing, accessories, expressions, actions) are faithfully represented in the image. Minor deviations may be acceptable, but critical attributes should be preserved.
+ - **Semantic Coherence:** Evaluate whether the overall meaning and intent of the prompt are captured in the image. The generated content should not introduce elements that conflict with or distort the original description.
+ Textual prompt - {text_prompt}
+ """
+
+ INSTRUCTION_debug = """
+ {text_prompt}
+ """
+
+ prompt_with_special_token = """
+ Please provide the overall ratings of this image: <|Reward|>
+
+ END
+ """
+
+ prompt_without_special_token = """
+ Please provide the overall ratings of this image:
+ """
+
+
+ class QWen2VLDataCollator:
+     def __init__(
+         self,
+         processor,
+         with_instruction=True,
+         max_pixels=256 * 28 * 28,  # Default max pixels
+         min_pixels=256 * 28 * 28,  # Default min pixels
+         use_special_tokens=True,
+     ):
+         self.processor = processor
+         self.with_instruction = with_instruction
+         self.max_pixels = max_pixels
+         self.min_pixels = min_pixels
+         self.use_special_tokens = use_special_tokens
+
+     def _clean_message(
+         self,
+         texts,
+         images,
+         max_pixels=256 * 28 * 28,
+         min_pixels=256 * 28 * 28,
+         with_instruction=True,
+         use_special_tokens=True,
+     ):
+         """
+         Build one chat message per (text, image) pair, keeping only the keys
+         the processor expects.
+         """
+         message_list = []
+         for text, image in zip(texts, images):
+             out_message = [
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "image",
+                             "image": image,
+                             "min_pixels": min_pixels,
+                             "max_pixels": max_pixels,
+                         },
+                         {
+                             "type": "text",
+                             "text": (
+                                 INSTRUCTION.format(text_prompt=text)
+                                 + prompt_with_special_token
+                                 if use_special_tokens
+                                 else prompt_without_special_token
+                             ),
+                         },
+                     ],
+                 }
+             ]
+
+             message_list.append(out_message)
+
+         return message_list
+
+     def _pad_sequence(self, sequences, attention_mask, max_len, padding_side="right"):
+         """
+         Pad the sequences to the maximum length.
+         """
+         assert padding_side in ["right", "left"]
+         if sequences.shape[1] >= max_len:
+             return sequences, attention_mask
+
+         pad_len = max_len - sequences.shape[1]
+         padding = (0, pad_len) if padding_side == "right" else (pad_len, 0)
+
+         sequences_padded = torch.nn.functional.pad(
+             sequences, padding, "constant", self.processor.tokenizer.pad_token_id
+         )
+         attention_mask_padded = torch.nn.functional.pad(
+             attention_mask, padding, "constant", 0
+         )
+
+         return sequences_padded, attention_mask_padded
+
+     def __call__(self, inputs, with_instruction=True):
+         """
+         Preprocess inputs to token sequences and return a batch
+         """
+         images_1, images_2, texts_1, texts_2 = [], [], [], []
+
+         for idx, batch in enumerate(inputs):
+             texts_1.append(batch["text_1"])
+             texts_2.append(batch["text_2"])
+             images_1.append(batch["image_1"])
+             images_2.append(batch["image_2"])
+
+         messages_batch_1 = self._clean_message(
+             texts_1,
+             images_1,
+             max_pixels=self.max_pixels,
+             min_pixels=self.min_pixels,
+             with_instruction=self.with_instruction,
+             use_special_tokens=self.use_special_tokens,
+         )
+         messages_batch_2 = self._clean_message(
+             texts_2,
+             images_2,
+             max_pixels=self.max_pixels,
+             min_pixels=self.min_pixels,
+             with_instruction=self.with_instruction,
+             use_special_tokens=self.use_special_tokens,
+         )
+         image_inputs_1, _ = process_vision_info(messages_batch_1)
+         image_inputs_2, _ = process_vision_info(messages_batch_2)
+         image_inputs_1 = [
+             np.array(image_inputs_1[i]) / 255.0 for i in range(len(image_inputs_1))
+         ]
+         image_inputs_2 = [
+             np.array(image_inputs_2[i]) / 255.0 for i in range(len(image_inputs_2))
+         ]
+         do_rescale = False
+
+         batch_1 = self.processor(
+             text=self.processor.apply_chat_template(
+                 messages_batch_1, tokenize=False, add_generation_prompt=True
+             ),
+             images=image_inputs_1,
+             videos=None,
+             padding=True,
+             return_tensors="pt",
+             images_kwargs={"do_rescale": do_rescale},
+         )
+         batch_2 = self.processor(
+             text=self.processor.apply_chat_template(
+                 messages_batch_2, tokenize=False, add_generation_prompt=True
+             ),
+             images=image_inputs_2,
+             videos=None,
+             padding=True,
+             return_tensors="pt",
+             images_kwargs={"do_rescale": do_rescale},
+         )
+
+         # Pad both sides of each pair to a common length so they can be stacked
+         max_len = max(batch_1["input_ids"].shape[1], batch_2["input_ids"].shape[1])
+         batch_1["input_ids"], batch_1["attention_mask"] = self._pad_sequence(
+             batch_1["input_ids"], batch_1["attention_mask"], max_len, "right"
+         )
+         batch_2["input_ids"], batch_2["attention_mask"] = self._pad_sequence(
+             batch_2["input_ids"], batch_2["attention_mask"], max_len, "right"
+         )
+
+         batch = {
+             "batch_1": batch_1,
+             "batch_2": batch_2,
+             "choice_dist": torch.stack([batch["choice_dist"] for batch in inputs]),
+             # Store original text prompts for visualization
+             "text_1": texts_1,
+             "text_2": texts_2,
+             "image_1": image_inputs_1,
+             "image_2": image_inputs_2,
+         }
+
+         return batch
hpsv3/dataset/pairwise_dataset.py ADDED
@@ -0,0 +1,77 @@
1
+ import torch
2
+ from torch.utils.data import Dataset
3
+ import random
4
+ import json
5
+ import os
6
+ from tqdm import tqdm
7
+
8
+ class PairwiseOriginalDataset(Dataset):
9
+ def __init__(
10
+ self,
11
+ json_list,
12
+ soft_label=False,
13
+ confidence_threshold=None,
14
+ ):
15
+ self.samples = []
16
+ for json_file in json_list:
17
+ with open(json_file, "r") as f:
18
+ data = json.load(f)
19
+ self.samples.extend(data)
20
+
21
+ self.soft_label = soft_label
22
+ self.confidence_threshold = confidence_threshold
23
+
24
+ if confidence_threshold is not None:
25
+ new_samples = []
26
+ for sample in tqdm(
27
+ self.samples, desc="Filtering samples according to confidence threshold"
28
+ ):
29
+ if sample.get("confidence", float("inf")) >= confidence_threshold:
30
+ new_samples.append(sample)
31
+ self.samples = new_samples
32
+
33
+ def __len__(self):
34
+ return len(self.samples)
35
+
36
+ def __getitem__(self, idx):
37
+         while True:
+             try:
+                 return self.get_single_item(idx)
+             except Exception as e:
+                 print(f"Error processing sample at index {idx}: {e}")
+                 import traceback
+                 traceback.print_exc()
+                 # Retry with a random replacement sample instead of failing
+                 # the whole epoch on one bad entry.
+                 idx = random.randint(0, len(self.samples) - 1)
49
+
50
+ def get_single_item(self, idx):
51
+ sample = self.samples[idx]
52
+ # Load image paths
53
+ image_1 = sample["path1"]
54
+ image_2 = sample["path2"]
55
+         assert os.path.exists(image_1) and os.path.exists(image_2), f'Missing image file: {image_1} or {image_2}'
56
+ text_1 = sample["prompt"]
57
+ text_2 = sample["prompt"]
58
+
59
+ # Process Label
60
+ if self.soft_label:
61
+ choice_dist = sorted(sample["choice_dist"], reverse=True)
62
+ assert (
63
+ torch.sum(torch.tensor(choice_dist)) > 0
64
+ ), "Choice distribution cannot be zero."
65
+ label = torch.tensor(choice_dist[0]) / torch.sum(torch.tensor(choice_dist))
66
+ else:
67
+ label = torch.tensor(1).float()
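+         # Worked example: with soft_label=True and choice_dist == [7, 3],
+         # label = 7 / (7 + 3) = 0.7, i.e. the winner's vote share.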
69
+ return {
70
+ "image_1": image_1,
71
+ "image_2": image_2,
72
+ "text_1": text_1,
73
+ "text_2": text_2,
74
+ "label": label,
75
+ "confidence": sample.get("confidence", 1.0),
76
+ "choice_dist": torch.tensor(sample.get("choice_dist", [1.0, 0.0])),
77
+ }
hpsv3/dataset/utils.py ADDED
@@ -0,0 +1,426 @@
1
+ from __future__ import annotations
2
+
3
+ ## This file is modified from https://github.com/kq-chen/qwen-vl-utils/blob/main/src/qwen_vl_utils/vision_process.py
+ import base64
6
+ import logging
7
+ import math
8
+ import os
9
+ import sys
10
+ import time
11
+ import warnings
12
+ from functools import lru_cache
13
+ from io import BytesIO
14
+
15
+ import requests
16
+ import torch
17
+ import torchvision
18
+ from packaging import version
19
+ from PIL import Image
20
+ from torchvision import io, transforms
21
+ from torchvision.transforms import InterpolationMode
22
+
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+ IMAGE_FACTOR = 28
27
+ MIN_PIXELS = 4 * 28 * 28
28
+ MAX_PIXELS = 16384 * 28 * 28
29
+ MAX_RATIO = 200
30
+
31
+ VIDEO_MIN_PIXELS = 128 * 28 * 28
32
+ VIDEO_MAX_PIXELS = 768 * 28 * 28
33
+ VIDEO_TOTAL_PIXELS = 24576 * 28 * 28
34
+ FRAME_FACTOR = 2
35
+ FPS = 2.0
36
+ FPS_MIN_FRAMES = 4
37
+ FPS_MAX_FRAMES = 768
38
+
39
+
40
+ def round_by_factor(number: int, factor: int) -> int:
41
+ """Returns the closest integer to 'number' that is divisible by 'factor'."""
42
+ return round(number / factor) * factor
43
+
44
+
45
+ def ceil_by_factor(number: int, factor: int) -> int:
46
+ """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
47
+ return math.ceil(number / factor) * factor
48
+
49
+
50
+ def floor_by_factor(number: int, factor: int) -> int:
51
+ """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
52
+ return math.floor(number / factor) * factor
53
+
54
+
55
+ def smart_resize(
56
+ height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
57
+ ) -> tuple[int, int]:
58
+ """
59
+ Rescales the image so that the following conditions are met:
60
+
61
+ 1. Both dimensions (height and width) are divisible by 'factor'.
62
+
63
+ 2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
64
+
65
+ 3. The aspect ratio of the image is maintained as closely as possible.
66
+ """
67
+ if max(height, width) / min(height, width) > MAX_RATIO:
68
+ raise ValueError(
69
+ f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
70
+ )
71
+ h_bar = max(factor, round_by_factor(height, factor))
72
+ w_bar = max(factor, round_by_factor(width, factor))
73
+ if h_bar * w_bar > max_pixels:
74
+ beta = math.sqrt((height * width) / max_pixels)
75
+ h_bar = floor_by_factor(height / beta, factor)
76
+ w_bar = floor_by_factor(width / beta, factor)
77
+ elif h_bar * w_bar < min_pixels:
78
+ beta = math.sqrt(min_pixels / (height * width))
79
+ h_bar = ceil_by_factor(height * beta, factor)
80
+ w_bar = ceil_by_factor(width * beta, factor)
81
+ return h_bar, w_bar
82
+
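+ # Worked example: smart_resize(1000, 2000) rounds each side to the nearest
+ # multiple of IMAGE_FACTOR (28), giving (1008, 1988); that pixel count already
+ # lies inside [MIN_PIXELS, MAX_PIXELS], so no further scaling is applied.
+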
83
+
84
+ def fetch_image(ele: dict[str, str | Image.Image], size_factor: int = IMAGE_FACTOR) -> Image.Image:
85
+ if "image" in ele:
86
+ image = ele["image"]
87
+ else:
88
+ image = ele["image_url"]
89
+ image_obj = None
90
+ if isinstance(image, Image.Image):
91
+ image_obj = image
92
+ elif isinstance(image, torch.Tensor):
93
+ image_obj = image
94
+ elif image.startswith("http://") or image.startswith("https://"):
95
+ image_obj = Image.open(requests.get(image, stream=True).raw)
96
+ elif image.startswith("file://"):
97
+ image_obj = Image.open(image[7:])
98
+ elif image.startswith("data:image"):
99
+ if "base64," in image:
100
+ _, base64_data = image.split("base64,", 1)
101
+ data = base64.b64decode(base64_data)
102
+ image_obj = Image.open(BytesIO(data))
103
+ else:
104
+ image_obj = Image.open(image)
105
+ if image_obj is None:
106
+ raise ValueError(f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}")
107
+ if isinstance(image_obj, Image.Image):
108
+ image = image_obj.convert("RGB")
109
+ ## resize
110
+ if "resized_height" in ele and "resized_width" in ele:
111
+ resized_height, resized_width = smart_resize(
112
+ ele["resized_height"],
113
+ ele["resized_width"],
114
+ factor=size_factor,
115
+ )
116
+ else:
117
+ if isinstance(image, torch.Tensor):
118
+ shape = image.shape
119
+ if len(shape) == 4:
120
+ if shape[1] in [1, 3]: # Likely [B, C, H, W]
121
+ height, width = shape[2], shape[3]
122
+ image_mode = 'NCHW'
123
+ elif shape[3] in [1, 3]: # Likely [B, H, W, C]
124
+ height, width = shape[1], shape[2]
125
+                 image_mode = 'NHWC'
+             else:
+                 raise ValueError(f"Cannot determine tensor image format from shape {shape}")
+
127
+ elif len(shape) == 3:
128
+ if shape[0] in [1, 3]: # Likely [C, H, W]
129
+ height, width = shape[1], shape[2]
130
+ image_mode = 'CHW'
131
+ elif shape[2] in [1, 3]: # Likely [H, W, C]
132
+ height, width = shape[0], shape[1]
133
+ image_mode = 'HWC'
134
+ else:
135
+ raise ValueError(f"Cannot determine tensor image format from shape {shape}")
136
+ else:
137
+ raise ValueError(f"Unsupported tensor image shape: {shape}")
138
+ else:
139
+ width, height = image.size
140
+ min_pixels = ele.get("min_pixels", MIN_PIXELS)
141
+ max_pixels = ele.get("max_pixels", MAX_PIXELS)
142
+ resized_height, resized_width = smart_resize(
143
+ height,
144
+ width,
145
+ factor=size_factor,
146
+ min_pixels=min_pixels,
147
+ max_pixels=max_pixels,
148
+ )
149
+
150
+ if isinstance(image, torch.Tensor):
151
+ if image_mode == 'NCHW':
152
+ image = transforms.functional.resize(
153
+ image, [resized_height, resized_width], interpolation=InterpolationMode.BICUBIC, antialias=True
154
+ )
155
+ elif image_mode == 'NHWC':
156
+ image = transforms.functional.resize(
157
+ image.permute(0, 3, 1, 2), [resized_height, resized_width], interpolation=InterpolationMode.BICUBIC, antialias=True
158
+ )
159
+ elif image_mode == 'CHW':
160
+ image = image.unsqueeze(0) # Add batch dimension
161
+ image = transforms.functional.resize(
162
+ image, [resized_height, resized_width], interpolation=InterpolationMode.BICUBIC, antialias=True
163
+ )
164
+ elif image_mode == 'HWC':
165
+ image = image.permute(2, 0, 1).unsqueeze(0) # Add batch dimension and change to CHW
166
+ image = transforms.functional.resize(
167
+ image, [resized_height, resized_width], interpolation=InterpolationMode.BICUBIC, antialias=True
168
+ )
169
+
170
+ else:
171
+ # If the image is a PIL Image, we resize it using PIL.
172
+ if image.mode != "RGB":
173
+ image = image.convert("RGB")
174
+ image = image.resize((resized_width, resized_height), Image.BICUBIC)
175
+
176
+ return image
177
+
178
+
179
+ def smart_nframes(
180
+ ele: dict,
181
+ total_frames: int,
182
+ video_fps: int | float,
183
+ ) -> int:
184
+ """calculate the number of frames for video used for model inputs.
185
+
186
+ Args:
187
+ ele (dict): a dict contains the configuration of video.
188
+ support either `fps` or `nframes`:
189
+ - nframes: the number of frames to extract for model inputs.
190
+ - fps: the fps to extract frames for model inputs.
191
+ - min_frames: the minimum number of frames of the video, only used when fps is provided.
192
+ - max_frames: the maximum number of frames of the video, only used when fps is provided.
193
+ total_frames (int): the original total number of frames of the video.
194
+ video_fps (int | float): the original fps of the video.
195
+
196
+ Raises:
197
+         ValueError: nframes should be in the interval [FRAME_FACTOR, total_frames].
198
+
199
+ Returns:
200
+ int: the number of frames for video used for model inputs.
201
+ """
202
+ assert not ("fps" in ele and "nframes" in ele), "Only accept either `fps` or `nframes`"
203
+ if "nframes" in ele:
204
+ nframes = round_by_factor(ele["nframes"], FRAME_FACTOR)
205
+ else:
206
+ fps = ele.get("fps", FPS)
207
+ min_frames = ceil_by_factor(ele.get("min_frames", FPS_MIN_FRAMES), FRAME_FACTOR)
208
+ max_frames = floor_by_factor(ele.get("max_frames", min(FPS_MAX_FRAMES, total_frames)), FRAME_FACTOR)
209
+ nframes = total_frames / video_fps * fps
210
+ nframes = min(max(nframes, min_frames), max_frames)
211
+ nframes = round_by_factor(nframes, FRAME_FACTOR)
212
+ if nframes > total_frames:
213
+ nframes = total_frames
214
+ if not (FRAME_FACTOR <= nframes and nframes <= total_frames):
215
+ raise ValueError(f"nframes should in interval [{FRAME_FACTOR}, {total_frames}], but got {nframes}.")
216
+ return nframes
217
+
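+ # Worked example: a 10 s clip at 30 fps (total_frames=300) with the default
+ # fps=2.0 gives nframes = 300 / 30 * 2 = 20, which is inside
+ # [FPS_MIN_FRAMES, min(FPS_MAX_FRAMES, 300)] and divisible by FRAME_FACTOR,
+ # so 20 frames are sampled.
+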
218
+
219
+ def _read_video_torchvision(
220
+ ele: dict,
221
+ ) -> torch.Tensor:
222
+ """read video using torchvision.io.read_video
223
+
224
+ Args:
225
+ ele (dict): a dict contains the configuration of video.
226
+ support keys:
227
+ - video: the path of video. support "file://", "http://", "https://" and local path.
228
+ - video_start: the start time of video.
229
+ - video_end: the end time of video.
230
+ Returns:
231
+ torch.Tensor: the video tensor with shape (T, C, H, W).
232
+ """
233
+ video_path = ele["video"]
234
+ if version.parse(torchvision.__version__) < version.parse("0.19.0"):
235
+ if "http://" in video_path or "https://" in video_path:
236
+ warnings.warn("torchvision < 0.19.0 does not support http/https video path, please upgrade to 0.19.0.")
237
+ if "file://" in video_path:
238
+ video_path = video_path[7:]
239
+ st = time.time()
240
+ video, audio, info = io.read_video(
241
+ video_path,
242
+ start_pts=ele.get("video_start", 0.0),
243
+ end_pts=ele.get("video_end", None),
244
+ pts_unit="sec",
245
+ output_format="TCHW",
246
+ )
247
+
248
+ total_frames, video_fps = video.size(0), info["video_fps"]
249
+ # logger.info(f"torchvision: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s")
250
+     if ele.get('sample_type', 'uniform') == 'uniform':
251
+ nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps)
252
+ idx = torch.linspace(0, total_frames - 1, nframes).round().long().tolist()
253
+ elif ele['sample_type'] == 'multi_pts':
254
+ frames_each_pts = 6
255
+ num_pts = 4
256
+ fps = 8
257
+ nframes = int(total_frames * fps // video_fps)
258
+ frames_idx = torch.linspace(0, total_frames - 1, nframes).round().long().tolist()
259
+
260
+ start_pt = int(frames_each_pts // 2)
261
+ end_pt = int(nframes - frames_each_pts // 2 - 1)
262
+ pts = torch.linspace(start_pt, end_pt, num_pts).round().long().tolist()
263
+ idx = []
264
+ for pt in pts:
265
+ idx.extend(frames_idx[pt - frames_each_pts // 2 : pt + frames_each_pts // 2])
266
+
267
+ video = video[idx]
268
+ return video
269
+
270
+
271
+ def is_decord_available() -> bool:
272
+ import importlib.util
273
+
274
+ return importlib.util.find_spec("decord") is not None
275
+
276
+
277
+ def _read_video_decord(
278
+ ele: dict,
279
+ ) -> torch.Tensor:
280
+ """read video using decord.VideoReader
281
+
282
+ Args:
283
+ ele (dict): a dict contains the configuration of video.
284
+ support keys:
285
+ - video: the path of video. support "file://", "http://", "https://" and local path.
286
+ - video_start: the start time of video.
287
+ - video_end: the end time of video.
288
+ Returns:
289
+ torch.Tensor: the video tensor with shape (T, C, H, W).
290
+ """
291
+ import decord
292
+ video_path = ele["video"]
293
+ st = time.time()
294
+ vr = decord.VideoReader(video_path)
295
+ # TODO: support start_pts and end_pts
296
+ if 'video_start' in ele or 'video_end' in ele:
297
+ raise NotImplementedError("not support start_pts and end_pts in decord for now.")
298
+ total_frames, video_fps = len(vr), vr.get_avg_fps()
299
+ # logger.info(f"decord: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s")
300
+     if ele.get('sample_type', 'uniform') == 'uniform':
301
+ nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps)
304
+ idx = torch.linspace(0, total_frames - 1, nframes).round().long().tolist()
305
+ elif ele['sample_type'] == 'multi_pts':
306
+ frames_each_pts = 6
307
+ num_pts = 4
308
+ fps = 8
309
+ nframes = int(total_frames * fps // video_fps)
310
+ frames_idx = torch.linspace(0, total_frames - 1, nframes).round().long().tolist()
311
+
312
+ start_pt = int(frames_each_pts // 2)
313
+ end_pt = int(nframes - frames_each_pts // 2 - 1)
314
+ pts = torch.linspace(start_pt, end_pt, num_pts).round().long().tolist()
315
+ idx = []
316
+ for pt in pts:
317
+ idx.extend(frames_idx[pt - frames_each_pts // 2 : pt + frames_each_pts // 2])
318
+ video = vr.get_batch(idx).asnumpy()
319
+ video = torch.tensor(video).permute(0, 3, 1, 2) # Convert to TCHW format
320
+ return video
321
+
322
+
323
+ VIDEO_READER_BACKENDS = {
324
+ "decord": _read_video_decord,
325
+ "torchvision": _read_video_torchvision,
326
+ }
327
+
328
+ FORCE_QWENVL_VIDEO_READER = os.getenv("FORCE_QWENVL_VIDEO_READER", None)
329
+
330
+
331
+ @lru_cache(maxsize=1)
332
+ def get_video_reader_backend() -> str:
333
+ if FORCE_QWENVL_VIDEO_READER is not None:
334
+ video_reader_backend = FORCE_QWENVL_VIDEO_READER
335
+ elif is_decord_available():
336
+ video_reader_backend = "decord"
337
+ else:
338
+ video_reader_backend = "torchvision"
339
+ print(f"qwen-vl-utils using {video_reader_backend} to read video.", file=sys.stderr)
340
+ return video_reader_backend
341
+
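+ # The backend can be pinned from the environment, e.g.
+ #   FORCE_QWENVL_VIDEO_READER=torchvision python <your_script>.py
+ # otherwise decord is preferred whenever it is importable.
+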
342
+
343
+ def fetch_video(ele: dict, image_factor: int = IMAGE_FACTOR) -> torch.Tensor | list[Image.Image]:
344
+ if isinstance(ele["video"], str):
345
+ video_reader_backend = get_video_reader_backend()
346
+ video = VIDEO_READER_BACKENDS[video_reader_backend](ele)
348
+ nframes, _, height, width = video.shape
349
+
350
+ min_pixels = ele.get("min_pixels", VIDEO_MIN_PIXELS)
351
+ total_pixels = ele.get("total_pixels", VIDEO_TOTAL_PIXELS)
352
+ max_pixels = max(min(VIDEO_MAX_PIXELS, total_pixels / nframes * FRAME_FACTOR), int(min_pixels * 1.05))
353
+ max_pixels = ele.get("max_pixels", max_pixels)
354
+ if "resized_height" in ele and "resized_width" in ele:
355
+ resized_height, resized_width = smart_resize(
356
+ ele["resized_height"],
357
+ ele["resized_width"],
358
+ factor=image_factor,
359
+ )
360
+ else:
361
+ resized_height, resized_width = smart_resize(
362
+ height,
363
+ width,
364
+ factor=image_factor,
365
+ min_pixels=min_pixels,
366
+ max_pixels=max_pixels,
367
+ )
368
+ video = transforms.functional.resize(
369
+ video,
370
+ [resized_height, resized_width],
371
+ interpolation=InterpolationMode.BICUBIC,
372
+ antialias=True,
373
+ ).float()
374
+ return video
375
+ else:
376
+ assert isinstance(ele["video"], (list, tuple))
377
+ process_info = ele.copy()
378
+ process_info.pop("type", None)
379
+ process_info.pop("video", None)
380
+ images = [
381
+ fetch_image({"image": video_element, **process_info}, size_factor=image_factor)
382
+ for video_element in ele["video"]
383
+ ]
384
+ nframes = ceil_by_factor(len(images), FRAME_FACTOR)
385
+ if len(images) < nframes:
386
+ images.extend([images[-1]] * (nframes - len(images)))
387
+ return images
388
+
389
+
390
+ def extract_vision_info(conversations: list[dict] | list[list[dict]]) -> list[dict]:
391
+ vision_infos = []
392
+ if isinstance(conversations[0], dict):
393
+ conversations = [conversations]
394
+ for conversation in conversations:
395
+ for message in conversation:
396
+ if isinstance(message["content"], list):
397
+ for ele in message["content"]:
398
+ if (
399
+ "image" in ele
400
+ or "image_url" in ele
401
+ or "video" in ele
402
+ or ele["type"] in ("image", "image_url", "video")
403
+ ):
404
+ vision_infos.append(ele)
405
+ return vision_infos
406
+
407
+
408
+ def process_vision_info(
409
+ conversations: list[dict] | list[list[dict]],
410
+ ) -> tuple[list[Image.Image] | None, list[torch.Tensor | list[Image.Image]] | None]:
411
+ vision_infos = extract_vision_info(conversations)
412
+ ## Read images or videos
413
+ image_inputs = []
414
+ video_inputs = []
415
+ for vision_info in vision_infos:
416
+ if "image" in vision_info or "image_url" in vision_info:
417
+ image_inputs.append(fetch_image(vision_info))
418
+ elif "video" in vision_info:
419
+ video_inputs.append(fetch_video(vision_info))
420
+ else:
421
+ raise ValueError("image, image_url or video should in content.")
422
+ if len(image_inputs) == 0:
423
+ image_inputs = None
424
+ if len(video_inputs) == 0:
425
+ video_inputs = None
426
+ return image_inputs, video_inputs
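+
+ # Minimal usage sketch (hypothetical path), mirroring the message format built
+ # by the data collator:
+ #   msgs = [{"role": "user", "content": [
+ #       {"type": "image", "image": "file:///tmp/example.png"},
+ #       {"type": "text", "text": "Describe the image."},
+ #   ]}]
+ #   image_inputs, video_inputs = process_vision_info(msgs)  # -> ([PIL image], None)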
hpsv3/inference.py ADDED
@@ -0,0 +1,167 @@
1
+
2
+ import os
3
+ from collections.abc import Mapping
4
+ import torch
5
+ import huggingface_hub
6
+ from .dataset.utils import process_vision_info
7
+ from .dataset.data_collator_qwen import prompt_with_special_token, prompt_without_special_token, INSTRUCTION
8
+ from .utils.parser import ModelConfig, PEFTLoraConfig, TrainingConfig, DataConfig, parse_args_with_yaml
9
+ from .train import create_model_and_processor
10
+ from pathlib import Path
11
+
12
+ _MODEL_CONFIG_PATH = Path(__file__).parent / "config"
13
+
14
+ class HPSv3RewardInferencer:
15
+ def __init__(self, config_path=None, checkpoint_path=None, device='cuda', differentiable=False):
16
+ if config_path is None:
17
+ config_path = os.path.join(_MODEL_CONFIG_PATH, 'HPSv3_7B.yaml')
18
+
19
+ if checkpoint_path is None:
20
+ checkpoint_path = huggingface_hub.hf_hub_download("MizzenAI/HPSv3", 'HPSv3.safetensors', repo_type='model')
21
+
22
+ (data_config, training_args, model_config, peft_lora_config), config_path = (
23
+ parse_args_with_yaml(
24
+ (DataConfig, TrainingConfig, ModelConfig, PEFTLoraConfig), config_path, is_train=False
25
+ )
26
+ )
27
+ training_args.output_dir = os.path.join(
28
+ training_args.output_dir, config_path.split("/")[-1].split(".")[0]
29
+ )
30
+ model, processor, peft_config = create_model_and_processor(
31
+ model_config=model_config,
32
+ peft_lora_config=peft_lora_config,
33
+ training_args=training_args,
34
+ differentiable=differentiable,
35
+ )
36
+
37
+ self.device = device
38
+ self.use_special_tokens = model_config.use_special_tokens
39
+
40
+ if checkpoint_path.endswith('.safetensors'):
41
+ import safetensors.torch
42
+ state_dict = safetensors.torch.load_file(checkpoint_path, device="cpu")
43
+ else:
44
+             state_dict = torch.load(checkpoint_path, map_location="cpu")
45
+
46
+ if "model" in state_dict:
47
+ state_dict = state_dict["model"]
48
+ model.load_state_dict(state_dict, strict=True)
49
+ model.eval()
50
+
51
+ self.model = model
52
+ self.processor = processor
53
+
54
+ self.model.to(self.device)
55
+ self.data_config = data_config
56
+
57
+ def _pad_sequence(self, sequences, attention_mask, max_len, padding_side='right'):
58
+ """
59
+ Pad the sequences to the maximum length.
60
+ """
61
+ assert padding_side in ['right', 'left']
62
+ if sequences.shape[1] >= max_len:
63
+ return sequences, attention_mask
64
+
65
+ pad_len = max_len - sequences.shape[1]
66
+ padding = (0, pad_len) if padding_side == 'right' else (pad_len, 0)
67
+
68
+ sequences_padded = torch.nn.functional.pad(sequences, padding, 'constant', self.processor.tokenizer.pad_token_id)
69
+ attention_mask_padded = torch.nn.functional.pad(attention_mask, padding, 'constant', 0)
70
+
71
+ return sequences_padded, attention_mask_padded
72
+
73
+ def _prepare_input(self, data):
74
+ """
75
+ Prepare `inputs` before feeding them to the model, converting them to tensors if they are not already and
76
+ handling potential state.
77
+ """
78
+ if isinstance(data, Mapping):
79
+ return type(data)({k: self._prepare_input(v) for k, v in data.items()})
80
+ elif isinstance(data, (tuple, list)):
81
+ return type(data)(self._prepare_input(v) for v in data)
82
+ elif isinstance(data, torch.Tensor):
83
+ kwargs = {"device": self.device}
84
+ return data.to(**kwargs)
85
+ return data
86
+
87
+ def _prepare_inputs(self, inputs):
88
+ """
89
+ Prepare `inputs` before feeding them to the model, converting them to tensors if they are not already and
90
+ handling potential state.
91
+ """
92
+ inputs = self._prepare_input(inputs)
93
+ if len(inputs) == 0:
94
+             raise ValueError("Received an empty batch of inputs.")
95
+ return inputs
96
+
97
+ def prepare_batch(self, image_paths, prompts):
98
+ max_pixels = 256 * 28 * 28
99
+ min_pixels = 256 * 28 * 28
100
+ message_list = []
101
+ for text, image in zip(prompts, image_paths):
102
+ out_message = [
103
+ {
104
+ "role": "user",
105
+ "content": [
106
+ {
107
+ "type": "image",
108
+ "image": image,
109
+ "min_pixels": max_pixels,
110
+ "max_pixels": max_pixels,
111
+ },
112
+ {
113
+ "type": "text",
114
+ "text": (
115
+ INSTRUCTION.format(text_prompt=text)
116
+ + prompt_with_special_token
117
+ if self.use_special_tokens
118
+ else prompt_without_special_token
119
+ ),
120
+ },
121
+ ],
122
+ }
123
+ ]
124
+
125
+ message_list.append(out_message)
126
+
127
+ image_inputs, _ = process_vision_info(message_list)
128
+
129
+ batch = self.processor(
130
+ text=self.processor.apply_chat_template(message_list, tokenize=False, add_generation_prompt=True),
131
+ images=image_inputs,
132
+ padding=True,
133
+ return_tensors="pt",
134
+ videos_kwargs={"do_rescale": True},
135
+ )
136
+ batch = self._prepare_inputs(batch)
137
+ return batch
138
+
139
+ def reward(self, image_paths, prompts):
140
+
141
+ batch = self.prepare_batch(image_paths, prompts)
142
+ rewards = self.model(
143
+ return_dict=True,
144
+ **batch
145
+ )["logits"]
146
+
147
+ return rewards
148
+
149
+
150
+ if __name__ == "__main__":
151
+ config_path = 'config/inference/HPSv3_7B.yaml'
152
+ checkpoint_path = 'checkpoints/HPSv3_7B.pth'
153
+ device = 'cuda'
154
+ dtype = torch.bfloat16
155
+ inferencer = HPSv3RewardInferencer(config_path, checkpoint_path, device=device)
156
+
157
+ image_paths = [
158
+ "assets/example1.png",
159
+ "assets/example2.png"
160
+ ]
161
+ prompts = [
162
+ "cute chibi anime cartoon fox, smiling wagging tail with a small cartoon heart above sticker",
163
+ "cute chibi anime cartoon fox, smiling wagging tail with a small cartoon heart above sticker"
164
+ ]
165
+ rewards = inferencer.reward(image_paths, prompts)
166
+     print(rewards[0][0].item())  # outputs are (mu, sigma); we take mu as the final score
167
+ print(rewards[1][0].item())
hpsv3/model/differentiable_image_processor.py ADDED
@@ -0,0 +1,629 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """Image processor class for Qwen2-VL.
21
+
22
+ This module provides both differentiable and non-differentiable image processing methods:
23
+
24
+ 1. For DIFFERENTIABLE processing (torch.autograd compatible):
25
+ - Pass torch.Tensor to _preprocess() method
26
+ - Use preprocess_tensor() method directly
27
+ - All operations use PyTorch functions (F.interpolate, tensor operations, etc.)
28
+
29
+ 2. For NON-DIFFERENTIABLE processing (original functionality):
30
+ - Pass PIL images or numpy arrays to preprocess() method
31
+ - Uses PIL/transformers image processing functions and numpy operations
32
+
33
+ The differentiable path supports:
34
+ - Bilinear interpolation for resizing (instead of PIL resampling)
35
+ - Tensor-based rescaling and normalization
36
+ - Differentiable patch extraction and reshaping
37
+ """
38
+
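+ # Minimal differentiable-path sketch (input assumed already scaled to [0, 1],
+ # hence do_rescale=False):
+ #
+ #   proc = Qwen2VLImageProcessor()
+ #   img = torch.rand(1, 3, 448, 448, requires_grad=True)
+ #   out = proc.preprocess_tensor(img, do_rescale=False)
+ #   out["pixel_values"].sum().backward()  # gradients flow back to img
+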
39
+ import math
40
+ from typing import Dict, List, Optional, Union
41
+
42
+ import numpy as np
43
+ import torch
44
+ import torch.nn.functional as F
45
+
46
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
47
+ from transformers.image_transforms import (
48
+ convert_to_rgb,
49
+ resize,
50
+ to_channel_dimension_format,
51
+ )
52
+ from transformers.image_utils import (
53
+ OPENAI_CLIP_MEAN,
54
+ OPENAI_CLIP_STD,
55
+ ChannelDimension,
56
+ ImageInput,
57
+ PILImageResampling,
58
+ VideoInput,
59
+ get_image_size,
60
+ infer_channel_dimension_format,
61
+ is_scaled_image,
62
+ is_valid_image,
63
+ make_list_of_images,
64
+ to_numpy_array,
65
+ valid_images,
66
+ validate_preprocess_arguments,
67
+ )
68
+ from transformers.utils import TensorType, is_vision_available, logging
69
+
70
+
71
+ logger = logging.get_logger(__name__)
72
+
73
+
74
+ if is_vision_available():
75
+ from PIL import Image
76
+
77
+
78
+ def make_batched_images(images) -> List[List[ImageInput]]:
79
+ """
80
+ Accepts images in list or nested list format, and makes a list of images for preprocessing.
81
+
82
+ Args:
83
+ images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
84
+ The input image.
85
+
86
+ Returns:
87
+ list: A list of images.
88
+ """
89
+ if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
90
+ return [img for img_list in images for img in img_list]
91
+
92
+ elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
93
+ return images
94
+
95
+ elif is_valid_image(images):
96
+ return [images]
97
+
98
+ raise ValueError(f"Could not make batched images from {images}")
99
+
100
+
101
+ # Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
102
+ def make_batched_videos(videos) -> List[VideoInput]:
103
+ if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
104
+ return videos
105
+
106
+ elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
107
+ if isinstance(videos[0], Image.Image):
108
+ return [videos]
109
+ elif len(videos[0].shape) == 4:
110
+ return [list(video) for video in videos]
111
+
112
+ elif is_valid_image(videos) and len(videos.shape) == 4:
113
+ return [list(videos)]
114
+
115
+ raise ValueError(f"Could not make batched video from {videos}")
116
+
117
+
118
+ def smart_resize(
119
+ height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280
120
+ ):
121
+ """Rescales the image so that the following conditions are met:
122
+
123
+ 1. Both dimensions (height and width) are divisible by 'factor'.
124
+
125
+ 2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
126
+
127
+ 3. The aspect ratio of the image is maintained as closely as possible.
128
+
129
+ """
130
+ if height < factor or width < factor:
131
+ raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
132
+ elif max(height, width) / min(height, width) > 200:
133
+ raise ValueError(
134
+ f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
135
+ )
136
+ h_bar = round(height / factor) * factor
137
+ w_bar = round(width / factor) * factor
138
+ if h_bar * w_bar > max_pixels:
139
+ beta = math.sqrt((height * width) / max_pixels)
140
+ h_bar = math.floor(height / beta / factor) * factor
141
+ w_bar = math.floor(width / beta / factor) * factor
142
+ elif h_bar * w_bar < min_pixels:
143
+ beta = math.sqrt(min_pixels / (height * width))
144
+ h_bar = math.ceil(height * beta / factor) * factor
145
+ w_bar = math.ceil(width * beta / factor) * factor
146
+ return h_bar, w_bar
147
+
148
+
149
+ class Qwen2VLImageProcessor(BaseImageProcessor):
150
+ r"""
151
+ Constructs a Qwen2-VL image processor that dynamically resizes images based on the original images.
152
+
153
+ Args:
154
+ do_resize (`bool`, *optional*, defaults to `True`):
155
+ Whether to resize the image's (height, width) dimensions.
156
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
157
+ Resampling filter to use when resizing the image.
158
+ do_rescale (`bool`, *optional*, defaults to `True`):
159
+ Whether to rescale the image by the specified scale `rescale_factor`.
160
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
161
+ Scale factor to use if rescaling the image.
162
+ do_normalize (`bool`, *optional*, defaults to `True`):
163
+ Whether to normalize the image.
164
+ image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
165
+ Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
166
+ image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
167
+ Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
168
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
169
+ Whether to convert the image to RGB.
170
+ min_pixels (`int`, *optional*, defaults to `56 * 56`):
171
+ The min pixels of the image to resize the image.
172
+ max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
173
+ The max pixels of the image to resize the image.
174
+ patch_size (`int`, *optional*, defaults to 14):
175
+             The spatial patch size of the vision encoder.
176
+ temporal_patch_size (`int`, *optional*, defaults to 2):
177
+ The temporal patch size of the vision encoder.
178
+ merge_size (`int`, *optional*, defaults to 2):
179
+             The merge size used when mapping vision-encoder patches to LLM tokens.
180
+ """
181
+
182
+ model_input_names = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw"]
183
+
184
+ def __init__(
185
+ self,
186
+ do_resize: bool = True,
187
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
188
+ do_rescale: bool = True,
189
+ rescale_factor: Union[int, float] = 1 / 255,
190
+ do_normalize: bool = True,
191
+ image_mean: Optional[Union[float, List[float]]] = None,
192
+ image_std: Optional[Union[float, List[float]]] = None,
193
+ do_convert_rgb: bool = True,
194
+ min_pixels: int = 56 * 56,
195
+ max_pixels: int = 28 * 28 * 1280,
196
+ patch_size: int = 14,
197
+ temporal_patch_size: int = 2,
198
+ merge_size: int = 2,
199
+ **kwargs,
200
+ ) -> None:
201
+ super().__init__(**kwargs)
202
+ self.do_resize = do_resize
203
+ self.resample = resample
204
+ self.do_rescale = do_rescale
205
+ self.rescale_factor = rescale_factor
206
+ self.do_normalize = do_normalize
207
+ self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
208
+ self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
209
+ self.min_pixels = min_pixels
210
+ self.max_pixels = max_pixels
211
+ self.patch_size = patch_size
212
+ self.temporal_patch_size = temporal_patch_size
213
+ self.merge_size = merge_size
214
+ self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
215
+ self.do_convert_rgb = do_convert_rgb
216
+
217
+ def _preprocess_differentiable(
218
+ self,
219
+ images: torch.Tensor,
220
+ do_resize: bool = None,
221
+ do_rescale: bool = None,
222
+ rescale_factor: float = None,
223
+ do_normalize: bool = None,
224
+ image_mean: Optional[Union[float, List[float]]] = None,
225
+ image_std: Optional[Union[float, List[float]]] = None,
226
+ ):
227
+ """
228
+ Differentiable version of image preprocessing using torch operations.
229
+
230
+ Args:
231
+ images: torch.Tensor of shape (B, C, H, W) or (C, H, W)
232
+ Returns:
233
+ flatten_patches: torch.Tensor - flattened patches
234
+ grid_thw: tuple - (grid_t, grid_h, grid_w)
235
+ """
236
+ if images.dim() == 3:
237
+ images = images.unsqueeze(0) # Add batch dimension
238
+
239
+ batch_size, channels, height, width = images.shape
240
+
241
+ processed_images = []
242
+ resized_height, resized_width = height, width
243
+
244
+ for i in range(batch_size):
245
+ image = images[i] # (C, H, W)
246
+
247
+ if do_resize:
248
+ resized_height, resized_width = smart_resize(
249
+ height,
250
+ width,
251
+ factor=self.patch_size * self.merge_size,
252
+ min_pixels=self.min_pixels,
253
+ max_pixels=self.max_pixels,
254
+ )
255
+ # Use differentiable interpolation
256
+ image = F.interpolate(
257
+ image.unsqueeze(0),
258
+ size=(resized_height, resized_width),
259
+ mode='bilinear',
260
+ align_corners=False
261
+ ).squeeze(0)
262
+
263
+ if do_rescale:
264
+ image = image * rescale_factor
265
+
266
+ if do_normalize:
267
+ if isinstance(image_mean, (list, tuple)):
268
+ mean = torch.tensor(image_mean, device=image.device, dtype=image.dtype).view(-1, 1, 1)
269
+ std = torch.tensor(image_std, device=image.device, dtype=image.dtype).view(-1, 1, 1)
270
+ else:
271
+ mean = image_mean
272
+ std = image_std
273
+ image = (image - mean) / std
274
+
275
+ processed_images.append(image)
276
+
277
+ # Stack all processed images
278
+ patches = torch.stack(processed_images) # (B, C, H, W)
279
+
280
+ # Handle temporal dimension
281
+ if patches.shape[0] == 1:
282
+ patches = patches.repeat(self.temporal_patch_size, 1, 1, 1)
283
+
284
+ # Reshape for patch extraction
285
+ batch_size, channel, resized_height, resized_width = patches.shape
286
+ grid_t = batch_size // self.temporal_patch_size
287
+ grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
288
+
289
+ # Differentiable patch extraction and reshaping
290
+ patches = patches.view(
291
+ grid_t,
292
+ self.temporal_patch_size,
293
+ channel,
294
+ grid_h // self.merge_size,
295
+ self.merge_size,
296
+ self.patch_size,
297
+ grid_w // self.merge_size,
298
+ self.merge_size,
299
+ self.patch_size,
300
+ )
301
+ patches = patches.permute(0, 3, 6, 4, 7, 2, 1, 5, 8)
302
+ flatten_patches = patches.reshape(
303
+ grid_t * grid_h * grid_w, channel * self.temporal_patch_size * self.patch_size * self.patch_size
304
+ )
305
+
306
+ return flatten_patches, (grid_t, grid_h, grid_w)
307
+
308
+ def _preprocess(
309
+ self,
310
+ images: Union[ImageInput, VideoInput],
311
+ do_resize: bool = None,
312
+ resample: PILImageResampling = None,
313
+ do_rescale: bool = None,
314
+ rescale_factor: float = None,
315
+ do_normalize: bool = None,
316
+ image_mean: Optional[Union[float, List[float]]] = None,
317
+ image_std: Optional[Union[float, List[float]]] = None,
318
+ do_convert_rgb: bool = None,
319
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
320
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
321
+ ):
322
+ """
323
+ Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
324
+
325
+ Args:
326
+ images (`ImageInput`):
327
+ Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
328
+ vision_info (`List[Dict]`, *optional*):
329
+ Optional list of dictionaries containing additional information about vision inputs.
330
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
331
+ Whether to resize the image.
332
+ resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
333
+ Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
334
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
335
+ Whether to rescale the image.
336
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
337
+ Scale factor to use if rescaling the image.
338
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
339
+ Whether to normalize the image.
340
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
341
+ Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
342
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
343
+ Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
344
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
345
+ Whether to convert the image to RGB.
346
+ data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
347
+ The channel dimension format for the output image. Can be one of:
348
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
349
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
350
+ - Unset: Use the channel dimension format of the input image.
351
+ input_data_format (`ChannelDimension` or `str`, *optional*):
352
+ The channel dimension format for the input image. Can be one of:
353
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
354
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
355
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
356
+ """
357
+ # Check if input is already a torch tensor (differentiable path)
358
+ if isinstance(images, torch.Tensor):
359
+ return self._preprocess_differentiable(
360
+ images,
361
+ do_resize=do_resize,
362
+ do_rescale=do_rescale,
363
+ rescale_factor=rescale_factor,
364
+ do_normalize=do_normalize,
365
+ image_mean=image_mean,
366
+ image_std=image_std,
367
+ )
368
+
369
+ # Original non-differentiable path for backward compatibility
370
+ images = make_list_of_images(images)
371
+
372
+ if do_convert_rgb:
373
+ images = [convert_to_rgb(image) for image in images]
374
+
375
+ # All transformations expect numpy arrays.
376
+ images = [to_numpy_array(image) for image in images]
377
+
378
+ if is_scaled_image(images[0]) and do_rescale:
379
+ logger.warning_once(
380
+ "It looks like you are trying to rescale already rescaled images. If the input"
381
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
382
+ )
383
+ if input_data_format is None:
384
+ # We assume that all images have the same channel dimension format.
385
+ input_data_format = infer_channel_dimension_format(images[0])
386
+
387
+ height, width = get_image_size(images[0], channel_dim=input_data_format)
388
+ resized_height, resized_width = height, width
389
+ processed_images = []
390
+ for image in images:
391
+ if do_resize:
392
+ resized_height, resized_width = smart_resize(
393
+ height,
394
+ width,
395
+ factor=self.patch_size * self.merge_size,
396
+ min_pixels=self.min_pixels,
397
+ max_pixels=self.max_pixels,
398
+ )
399
+ image = resize(
400
+ image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format
401
+ )
402
+
403
+ if do_rescale:
404
+ image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)
405
+
406
+ if do_normalize:
407
+ image = self.normalize(
408
+ image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
409
+ )
410
+
411
+ image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
412
+ processed_images.append(image)
413
+
414
+ # NOTE: The following operations use numpy and are NOT differentiable
415
+ # For differentiable operations, pass torch.Tensor as input to use _preprocess_differentiable
416
+ patches = np.array(processed_images)
417
+ if data_format == ChannelDimension.LAST:
418
+ patches = patches.transpose(0, 3, 1, 2)
419
+ if patches.shape[0] == 1:
420
+ patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
421
+ channel = patches.shape[1]
422
+ grid_t = patches.shape[0] // self.temporal_patch_size
423
+ grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
424
+ patches = patches.reshape(
425
+ grid_t,
426
+ self.temporal_patch_size,
427
+ channel,
428
+ grid_h // self.merge_size,
429
+ self.merge_size,
430
+ self.patch_size,
431
+ grid_w // self.merge_size,
432
+ self.merge_size,
433
+ self.patch_size,
434
+ )
435
+ patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
436
+ flatten_patches = patches.reshape(
437
+ grid_t * grid_h * grid_w, channel * self.temporal_patch_size * self.patch_size * self.patch_size
438
+ )
439
+
440
+ return flatten_patches, (grid_t, grid_h, grid_w)
441
+
442
+ def preprocess_tensor(
443
+ self,
444
+ images: torch.Tensor,
445
+ do_resize: bool = None,
446
+ do_rescale: bool = None,
447
+ rescale_factor: float = None,
448
+ do_normalize: bool = None,
449
+ image_mean: Optional[Union[float, List[float]]] = None,
450
+ image_std: Optional[Union[float, List[float]]] = None,
451
+ ):
452
+ """
453
+ Differentiable preprocessing method for torch tensors.
454
+
455
+ Args:
456
+ images: torch.Tensor of shape (B, C, H, W) or (C, H, W)
457
+
458
+ Returns:
459
+ dict containing:
460
+ - pixel_values: torch.Tensor - processed patches
461
+ - image_grid_thw: torch.Tensor - grid dimensions
462
+ """
463
+ do_resize = do_resize if do_resize is not None else self.do_resize
464
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
465
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
466
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
467
+ image_mean = image_mean if image_mean is not None else self.image_mean
468
+ image_std = image_std if image_std is not None else self.image_std
469
+
470
+ patches, image_grid_thw = self._preprocess_differentiable(
471
+ images,
472
+ do_resize=do_resize,
473
+ do_rescale=do_rescale,
474
+ rescale_factor=rescale_factor,
475
+ do_normalize=do_normalize,
476
+ image_mean=image_mean,
477
+ image_std=image_std,
478
+ )
479
+
480
+ return {
481
+ "pixel_values": patches,
482
+ "image_grid_thw": torch.tensor(image_grid_thw, device=patches.device)
483
+ }
484
+
485
+ def preprocess(
486
+ self,
487
+ images: ImageInput,
488
+ videos: VideoInput = None,
489
+ do_resize: bool = None,
490
+ size: Dict[str, int] = None,
491
+ resample: PILImageResampling = None,
492
+ do_rescale: bool = None,
493
+ rescale_factor: float = None,
494
+ do_normalize: bool = None,
495
+ image_mean: Optional[Union[float, List[float]]] = None,
496
+ image_std: Optional[Union[float, List[float]]] = None,
497
+ do_convert_rgb: bool = None,
498
+ return_tensors: Optional[Union[str, TensorType]] = None,
499
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
500
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
501
+ ):
502
+ """
503
+ Args:
504
+ images (`ImageInput`):
505
+ Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
506
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
507
+ videos (`VideoInput`):
508
+ Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
509
+ passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
510
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
511
+ Whether to resize the image.
512
+ size (`Dict[str, int]`, *optional*, defaults to `self.size`):
513
+ Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
514
+ the longest edge resized to keep the input aspect ratio.
515
+ resample (`int`, *optional*, defaults to `self.resample`):
516
+ Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
517
+ has an effect if `do_resize` is set to `True`.
518
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
519
+ Whether to rescale the image.
520
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
521
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
522
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
523
+ Whether to normalize the image.
524
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
525
+ Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
526
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
527
+ Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
528
+ `True`.
529
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
530
+ Whether to convert the image to RGB.
531
+ return_tensors (`str` or `TensorType`, *optional*):
532
+ The type of tensors to return. Can be one of:
533
+ - Unset: Return a list of `np.ndarray`.
534
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
535
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
536
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
537
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
538
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
539
+ The channel dimension format for the output image. Can be one of:
540
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
541
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
542
+ - Unset: Use the channel dimension format of the input image.
543
+ input_data_format (`ChannelDimension` or `str`, *optional*):
544
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
545
+ from the input image. Can be one of:
546
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
547
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
548
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
549
+
550
+ """
551
+ do_resize = do_resize if do_resize is not None else self.do_resize
552
+ size = size if size is not None else self.size
553
+ resample = resample if resample is not None else self.resample
554
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
555
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
556
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
557
+ image_mean = image_mean if image_mean is not None else self.image_mean
558
+ image_std = image_std if image_std is not None else self.image_std
559
+ do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
560
+
561
+ if images is not None:
562
+ images = make_batched_images(images)
563
+ if videos is not None:
564
+ videos = make_batched_videos(videos)
565
+
566
+ if images is not None and not valid_images(images):
567
+ raise ValueError(
568
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
569
+ "torch.Tensor, tf.Tensor or jax.ndarray."
570
+ )
571
+
572
+ validate_preprocess_arguments(
573
+ rescale_factor=rescale_factor,
574
+ do_normalize=do_normalize,
575
+ image_mean=image_mean,
576
+ image_std=image_std,
577
+ do_resize=do_resize,
578
+ size=size,
579
+ resample=resample,
580
+ )
581
+
582
+ if images is not None:
583
+ pixel_values, vision_grid_thws = [], []
584
+ for image in images:
585
+ patches, image_grid_thw = self._preprocess(
586
+ image,
587
+ do_resize=do_resize,
588
+ resample=resample,
589
+ do_rescale=do_rescale,
590
+ rescale_factor=rescale_factor,
591
+ do_normalize=do_normalize,
592
+ image_mean=image_mean,
593
+ image_std=image_std,
594
+ data_format=data_format,
595
+ do_convert_rgb=do_convert_rgb,
596
+ input_data_format=input_data_format,
597
+ )
598
+ pixel_values.extend(patches)
599
+ vision_grid_thws.append(image_grid_thw)
600
+ if not isinstance(pixel_values[0], torch.Tensor):
601
+ pixel_values = np.array(pixel_values)
602
+ else:
603
+ pixel_values = torch.stack(pixel_values)
604
+ vision_grid_thws = np.array(vision_grid_thws)
605
+ data = {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws}
606
+
607
+ if videos is not None:
608
+ pixel_values, vision_grid_thws = [], []
609
+ for images in videos:
610
+ patches, video_grid_thw = self._preprocess(
611
+ images,
612
+ do_resize=do_resize,
613
+ resample=resample,
614
+ do_rescale=do_rescale,
615
+ rescale_factor=rescale_factor,
616
+ do_normalize=do_normalize,
617
+ image_mean=image_mean,
618
+ image_std=image_std,
619
+ data_format=data_format,
620
+ do_convert_rgb=do_convert_rgb,
621
+ input_data_format=input_data_format,
622
+ )
623
+ pixel_values.extend(patches)
624
+ vision_grid_thws.append(video_grid_thw)
625
+ pixel_values = np.array(pixel_values)
626
+ vision_grid_thws = np.array(vision_grid_thws)
627
+ data = {"pixel_values_videos": pixel_values, "video_grid_thw": vision_grid_thws}
628
+
629
+ return BatchFeature(data=data, tensor_type=return_tensors)
hpsv3/model/qwen2vl_trainer.py ADDED
@@ -0,0 +1,971 @@
+ import os
+ import pdb
+ import warnings
+ import time
+ import math
+ import json
+ from PIL import Image
+ import matplotlib.pyplot as plt
+ import matplotlib.patches as patches
+ from torch.utils.tensorboard import SummaryWriter
+ import torchvision.transforms as transforms
+
+ from typing import List, Optional, Dict, Union, Any
+ import pandas as pd
+ import safetensors
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import datasets
+ from torch.utils.data import Dataset, DataLoader
+ from peft import PeftModel
+ from transformers import Qwen2VLForConditionalGeneration
+ from transformers import AutoConfig
+ from transformers.modeling_utils import PreTrainedModel
+ from transformers.trainer import TrainerCallback
+ from transformers.trainer import (
+     is_sagemaker_mp_enabled,
+     is_peft_available,
+     is_datasets_available,
+     WEIGHTS_NAME,
+     TRAINING_ARGS_NAME,
+     SAFE_WEIGHTS_NAME,
+     TRAINER_STATE_NAME,
+     PREFIX_CHECKPOINT_DIR,
+     logger,
+     speed_metrics,
+     deepspeed_init,
+     has_length,
+     EvalPrediction,
+     EvalLoopContainer,
+     PredictionOutput,
+     is_torch_xla_available,
+     denumpify_detensorize,
+     EvalLoopOutput,
+     DistributedTensorGatherer,
+     SequentialDistributedSampler,
+     nested_concat,
+ )
+ from transformers.trainer_pt_utils import IterableDatasetShard
+ from transformers.trainer_callback import TrainerControl, TrainerState
+
+ from transformers.trainer_pt_utils import nested_detach, find_batch_size
+ from transformers.training_args import TrainingArguments
+ from trl import RewardTrainer
+ from hpsv3.utils.training_utils import get_peft_state_non_lora_maybe_zero_3
+
+ class Qwen2VLRewardModelBT(Qwen2VLForConditionalGeneration):
+     def __init__(
+         self,
+         config,
+         output_dim=4,
+         reward_token="last",
+         special_token_ids=None,
+         rm_head_type="default",
+         rm_head_kwargs=None,
+     ):
+         super().__init__(config)
+         self.output_dim = output_dim
+         if rm_head_type == "default":
+             self.rm_head = nn.Linear(config.hidden_size, output_dim, bias=False)
+         elif rm_head_type == "ranknet":
+             if rm_head_kwargs is None:
+                 raise ValueError("rm_head_type='ranknet' requires rm_head_kwargs (at least 'hidden_size').")
+             for layer in range(rm_head_kwargs.get("num_layers", 3)):
+                 if layer == 0:
+                     self.rm_head = nn.Sequential(
+                         nn.Linear(config.hidden_size, rm_head_kwargs["hidden_size"]),
+                         nn.ReLU(),
+                         nn.Dropout(rm_head_kwargs.get("dropout", 0.1)),
+                     )
+                 elif layer < rm_head_kwargs.get("num_layers", 3) - 1:
+                     self.rm_head.add_module(
+                         f"layer_{layer}",
+                         nn.Sequential(
+                             nn.Linear(rm_head_kwargs["hidden_size"], rm_head_kwargs["hidden_size"]),
+                             nn.ReLU(),
+                             nn.Dropout(rm_head_kwargs.get("dropout", 0.1)),
+                         ),
+                     )
+                 else:
+                     self.rm_head.add_module(
+                         "output_layer",
+                         nn.Linear(rm_head_kwargs["hidden_size"], output_dim, bias=rm_head_kwargs.get("bias", False)),
+                     )
+         else:
+             self.rm_head = nn.Sequential(
+                 nn.Linear(config.hidden_size, 1024),
+                 nn.ReLU(),
+                 nn.Dropout(0.05),
+                 nn.Linear(1024, 16),
+                 nn.ReLU(),
+                 nn.Linear(16, output_dim),
+             )
+
+         self.rm_head.to(torch.float32)
+         self.reward_token = reward_token
+
+         self.special_token_ids = special_token_ids
+         if self.special_token_ids is not None:
+             self.reward_token = "special"
+
+     def forward(
+         self,
+         input_ids: torch.LongTensor = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[List[torch.FloatTensor]] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         pixel_values: Optional[torch.Tensor] = None,
+         pixel_values_videos: Optional[torch.FloatTensor] = None,
+         image_grid_thw: Optional[torch.LongTensor] = None,
+         video_grid_thw: Optional[torch.LongTensor] = None,
+         rope_deltas: Optional[torch.LongTensor] = None,
+     ):
+         ## modified from the original Qwen2VLForConditionalGeneration.forward
+         output_attentions = (
+             output_attentions
+             if output_attentions is not None
+             else self.config.output_attentions
+         )
+         output_hidden_states = (
+             output_hidden_states
+             if output_hidden_states is not None
+             else self.config.output_hidden_states
+         )
+         return_dict = (
+             return_dict if return_dict is not None else self.config.use_return_dict
+         )
+         if inputs_embeds is None:
+             inputs_embeds = self.model.embed_tokens(input_ids)
+             if pixel_values is not None:
+                 pixel_values = pixel_values.type(self.visual.get_dtype())
+                 image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
+                 image_mask = (
+                     (input_ids == self.config.image_token_id)
+                     .unsqueeze(-1)
+                     .expand_as(inputs_embeds)
+                 )
+                 image_embeds = image_embeds.to(
+                     inputs_embeds.device, inputs_embeds.dtype
+                 )
+                 inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
+
+             if pixel_values_videos is not None:
+                 pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype())
+                 video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
+                 video_mask = (
+                     (input_ids == self.config.video_token_id)
+                     .unsqueeze(-1)
+                     .expand_as(inputs_embeds)
+                 )
+                 video_embeds = video_embeds.to(
+                     inputs_embeds.device, inputs_embeds.dtype
+                 )
+                 inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
+
+             if attention_mask is not None:
+                 attention_mask = attention_mask.to(inputs_embeds.device)
+
+         outputs = self.model(
+             input_ids=None,
+             position_ids=position_ids,
+             attention_mask=attention_mask,
+             past_key_values=past_key_values,
+             inputs_embeds=inputs_embeds,
+             use_cache=use_cache,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         hidden_states = outputs[0]  # [B, L, D]
+         with torch.autocast(device_type='cuda', dtype=torch.float32):
+             logits = self.rm_head(hidden_states)  # [B, L, N]
+
+         if input_ids is not None:
+             batch_size = input_ids.shape[0]
+         else:
+             batch_size = inputs_embeds.shape[0]
+
+         ## get sequence lengths
+         if self.config.pad_token_id is None and batch_size != 1:
+             raise ValueError(
+                 "Cannot handle batch sizes > 1 if no padding token is defined."
+             )
+         if self.config.pad_token_id is None:
+             sequence_lengths = -1
+         else:
+             if input_ids is not None:
+                 # if no pad token is found, use modulo instead of reverse indexing for ONNX compatibility
+                 sequence_lengths = (
+                     torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
+                 )
+                 sequence_lengths = sequence_lengths % input_ids.shape[-1]
+                 sequence_lengths = sequence_lengths.to(logits.device)
+             else:
+                 sequence_lengths = -1
+
+         ## pool the per-token logits into one reward vector per sequence
+         if self.reward_token == "last":
+             pooled_logits = logits[
+                 torch.arange(batch_size, device=logits.device), sequence_lengths
+             ]
+         elif self.reward_token == "mean":
+             ## mean of all valid (non-padding) tokens' logits
+             valid_lengths = torch.clamp(sequence_lengths, min=0, max=logits.size(1) - 1)
+             pooled_logits = torch.stack(
+                 [logits[i, : valid_lengths[i]].mean(dim=0) for i in range(batch_size)]
+             )
+         elif self.reward_token == "special":
+             # build a mask that selects the positions of the special reward tokens
+             special_token_mask = torch.zeros_like(input_ids, dtype=torch.bool)
+             for special_token_id in self.special_token_ids:
+                 special_token_mask = special_token_mask | (
+                     input_ids == special_token_id
+                 )
+             pooled_logits = logits[special_token_mask, ...]
+             pooled_logits = pooled_logits.view(
+                 batch_size, 1, -1
+             )  # [B, n_special, N]; one <|Reward|> token per sequence here
+             pooled_logits = pooled_logits.view(batch_size, -1)
+         else:
+             raise ValueError("Invalid reward_token")
+
+         return {"logits": pooled_logits}
+
+
+ def _convert_A_B_to_chosen_rejected(
+     rewards_A,
+     rewards_B,
+     tied_threshold=None,
+     choice_dist=None,
+ ):
+     """
+     Inputs:
+         rewards_A: [B, 1]
+         rewards_B: [B, 1]
+     Outputs:
+         rewards_chosen: [B, 1]
+         rewards_rejected: [B, 1]
+         nontied_mask: [B, 1] (marks preference labels that are not tied)
+     """
+     chosen_label = torch.ones_like(rewards_A, dtype=torch.int64).to(
+         rewards_A.device
+     )  # [B, 1]
+     chosen_mask = chosen_label == 1
+     rejected_mask = chosen_label != 1
+
+     rewards_chosen = rewards_A
+     rewards_rejected = rewards_B
+
+     if tied_threshold is None:
+         nontied_mask = torch.ones_like(chosen_label, dtype=torch.float32).to(
+             rewards_A.device
+         )
+     else:
+         # a pair counts as non-tied when the annotator vote margin exceeds the threshold
+         nontied_mask = (
+             torch.abs(
+                 (choice_dist[:, 0] - choice_dist[:, 1]) / torch.sum(choice_dist, dim=-1)
+             )
+             > tied_threshold
+         )
+     return (
+         rewards_chosen,
+         rewards_rejected,
+         nontied_mask,
+     )
+
+
+ class PartialEmbeddingUpdateCallback(TrainerCallback):
+     """
+     Callback that trains only the embeddings of the special tokens.
+     After each step, every other row of the embedding matrix is restored
+     to its original value, so only the special tokens are actually updated.
+     """
+
+     def __init__(self, special_token_ids):
+         super().__init__()
+         self.special_token_ids = special_token_ids
+         self.orig_embeds_params = None
+
+     def on_train_begin(self, args, state, control, **kwargs):
+         model = kwargs.get("model")
+         self.orig_embeds_params = model.get_input_embeddings().weight.clone().detach()
+
+     def on_step_end(self, args, state, control, **kwargs):
+         model = kwargs.get("model")
+         tokenizer = kwargs.get("tokenizer")
+
+         index_no_updates = torch.ones((len(tokenizer),), dtype=torch.bool)
+         index_no_updates[self.special_token_ids] = False
+         with torch.no_grad():
+             model.get_input_embeddings().weight[index_no_updates] = (
+                 self.orig_embeds_params[index_no_updates]
+             )
+
+
+ class VLMRewardTrainer(RewardTrainer):
+     def __init__(self, loss_type="regular", loss_hyperparameters=None, tied_threshold=None,
+                  visualization_steps=500, max_viz_samples=4, *args, **kwargs):
+         super(VLMRewardTrainer, self).__init__(*args, **kwargs)
+         self.loss_type = loss_type
+         self.tied_threshold = tied_threshold
+         self.rewards_chosen_accumulated = []
+         self.rewards_rejected_accumulated = []
+         # avoid a mutable default argument; fall back to an empty dict
+         self.loss_hyperparameters = loss_hyperparameters if loss_hyperparameters is not None else {}
+         self.visualization_steps = visualization_steps
+         self.max_viz_samples = max_viz_samples
+
+     def get_eval_dataloader(
+         self, eval_dataset: Optional[Union[str, Dataset]] = None
+     ) -> DataLoader:
+         """
+         Returns the evaluation [`~torch.utils.data.DataLoader`].
+
+         Subclass and override this method if you want to inject some custom behavior.
+
+         Args:
+             eval_dataset (`str` or `torch.utils.data.Dataset`, *optional*):
+                 If a `str`, will use `self.eval_dataset[eval_dataset]` as the evaluation dataset. If a `Dataset`,
+                 will override `self.eval_dataset` and must implement `__len__`. If it is a [`~datasets.Dataset`],
+                 columns not accepted by the `model.forward()` method are automatically removed.
+         """
+         if eval_dataset is None and self.eval_dataset is None:
+             raise ValueError("Trainer: evaluation requires an eval_dataset.")
+
+         # If we have persistent workers, don't do a fork bomb, especially as eval datasets
+         # don't change during training
+         dataloader_key = eval_dataset if isinstance(eval_dataset, str) else "eval"
+         if (
+             hasattr(self, "_eval_dataloaders")
+             and dataloader_key in self._eval_dataloaders
+             and self.args.dataloader_persistent_workers
+         ):
+             return self.accelerator.prepare(self._eval_dataloaders[dataloader_key])
+
+         eval_dataset = (
+             self.eval_dataset[eval_dataset]
+             if isinstance(eval_dataset, str)
+             else eval_dataset if eval_dataset is not None else self.eval_dataset
+         )
+
+         data_collator = self.data_collator
+
+         if is_datasets_available() and isinstance(eval_dataset, datasets.Dataset):
+             eval_dataset = self._remove_unused_columns(
+                 eval_dataset, description="evaluation"
+             )
+         else:
+             data_collator = self._get_collator_with_removed_columns(
+                 data_collator, description="evaluation"
+             )
+
+         dataloader_params = {
+             "batch_size": self.args.eval_batch_size,
+             "collate_fn": data_collator,
+             "num_workers": self.args.dataloader_num_workers,
+             "pin_memory": self.args.dataloader_pin_memory,
+             "persistent_workers": self.args.dataloader_persistent_workers,
+         }
+
+         if not isinstance(eval_dataset, torch.utils.data.IterableDataset):
+             dataloader_params["sampler"] = self._get_eval_sampler(eval_dataset)
+             dataloader_params["drop_last"] = self.args.dataloader_drop_last
+             dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor
+
+         # accelerator.free_memory() will destroy the references, so
+         # we need to store the non-prepared version
+         eval_dataloader = DataLoader(eval_dataset, **dataloader_params)
+         if self.args.dataloader_persistent_workers:
+             if hasattr(self, "_eval_dataloaders"):
+                 self._eval_dataloaders[dataloader_key] = eval_dataloader
+             else:
+                 self._eval_dataloaders = {dataloader_key: eval_dataloader}
+
+         return self.accelerator.prepare(eval_dataloader)
+
+     def create_optimizer(self):
+         """
+         Setup the optimizer.
+         We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in
+         the Trainer's init through `optimizers`, or subclass and override this method in a subclass.
+         """
+         if is_sagemaker_mp_enabled():
+             return super().create_optimizer()
+
+         opt_model = self.model
+
+         if self.optimizer is None:
+             decay_parameters = self.get_decay_parameter_names(opt_model)
+             decay_parameters = [name for name in decay_parameters if "bias" not in name]
+             lr_mapper = {}
+             visual_parameters = []
+             merger_parameters = []
+             rm_head_parameters = []
+
+             if self.args.vision_lr is not None:
+                 lr_mapper["visual"] = self.args.vision_lr
+                 visual_parameters = [
+                     name
+                     for name, _ in opt_model.named_parameters()
+                     if "visual" in name and "merger" not in name
+                 ]
+             if self.args.merger_lr is not None:
+                 lr_mapper["merger"] = self.args.merger_lr
+                 merger_parameters = [
+                     name for name, _ in opt_model.named_parameters() if "merger" in name
+                 ]
+             if self.args.rm_head_lr is not None:
+                 lr_mapper["rm_head"] = self.args.rm_head_lr
+                 rm_head_parameters = [
+                     name for name, _ in opt_model.named_parameters() if "rm_head" in name
+                 ]
+
+             if len(lr_mapper) > 0:
+                 special_lr_parameters = merger_parameters + visual_parameters + rm_head_parameters
+
+                 # parameters on the base learning rate, split by weight decay
+                 optimizer_grouped_parameters = [
+                     {
+                         "params": [
+                             p
+                             for n, p in opt_model.named_parameters()
+                             if (
+                                 n in decay_parameters
+                                 and n not in special_lr_parameters
+                                 and p.requires_grad
+                             )
+                         ],
+                         "weight_decay": self.args.weight_decay,
+                     },
+                     {
+                         "params": [
+                             p
+                             for n, p in opt_model.named_parameters()
+                             if (
+                                 n not in decay_parameters
+                                 and n not in special_lr_parameters
+                                 and p.requires_grad
+                             )
+                         ],
+                         "weight_decay": 0.0,
+                     },
+                 ]
+
+                 if visual_parameters:
+                     optimizer_grouped_parameters.extend(
+                         [
+                             {
+                                 "params": [
+                                     p
+                                     for n, p in opt_model.named_parameters()
+                                     if (
+                                         n in decay_parameters
+                                         and n in visual_parameters
+                                         and p.requires_grad
+                                     )
+                                 ],
+                                 "weight_decay": self.args.weight_decay,
+                                 "lr": self.args.vision_lr,
+                             },
+                             {
+                                 "params": [
+                                     p
+                                     for n, p in opt_model.named_parameters()
+                                     if (
+                                         n not in decay_parameters
+                                         and n in visual_parameters
+                                         and p.requires_grad
+                                     )
+                                 ],
+                                 "weight_decay": 0.0,
+                                 "lr": self.args.vision_lr,
+                             },
+                         ]
+                     )
+
+                 if merger_parameters:
+                     optimizer_grouped_parameters.extend(
+                         [
+                             {
+                                 "params": [
+                                     p
+                                     for n, p in opt_model.named_parameters()
+                                     if (
+                                         n in decay_parameters
+                                         and n in merger_parameters
+                                         and p.requires_grad
+                                     )
+                                 ],
+                                 "weight_decay": self.args.weight_decay,
+                                 "lr": self.args.merger_lr,
+                             },
+                             {
+                                 "params": [
+                                     p
+                                     for n, p in opt_model.named_parameters()
+                                     if (
+                                         n not in decay_parameters
+                                         and n in merger_parameters
+                                         and p.requires_grad
+                                     )
+                                 ],
+                                 "weight_decay": 0.0,
+                                 "lr": self.args.merger_lr,
+                             },
+                         ]
+                     )
+
+                 if rm_head_parameters:
+                     optimizer_grouped_parameters.extend(
+                         [
+                             {
+                                 "params": [
+                                     p
+                                     for n, p in opt_model.named_parameters()
+                                     if (
+                                         n in decay_parameters
+                                         and n in rm_head_parameters
+                                         and p.requires_grad
+                                     )
+                                 ],
+                                 "weight_decay": self.args.weight_decay,
+                                 "lr": self.args.rm_head_lr,
+                             },
+                             {
+                                 "params": [
+                                     p
+                                     for n, p in opt_model.named_parameters()
+                                     if (
+                                         n not in decay_parameters
+                                         and n in rm_head_parameters
+                                         and p.requires_grad
+                                     )
+                                 ],
+                                 "weight_decay": 0.0,
+                                 "lr": self.args.rm_head_lr,
+                             },
+                         ]
+                     )
+
+             else:
+                 optimizer_grouped_parameters = [
+                     {
+                         "params": [
+                             p
+                             for n, p in opt_model.named_parameters()
+                             if (n in decay_parameters and p.requires_grad)
+                         ],
+                         "weight_decay": self.args.weight_decay,
+                     },
+                     {
+                         "params": [
+                             p
+                             for n, p in opt_model.named_parameters()
+                             if (n not in decay_parameters and p.requires_grad)
+                         ],
+                         "weight_decay": 0.0,
+                     },
+                 ]
+
+             if self.model.special_token_ids:
+                 special_token_embeddings = opt_model.get_input_embeddings().weight
+                 special_token_embeddings.requires_grad = True
+
+                 # the whole embedding matrix is trained at the special-token LR;
+                 # PartialEmbeddingUpdateCallback restores the non-special rows each step
+                 optimizer_grouped_parameters.extend(
+                     [
+                         {
+                             "params": [special_token_embeddings],
+                             "lr": self.args.special_token_lr,
+                             "weight_decay": 0.0,
+                         },
+                     ]
+                 )
+
+             optimizer_cls, optimizer_kwargs = self.get_optimizer_cls_and_kwargs(
+                 self.args, opt_model
+             )
+
+             self.optimizer = optimizer_cls(
+                 optimizer_grouped_parameters, **optimizer_kwargs
+             )
+
+         return self.optimizer
+
+     def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
+         rewards_A = model(return_dict=True, **inputs["batch_1"])["logits"]
+         rewards_B = model(return_dict=True, **inputs["batch_2"])["logits"]
+
+         # Log to TensorBoard for visualization
+         if (hasattr(self.state, 'global_step') and
+             self.state.global_step % self.visualization_steps == 0 and
+             self.state.global_step > 0):
+             # Pass the original inputs, which should contain the text prompts
+             self._log_training_visualization(inputs, rewards_A, rewards_B)
+
+         # calculate the loss, optionally modulated with a margin;
+         # get chosen and rejected rewards from the chosen label
+         (
+             rewards_chosen,
+             rewards_rejected,
+             nontied_mask,
+         ) = _convert_A_B_to_chosen_rejected(
+             rewards_A,
+             rewards_B,
+             tied_threshold=self.tied_threshold,
+             choice_dist=inputs["choice_dist"],
+         )
+
+         loss_dict = {}
+
+         if self.loss_type == "bt":
+             # Bradley-Terry model
+             loss = -nn.functional.logsigmoid(rewards_chosen - rewards_rejected)
+             out_mask = nontied_mask
+             loss = loss * out_mask
+             loss = loss.mean()
+         elif self.loss_type == "likelihood_displacement":
+             # Bradley-Terry model with the rejected reward scaled by tau
+             loss = -nn.functional.logsigmoid(rewards_chosen - self.loss_hyperparameters['tau'] * rewards_rejected)
+             out_mask = nontied_mask
+             loss = loss * out_mask
+             loss = loss.mean()
+         elif self.loss_type == "constant_margin":
+             # Bradley-Terry model with a constant margin
+             loss = -nn.functional.logsigmoid(rewards_chosen - rewards_rejected - 0.57)
+             out_mask = nontied_mask
+             loss = loss * out_mask
+             loss = loss.mean()
+         elif self.loss_type == "btt":
+             # Bradley-Terry-With-Ties model
+             k = 5.0
+             log_k = math.log(k)
+             log_k2_sub_1 = math.log(k**2 - 1)
+             bt_loss = -nn.functional.logsigmoid(
+                 rewards_chosen - rewards_rejected - log_k
+             )
+             same_loss = (
+                 -nn.functional.logsigmoid(rewards_chosen - rewards_rejected - log_k)
+                 - nn.functional.logsigmoid(rewards_rejected - rewards_chosen - log_k)
+                 - log_k2_sub_1
+             )
+             loss = bt_loss * nontied_mask.float() + same_loss * (
+                 1 - nontied_mask.float()
+             )
+             out_mask = torch.ones_like(nontied_mask, dtype=torch.float32).to(
+                 rewards_A.device
+             )  # [B, 1]
+             loss = loss * out_mask
+
+             loss = loss.mean()
+         elif self.loss_type == "hpsv2":
+             device = rewards_A.device
+             rewards = torch.nn.functional.softmax(
+                 torch.cat([rewards_A, rewards_B], dim=-1), dim=-1
+             )
+             text_0_logits, text_1_logits = rewards[:, 0], rewards[:, 1]
+             label_0, label_1 = torch.ones_like(text_0_logits), torch.zeros_like(
+                 text_0_logits
+             )
+
+             text_logits = torch.stack([text_0_logits, text_1_logits], dim=-1)
+             text_0_labels = torch.zeros(
+                 text_logits.shape[0], device=device, dtype=torch.long
+             )
+             text_1_labels = text_0_labels + 1
+
+             text_0_loss = torch.nn.functional.cross_entropy(
+                 text_logits, text_0_labels, reduction="none"
+             )
+             text_1_loss = torch.nn.functional.cross_entropy(
+                 text_logits, text_1_labels, reduction="none"
+             )
+
+             loss = label_0 * text_0_loss + label_1 * text_1_loss
+             loss = loss.sum()
+         elif self.loss_type == "uncertainty":
+             # each head output is (mu, log_sigma); sample pairwise score differences
+             batch_size = rewards_A.shape[0]
+             mean_chosen = rewards_A[:, 0]
+             mean_rejected = rewards_B[:, 0]
+             sigma_chosen = torch.exp(rewards_A[:, 1])
+             sigma_rejected = torch.exp(rewards_B[:, 1])
+
+             mean_z = mean_chosen - mean_rejected
+             sigma_z = torch.sqrt(sigma_chosen**2 + sigma_rejected**2)
+
+             z_samples = torch.randn(batch_size, 1000).to(sigma_z.device).to(
+                 torch.float16
+             ) * sigma_z.unsqueeze(1).repeat(1, 1000) + mean_z.unsqueeze(1).repeat(
+                 1, 1000
+             )
+             loss = -torch.nn.functional.logsigmoid(z_samples).mean()
+         else:
+             raise NotImplementedError(f"Loss type {self.loss_type} not implemented.")
+
+         loss_dict.update({"loss": loss.item()})
+
+         if return_outputs:
+             ## return rewards_A/B instead of chosen/rejected:
+             ## easier to calculate metrics for multiple attributes
+             return loss, {
+                 "rewards_A": rewards_A,
+                 "rewards_B": rewards_B,
+             }
+         return loss
+
+     def prediction_step(
+         self,
+         model,
+         inputs,
+         prediction_loss_only,
+         ignore_keys=None,
+     ):
+         model.eval()
+         inputs = self._prepare_inputs(inputs)
+         if ignore_keys is None:
+             if hasattr(self.model, "config"):
+                 ignore_keys = getattr(
+                     self.model.config, "keys_to_ignore_at_inference", []
+                 )
+             else:
+                 ignore_keys = []
+
+         with torch.no_grad():
+             loss, logits_dict = self.compute_loss(model, inputs, return_outputs=True)
+
+         if prediction_loss_only:
+             return (loss, None, None)
+         loss = loss.detach()
+         logits = tuple(v for k, v in logits_dict.items() if k not in ignore_keys)
+         logits = nested_detach(logits)
+         if self.loss_type != "uncertainty":
+             logits = torch.cat(logits, dim=1)  # [B, 2]
+         else:
+             # keep only the mean (mu) column of each (mu, log_sigma) pair
+             logits = torch.cat([p[:, [0]] for p in logits], dim=1)
+
+         labels = torch.ones((logits.shape[0], 1)).to(logits.device)
+
+         return loss, logits, labels
+
+     def _log_training_visualization(self, inputs, rewards_A, rewards_B):
+         """Log training samples and predictions to TensorBoard."""
+         try:
+             # Get a TensorBoard writer, creating one lazily if needed
+             writer = None
+             if hasattr(self, 'log_metrics'):
+                 if hasattr(self.args, 'report_to') and 'tensorboard' in self.args.report_to:
+                     if not hasattr(self, '_tb_writer'):
+                         self._tb_writer = SummaryWriter(log_dir=self.args.logging_dir)
+                     writer = self._tb_writer
+
+             if writer is None:
+                 return
+
+             step = self.state.global_step
+             batch_size = min(len(rewards_A), self.max_viz_samples)
+
+             # Log scalar metrics
+             for i in range(batch_size):
+                 score_A = rewards_A[i].float().detach().cpu().numpy()
+                 score_B = rewards_B[i].float().detach().cpu().numpy()
+
+                 # Convert to float for logging
+                 score_A_val = float(score_A.mean()) if score_A.ndim > 0 else float(score_A)
+                 score_B_val = float(score_B.mean()) if score_B.ndim > 0 else float(score_B)
+                 score_diff = score_A_val - score_B_val
+
+                 writer.add_scalar(f'train_viz/sample_{i}/score_A', score_A_val, step)
+                 writer.add_scalar(f'train_viz/sample_{i}/score_B', score_B_val, step)
+                 writer.add_scalar(f'train_viz/sample_{i}/score_diff', score_diff, step)
+
+                 try:
+                     # Get image data from inputs
+                     image_A = inputs['image_1'][i] if 'image_1' in inputs else None
+                     image_B = inputs['image_2'][i] if 'image_2' in inputs else None
+
+                     # Get the prompt text from the original batch
+                     prompt_A = inputs.get('text_1', ['Unknown prompt'])[i] if 'text_1' in inputs else 'Unknown prompt'
+
+                     fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 8))
+                     fig.text(0.05, 0.05, f'Prompt:\n{prompt_A[:200]}{"..." if len(prompt_A) > 200 else ""}',
+                              ha='left', va='bottom', fontsize=8, wrap=True,
+                              bbox=dict(boxstyle="round,pad=0.3", facecolor="lightblue", alpha=0.7))
+                     img_A_np = np.array(image_A)
+                     if img_A_np.ndim == 3 and img_A_np.shape[0] == 3:  # CHW format
+                         img_A_np = np.transpose(img_A_np, (1, 2, 0))
+                     img_A_np = np.clip(img_A_np, 0, 1)  # Ensure values are in [0, 1]
+                     axes[0].imshow(img_A_np)
+                     axes[0].set_title(f'Image A - Score: {score_A_val:.3f}')
+                     axes[0].axis('off')
+
+                     img_B_np = np.array(image_B)
+                     if img_B_np.ndim == 3 and img_B_np.shape[0] == 3:  # CHW format
+                         img_B_np = np.transpose(img_B_np, (1, 2, 0))
+                     img_B_np = np.clip(img_B_np, 0, 1)  # Ensure values are in [0, 1]
+                     axes[1].imshow(img_B_np)
+                     axes[1].set_title(f'Image B - Score: {score_B_val:.3f}')
+                     axes[1].axis('off')
+
+                     # Add prediction info
+                     winner = "A" if score_diff > 0 else "B"
+                     plt.suptitle(f'Step {step} - Sample {i} | Predicted Winner: Image {winner} | Diff: {score_diff:.3f}', fontsize=14)
+                     plt.tight_layout()
+
+                     # Log the figure to TensorBoard
+                     writer.add_figure(f'train_viz/sample_{i}_comparison', fig, step)
+                     plt.close(fig)
+                 except Exception as viz_error:
+                     print(f"Warning: Could not extract images for visualization: {viz_error}")
+                     continue
+
+             # Log aggregate statistics
+             all_scores_A = rewards_A.float().detach().cpu().numpy()
+             all_scores_B = rewards_B.float().detach().cpu().numpy()
+
+             writer.add_histogram('train_viz/all_scores_A', all_scores_A, step)
+             writer.add_histogram('train_viz/all_scores_B', all_scores_B, step)
+             writer.add_scalar('train_viz/mean_score_A', float(all_scores_A.mean()), step)
+             writer.add_scalar('train_viz/mean_score_B', float(all_scores_B.mean()), step)
+             writer.add_scalar('train_viz/mean_score_diff', float((all_scores_A - all_scores_B).mean()), step)
+
+         except Exception as e:
+             print(f"Error in training visualization: {e}")
+
+     def _save_checkpoint(self, model, trial, metrics=None):
+         if isinstance(self.model, PeftModel):
+             checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
+
+             if self.hp_search_backend is None and trial is None:
+                 self.store_flos()
+
+             run_dir = self._get_output_dir(trial=trial)
+             output_dir = os.path.join(run_dir, checkpoint_folder)
+             os.makedirs(output_dir, exist_ok=True)
+
+             self.save_model(output_dir, _internal_call=True)
+
+             if not self.args.save_full_model:
+                 # also save the trainable non-LoRA weights (e.g. rm_head, embeddings)
+                 non_lora_weights = get_peft_state_non_lora_maybe_zero_3(
+                     self.model.named_parameters(), require_grad_only=True
+                 )
+                 torch.save(
+                     non_lora_weights,
+                     os.path.join(output_dir, "non_lora_state_dict.pth"),
+                 )
+
+             if not self.args.save_only_model:
+                 # Save optimizer and scheduler
+                 self._save_optimizer_and_scheduler(output_dir)
+                 # Save RNG state
+                 self._save_rng_state(output_dir)
+
+         else:
+             super(RewardTrainer, self)._save_checkpoint(model, trial, metrics)
+
+     def _save(self, output_dir: Optional[str] = None, state_dict=None):
+         # If we are executing this function, we are the process zero, so we don't check for that.
+         output_dir = output_dir if output_dir is not None else self.args.output_dir
+         os.makedirs(output_dir, exist_ok=True)
+         logger.info(f"Saving model checkpoint to {output_dir}")
+
+         supported_classes = (
+             (PreTrainedModel,)
+             if not is_peft_available()
+             else (PreTrainedModel, PeftModel)
+         )
+         # fetch the state dict up front so both branches below can rely on it
+         if state_dict is None:
+             state_dict = self.model.state_dict()
+
+         # Save a trained model and configuration using `save_pretrained()`.
+         # They can then be reloaded using `from_pretrained()`.
+         if not isinstance(self.model, supported_classes):
+             if isinstance(self.accelerator.unwrap_model(self.model), supported_classes):
+                 self.accelerator.unwrap_model(self.model).save_pretrained(
+                     output_dir,
+                     state_dict=state_dict,
+                     safe_serialization=self.args.save_safetensors,
+                 )
+             else:
+                 logger.info(
+                     "Trainer.model is not a `PreTrainedModel`, only saving its state dict."
+                 )
+                 if self.args.save_safetensors:
+                     safetensors.torch.save_file(
+                         state_dict,
+                         os.path.join(output_dir, SAFE_WEIGHTS_NAME),
+                         metadata={"format": "pt"},
+                     )
+                 else:
+                     torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME))
+         else:
+             if not self.args.save_full_model:
+                 state_dict = {k: v for k, v in state_dict.items() if "wte" not in k}
+                 self.model.save_pretrained(
+                     output_dir,
+                     state_dict=state_dict,
+                     safe_serialization=self.args.save_safetensors,
+                 )
+             else:
+                 torch.save(state_dict, os.path.join(output_dir, "model.pth"))
+
+         if self.tokenizer is not None:
+             os.makedirs(os.path.join(output_dir, "tokenizer"), exist_ok=True)
+             self.tokenizer.save_pretrained(os.path.join(output_dir, "tokenizer"))
+
+         # Good practice: save your training arguments together with the trained model
+         torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
+
+
+ def compute_multi_attr_accuracy(eval_pred, metainfo_idxs=None) -> Dict[str, float]:
+     predictions, labels = eval_pred
+     metrics = {}
+
+     pred_curr = predictions
+     label_curr = labels.squeeze(1)
+     total_count = np.sum(label_curr != 0)
+
+     rewards_chosen = pred_curr[:, 0]
+     rewards_rejected = pred_curr[:, 1]
+
+     rewards_chosen_avg = np.sum(rewards_chosen) / total_count
+     rewards_rejected_avg = np.sum(rewards_rejected) / total_count
+
+     accuracy = np.sum(rewards_chosen > rewards_rejected) / total_count
+
+     metrics.update(
+         {
+             "Acc": accuracy,
+             "R_chosen_avg": rewards_chosen_avg,
+             "R_rejected_avg": rewards_rejected_avg,
+         }
+     )
+     return metrics
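
To make the pairwise objective in compute_loss concrete, here is a minimal, self-contained sketch of the "bt" (Bradley-Terry) branch on toy reward tensors; the tie mask plays the role of nontied_mask from _convert_A_B_to_chosen_rejected, and all numbers are illustrative:

import torch
import torch.nn as nn

rewards_chosen = torch.tensor([[1.2], [0.3], [0.8]])    # [B, 1] rewards for image A
rewards_rejected = torch.tensor([[0.4], [0.5], [0.8]])  # [B, 1] rewards for image B
nontied_mask = torch.tensor([[1.0], [1.0], [0.0]])      # third pair treated as a tie

# -log sigmoid(r_chosen - r_rejected), with tied pairs masked out of the loss
loss = -nn.functional.logsigmoid(rewards_chosen - rewards_rejected)
loss = (loss * nontied_mask).mean()
print(loss.item())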
hpsv3/model/test_differentiable.py ADDED
@@ -0,0 +1,212 @@
+
+ import os
+ from collections.abc import Mapping
+ import torch
+ import numpy as np
+ from PIL import Image
+ import huggingface_hub
+ from hpsv3.dataset.utils import process_vision_info
+ from hpsv3.dataset.data_collator_qwen import prompt_with_special_token, prompt_without_special_token, INSTRUCTION
+ from hpsv3.utils.parser import ModelConfig, PEFTLoraConfig, TrainingConfig, DataConfig, parse_args_with_yaml
+ from hpsv3.train import create_model_and_processor
+ from pathlib import Path
+
+ _MODEL_CONFIG_PATH = Path(__file__).parent / "config"
+
+ class HPSv3RewardInferencer():
+     def __init__(self, config_path=None, checkpoint_path=None, device='cuda', differentiable=False):
+         if config_path is None:
+             config_path = os.path.join(_MODEL_CONFIG_PATH, 'HPSv3_7B.yaml')
+
+         if checkpoint_path is None:
+             # hf_hub_download takes the repo id and the filename and returns the local file path
+             checkpoint_path = huggingface_hub.hf_hub_download("xilanhua12138/HPSv3", "model.pth")
+
+         (data_config, training_args, model_config, peft_lora_config), config_path = (
+             parse_args_with_yaml(
+                 (DataConfig, TrainingConfig, ModelConfig, PEFTLoraConfig), config_path, is_train=False
+             )
+         )
+         training_args.output_dir = os.path.join(
+             training_args.output_dir, config_path.split("/")[-1].split(".")[0]
+         )
+         model, processor, peft_config = create_model_and_processor(
+             model_config=model_config,
+             peft_lora_config=peft_lora_config,
+             training_args=training_args,
+             differentiable=differentiable,
+         )
+
+         self.device = device
+         self.use_special_tokens = model_config.use_special_tokens
+
+         state_dict = torch.load(checkpoint_path, map_location="cpu")
+         if "model" in state_dict:
+             state_dict = state_dict["model"]
+         model.load_state_dict(state_dict, strict=False)
+         model.eval()
+
+         self.model = model
+         self.processor = processor
+
+         self.model.to(self.device)
+         self.data_config = data_config
+
+     def _pad_sequence(self, sequences, attention_mask, max_len, padding_side='right'):
+         """
+         Pad the sequences to the maximum length.
+         """
+         assert padding_side in ['right', 'left']
+         if sequences.shape[1] >= max_len:
+             return sequences, attention_mask
+
+         pad_len = max_len - sequences.shape[1]
+         padding = (0, pad_len) if padding_side == 'right' else (pad_len, 0)
+
+         sequences_padded = torch.nn.functional.pad(sequences, padding, 'constant', self.processor.tokenizer.pad_token_id)
+         attention_mask_padded = torch.nn.functional.pad(attention_mask, padding, 'constant', 0)
+
+         return sequences_padded, attention_mask_padded
+
+     def _prepare_input(self, data):
+         """
+         Prepare a single input before feeding it to the model, converting it to a
+         tensor on the right device if it is not one already.
+         """
+         if isinstance(data, Mapping):
+             return type(data)({k: self._prepare_input(v) for k, v in data.items()})
+         elif isinstance(data, (tuple, list)):
+             return type(data)(self._prepare_input(v) for v in data)
+         elif isinstance(data, torch.Tensor):
+             kwargs = {"device": self.device}
+             ## TODO: may also need to set dtype here (e.g. when DeepSpeed is enabled)
+             return data.to(**kwargs)
+         return data
+
+     def _prepare_inputs(self, inputs):
+         """
+         Prepare `inputs` before feeding them to the model, converting them to tensors
+         if they are not already and handling potential state.
+         """
+         inputs = self._prepare_input(inputs)
+         if len(inputs) == 0:
+             raise ValueError("Received an empty batch of inputs.")
+         return inputs
+
+     def prepare_batch(self, image_paths, prompts):
+         max_pixels = 256 * 28 * 28
+         min_pixels = 256 * 28 * 28
+         message_list = []
+         for text, image in zip(prompts, image_paths):
+             out_message = [
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "image",
+                             "image": image,
+                             "min_pixels": min_pixels,
+                             "max_pixels": max_pixels,
+                         },
+                         {
+                             "type": "text",
+                             "text": (
+                                 INSTRUCTION.format(text_prompt=text)
+                                 + prompt_with_special_token
+                                 if self.use_special_tokens
+                                 else prompt_without_special_token
+                             ),
+                         },
+                     ],
+                 }
+             ]
+
+             message_list.append(out_message)
+
+         image_inputs, _ = process_vision_info(message_list)
+
+         batch = self.processor(
+             text=self.processor.apply_chat_template(message_list, tokenize=False, add_generation_prompt=True),
+             images=image_inputs,
+             padding=True,
+             return_tensors="pt",
+             videos_kwargs={"do_rescale": True},
+         )
+         batch = self._prepare_inputs(batch)
+         return batch
+
+     def reward(self, image_paths, prompts):
+         batch = self.prepare_batch(image_paths, prompts)
+         rewards = self.model(
+             return_dict=True,
+             **batch
+         )["logits"]
+
+         return rewards
+
+
+ if __name__ == "__main__":
+     config_path = '/preflab/shuiyunhao/tasks/HPSv3_official/hpsv3/config/HPSv3_7B.yaml'
+     checkpoint_path = '/preflab/shuiyunhao/tasks/HPSv3_official/checkpoints/HPSv3_7B/model.pth'
+     device = 'cuda'
+     dtype = torch.bfloat16
+     inferencer = HPSv3RewardInferencer(config_path, checkpoint_path, differentiable=True, device=device)
+
+     images = [
+         torch.from_numpy(np.array(Image.open("assets/example1.png"))),
+         torch.from_numpy(np.array(Image.open("assets/example2.png")))
+     ]
+     prompts = [
+         "cute chibi anime cartoon fox, smiling wagging tail with a small cartoon heart above sticker",
+         "cute chibi anime cartoon fox, smiling wagging tail with a small cartoon heart above sticker"
+     ]
+     rewards = inferencer.reward(images, prompts)
+     print(rewards[0][0].item())  # each output is (mu, sigma); we use mu as the final score
+     print(rewards[1][0].item())
+
+     loss = rewards[0][0]
+     print(f"Loss value: {loss.item()}")
+     print(f"Loss requires_grad: {loss.requires_grad}")
+
+     if loss.requires_grad:
+         loss.backward(retain_graph=True)
+
+         has_grad = False
+         for name, param in inferencer.model.named_parameters():
+             if param.grad is not None:
+                 has_grad = True
+                 print(f"Parameter {name} has gradient norm: {param.grad.norm().item():.6f}")
+                 break
+
+         if has_grad:
+             print("has grad")
+         else:
+             print("NO GRAD!!")
+     else:
+         print("Final loss does not require gradient computation.")
+
+     # Compare the non-differentiable and differentiable preprocessing paths
+     img_pil = Image.open("assets/example1.png").convert('RGB')
+     non_diff_result = inferencer.processor.preprocess(
+         images=[img_pil],
+         return_tensors="pt"
+     )
+
+     img_tensor = torch.from_numpy(np.array(img_pil)).float().permute(2, 0, 1) / 255.0
+     diff_result = inferencer.processor.preprocess_tensor(img_tensor)
+
+     if non_diff_result['pixel_values'].shape == diff_result['pixel_values'].shape:
+         diff = torch.abs(non_diff_result['pixel_values'] - diff_result['pixel_values']).mean()
+         if diff.item() < 0.01:
+             print("Outputs match")
+         else:
+             print("Different outputs")
+     else:
+         print("Shape mismatch between non-differentiable and differentiable outputs.")
+
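
For reference, a minimal sketch of reading the two-dimensional head output used by the "uncertainty" loss, assuming rewards of shape [B, 2] holding (mu, log sigma) as in qwen2vl_trainer.py; the values here are made up:

import torch

rewards = torch.tensor([[0.9, -1.2],
                        [0.1, -0.3]])  # [B, 2]; column 0 = mu, column 1 = log(sigma)
mu = rewards[:, 0]                     # the score the inferencer reports
sigma = torch.exp(rewards[:, 1])       # the per-sample uncertainty
print(mu.tolist(), sigma.tolist())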
hpsv3/train.py ADDED
@@ -0,0 +1,315 @@
+ import json
+ import os
+ import fire
+ from dataclasses import asdict
+ from functools import partial
+ import torch
+ import torch.distributed as dist
+ from hpsv3.model.qwen2vl_trainer import (
+     Qwen2VLRewardModelBT,
+     VLMRewardTrainer,
+     compute_multi_attr_accuracy,
+     PartialEmbeddingUpdateCallback,
+ )
+ from hpsv3.dataset.pairwise_dataset import PairwiseOriginalDataset
+ from hpsv3.dataset.data_collator_qwen import QWen2VLDataCollator
+ from hpsv3.utils.parser import ModelConfig, PEFTLoraConfig, TrainingConfig, DataConfig
+ from hpsv3.utils.training_utils import load_model_from_checkpoint, find_target_linear_names
+ from hpsv3.utils.parser import parse_args_with_yaml
+ from transformers import AutoProcessor
+ from peft import LoraConfig, get_peft_model
+ from trl import get_kbit_device_map, get_quantization_config
+ from hpsv3.model.differentiable_image_processor import Qwen2VLImageProcessor
+ try:
+     import flash_attn
+ except ImportError:
+     flash_attn = None
+     print("Flash Attention is not installed. Falling back to SDPA.")
+
+ def create_model_and_processor(
+     model_config,
+     peft_lora_config,
+     training_args,
+     cache_dir=None,
+     differentiable=False,
+ ):
+     # resolve the model dtype and quantization kwargs
+     torch_dtype = (
+         model_config.torch_dtype
+         if model_config.torch_dtype in ["auto", None]
+         else getattr(torch, model_config.torch_dtype)
+     )
+     quantization_config = get_quantization_config(model_config)
+     model_kwargs = dict(
+         revision=model_config.model_revision,
+         device_map=get_kbit_device_map() if quantization_config is not None else None,
+         quantization_config=quantization_config,
+         use_cache=False,
+     )
+
+     # create the processor and set padding
+     processor = AutoProcessor.from_pretrained(
+         model_config.model_name_or_path, padding_side="right", cache_dir=cache_dir
+     )
+
+     if differentiable:
+         processor.image_processor = Qwen2VLImageProcessor()
+
+     special_token_ids = None
+     if model_config.use_special_tokens:
+         special_tokens = ["<|Reward|>"]
+         processor.tokenizer.add_special_tokens(
+             {"additional_special_tokens": special_tokens}
+         )
+         special_token_ids = processor.tokenizer.convert_tokens_to_ids(special_tokens)
+
+     model = Qwen2VLRewardModelBT.from_pretrained(
+         model_config.model_name_or_path,
+         output_dim=model_config.output_dim,
+         reward_token=model_config.reward_token,
+         special_token_ids=special_token_ids,
+         torch_dtype=torch_dtype,
+         attn_implementation=(
+             "flash_attention_2" if not training_args.disable_flash_attn2 and flash_attn is not None else "sdpa"
+         ),
+         cache_dir=cache_dir,
+         rm_head_type=model_config.rm_head_type,
+         rm_head_kwargs=model_config.rm_head_kwargs,
+         **model_kwargs,
+     )
+
+     if model_config.use_special_tokens:
+         model.resize_token_embeddings(len(processor.tokenizer))
+
+     if training_args.bf16:
+         model.to(torch.bfloat16)
+     if training_args.fp16:
+         model.to(torch.float16)
+
+     # the reward head always stays in float32 for numerical stability
+     model.rm_head.to(torch.float32)
+
+     # create the LoRA / PEFT model if enabled
+     if peft_lora_config.lora_enable:
+         target_modules = find_target_linear_names(
+             model,
+             num_lora_modules=peft_lora_config.num_lora_modules,
+             lora_namespan_exclude=peft_lora_config.lora_namespan_exclude,
+         )
+         peft_config = LoraConfig(
+             target_modules=target_modules,
+             r=peft_lora_config.lora_r,
+             lora_alpha=peft_lora_config.lora_alpha,
+             lora_dropout=peft_lora_config.lora_dropout,
+             task_type=peft_lora_config.lora_task_type,
+             use_rslora=peft_lora_config.use_rslora,
+             bias="none",
+             modules_to_save=peft_lora_config.lora_modules_to_save,
+         )
+         model = get_peft_model(model, peft_config)
+     else:
+         peft_config = None
+
+     model.config.tokenizer_padding_side = processor.tokenizer.padding_side
+     model.config.pad_token_id = processor.tokenizer.pad_token_id
+
+     return model, processor, peft_config
+
+
+ def save_configs_to_json(data_config, training_args, model_config, peft_lora_config):
+     """
+     Save all configurations to a JSON file.
+     """
+     config_dict = {
+         "data_config": asdict(data_config),
+         "training_args": asdict(training_args),
+         "model_config": asdict(model_config),
+         "peft_lora_config": asdict(peft_lora_config),
+     }
+     # drop information that is specific to the local device
+     del config_dict["training_args"]["local_rank"]
+     del config_dict["training_args"]["_n_gpu"]
+
+     save_path = os.path.join(training_args.output_dir, "model_config.json")
+
+     os.makedirs(training_args.output_dir, exist_ok=True)
+     print(training_args.output_dir)
+
+     with open(save_path, "w") as f:
+         json.dump(config_dict, f, indent=4)
+
+
+ def set_requires_grad(parameters, requires_grad):
+     for p in parameters:
+         p.requires_grad = requires_grad
+
+ def train(config, local_rank=0, debug=False):
+
+     ## ===> Step 1: Parse arguments
+     (data_config, training_args, model_config, peft_lora_config), config_path = (
+         parse_args_with_yaml(
+             (DataConfig, TrainingConfig, ModelConfig, PEFTLoraConfig), config, is_train=True
+         )
+     )
+     training_args.output_dir = os.path.join(
+         training_args.output_dir, config.split("/")[-1].split(".")[0]
+     )
+     training_args.logging_dir = training_args.output_dir
+     # validate the LoRA config
+     assert not (
+         peft_lora_config.lora_enable and model_config.freeze_llm
+     ), "When using LoRA, the LLM should not be frozen. If you want to freeze the LLM, please disable LoRA."
+     if not peft_lora_config.lora_enable:
+         assert (
+             not peft_lora_config.vision_lora
+         ), "Error: model_config.lora_enable is not enabled, but model_config.vision_lora is enabled."
+     else:
+         if peft_lora_config.lora_namespan_exclude is None:
+             peft_lora_config.lora_namespan_exclude = []
+         if not peft_lora_config.vision_lora:
+             peft_lora_config.lora_namespan_exclude += ["visual"]
+
+     ## ===> Step 2: Load model and configure
+     model, processor, peft_config = create_model_and_processor(
+         model_config=model_config,
+         peft_lora_config=peft_lora_config,
+         training_args=training_args,
+     )
+
+     ## load pretrained weights if requested
+     if training_args.load_from_pretrained is not None:
+         model, checkpoint_step = load_model_from_checkpoint(
+             model,
+             training_args.load_from_pretrained,
+             training_args.load_from_pretrained_step,
+         )
+     model.train()
+
+     if peft_lora_config.lora_enable:
+         model_to_configure = model.model
+     else:
+         model_to_configure = model
+     # set requires_grad for the LLM
+     set_requires_grad(
+         model_to_configure.model.parameters(), not model_config.freeze_llm
+     )
+     set_requires_grad(model_to_configure.model.embed_tokens.parameters(), False)
+     if not peft_lora_config.vision_lora:
+         # set requires_grad for the visual encoder and merger
+         set_requires_grad(
+             model_to_configure.visual.parameters(), not model_config.freeze_vision_tower
+         )
+         set_requires_grad(
+             model_to_configure.visual.merger.parameters(), model_config.tune_merger
+         )
+
+         # trainable_visual_layers counts from the end of model.visual.blocks; set -1 to unfreeze all layers
+         if model_config.trainable_visual_layers:
+             assert model_config.trainable_visual_layers <= len(model_to_configure.visual.blocks), "trainable_visual_layers should be less than or equal to the number of visual blocks"
+             freeze_layer_num = len(model_to_configure.visual.blocks) - model_config.trainable_visual_layers if model_config.trainable_visual_layers > 0 else 0
+             for index, layer in enumerate(model_to_configure.visual.blocks):
+                 if index < freeze_layer_num:
+                     set_requires_grad(layer.parameters(), False)
+                 else:
+                     set_requires_grad(layer.parameters(), True)
+
+     # set requires_grad for the regression head
+     set_requires_grad(model_to_configure.rm_head.parameters(), True)
+
+     ## ===> Step 3: Load datasets and configure
+     train_dataset = PairwiseOriginalDataset(
+         data_config.train_json_list,
+         data_config.soft_label,
+         data_config.confidence_threshold,
+     )
+     test_set_dict = {}
+     for item in data_config.test_json_list:
+         test_set_dict[item[0]] = PairwiseOriginalDataset(
+             item[1],
+             data_config.soft_label,
+             data_config.confidence_threshold,
+         )
+
+     print(f"===> Selected {len(train_dataset)} samples for training.")
+     for key, value in test_set_dict.items():
+         print(f"===> Selected {len(value)} samples for {key} testing.")
+
+     num_gpu = int(os.environ.get("WORLD_SIZE", 1))
+     data_collator = QWen2VLDataCollator(
+         processor,
+         max_pixels=data_config.max_pixels,
+         min_pixels=data_config.min_pixels,
+         with_instruction=data_config.with_instruction,
+         use_special_tokens=model_config.use_special_tokens,
+     )
+     compute_metrics = partial(compute_multi_attr_accuracy)
+
+     actual_batch_size = (
+         training_args.per_device_train_batch_size
+         * training_args.gradient_accumulation_steps
+         * num_gpu
+     )
+     total_steps = (
+         training_args.num_train_epochs * len(train_dataset) // actual_batch_size
+     )
+     if training_args.save_epochs is not None:
+         training_args.save_steps = round(
+             training_args.save_epochs * len(train_dataset) / actual_batch_size
+         )
+     if training_args.eval_epochs is not None:
+         training_args.eval_steps = round(
+             training_args.eval_epochs * len(train_dataset) / actual_batch_size
+         )
+     if training_args.logging_epochs is not None:
+         training_args.logging_steps = round(
+             training_args.logging_epochs * len(train_dataset) / actual_batch_size
+         )
+
+     if training_args.local_rank == -1 or training_args.local_rank == 0:
+         print(f"===> Using {num_gpu} GPUs.")
+         print(f"===> Total Batch Size: {actual_batch_size}")
+         print(f"===> Training Epochs: {training_args.num_train_epochs}")
+         print(f"===> Total Steps: {total_steps}")
+         print(f"===> Save Steps: {training_args.save_steps}")
+         print(f"===> Eval Steps: {training_args.eval_steps}")
+         print(f"===> Logging Steps: {training_args.logging_steps}")
+
+     ## ===> Step 4: Save configs for re-checking
+     if training_args.local_rank == -1 or training_args.local_rank == 0:
+         save_configs_to_json(data_config, training_args, model_config, peft_lora_config)
+
+     print(train_dataset)
+     ## ===> Step 5: Start training!
+
+     special_token_ids = model.special_token_ids
+     callbacks = []
+     if special_token_ids is not None:
+         callbacks.append(PartialEmbeddingUpdateCallback(special_token_ids))
+
+     trainer = VLMRewardTrainer(
+         model=model,
+         compute_metrics=compute_metrics,
+         data_collator=data_collator,
+         args=training_args,
+         train_dataset=train_dataset,
+         eval_dataset=(test_set_dict if training_args.conduct_eval else None),
+         peft_config=peft_config,
+         callbacks=callbacks,
+         loss_type=model_config.loss_type,
+         loss_hyperparameters=model_config.loss_hyperparameters,
+         tokenizer=processor.tokenizer,
+         tied_threshold=data_config.tied_threshold,
+         visualization_steps=training_args.visualization_steps,
+         max_viz_samples=training_args.max_viz_samples,
+     )
+     trainer.train()
+
+     if training_args.local_rank == -1 or training_args.local_rank == 0:
+         model_state_dict = model.state_dict()
+         torch.save(
+             model_state_dict, os.path.join(training_args.output_dir, "final_model.pth")
+         )
+         model.config.save_pretrained(training_args.output_dir)
+
+
+ if __name__ == "__main__":
+     fire.Fire(train)
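
Because the entry point is exposed through fire.Fire(train), training can also be launched programmatically along these lines; the config path here is a placeholder assumption, not a file guaranteed by this commit:

from hpsv3.train import train

# hypothetical YAML path; any config matching the dataclasses in hpsv3/utils/parser.py should work
train(config="hpsv3/config/HPSv3_7B.yaml")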
hpsv3/utils/parser.py ADDED
@@ -0,0 +1,150 @@
1
+ import sys
2
+ import yaml
3
+ from pathlib import Path
4
+ from typing import Any, Optional, Union, Tuple, List, Literal
5
+ from omegaconf import OmegaConf
6
+ from transformers import HfArgumentParser
7
+ from dataclasses import dataclass, field
8
+ from transformers import TrainingArguments
9
+
10
+ @dataclass
11
+ class DataConfig:
12
+ train_json_list: List[str] = field(default_factory=lambda: ["/path/to/dataset/meta_data.json"])
13
+ val_json_list: List[str] = field(default_factory=lambda: ["/path/to/dataset/meta_data.json"])
14
+ test_json_list: List[str] = field(default_factory=lambda: ["/path/to/dataset/meta_data.json"])
15
+ soft_label: bool = False
16
+ confidence_threshold: Optional[float] = None
17
+ max_pixels: Optional[int] = 256 * 28 * 28 # Default max pixels
18
+ min_pixels: Optional[int] = 256 * 28 * 28
19
+ with_instruction: bool = True
20
+ tied_threshold: Optional[float] = None
21
+
22
+ @dataclass
23
+ class TrainingConfig(TrainingArguments):
24
+ max_grad_norm: Optional[float] = 1.0
25
+ dataset_num_proc: Optional[int] = None
26
+ center_rewards_coefficient: Optional[float] = None
27
+ disable_flash_attn2: bool = field(default=False)
28
+ disable_dropout: bool = field(default=False)
29
+
30
+ vision_lr: Optional[float] = None
31
+ merger_lr: Optional[float] = None
32
+ rm_head_lr: Optional[float] = None
33
+ special_token_lr: Optional[float] = None
34
+
35
+ conduct_eval: Optional[bool] = True
36
+ load_from_pretrained: str = None
37
+ load_from_pretrained_step: int = None
38
+ logging_epochs: Optional[float] = None
39
+ eval_epochs: Optional[float] = None
40
+ save_epochs: Optional[float] = None
41
+ remove_unused_columns: Optional[bool] = False
42
+
43
+ save_full_model: Optional[bool] = False
44
+
45
+ # Visualization parameters
46
+ visualization_steps: Optional[int] = 100
47
+ max_viz_samples: Optional[int] = 4
48
+
49
+ @dataclass
50
+ class PEFTLoraConfig:
51
+ lora_enable: bool = False
52
+ vision_lora: bool = False
53
+ lora_r: int = 16
54
+ lora_alpha: int = 32
55
+ lora_dropout: float = 0.05
56
+ lora_target_modules: Optional[List[str]] = None
57
+ lora_namespan_exclude: Optional[List[str]] = None
58
+ lora_modules_to_save: Optional[List[str]] = None
59
+ lora_task_type: str = "CAUSAL_LM"
60
+ use_rslora: bool = False
61
+ num_lora_modules: int = -1
62
+
63
+ def __post_init__(self):
64
+ if (
65
+ isinstance(self.lora_target_modules, list)
66
+ and len(self.lora_target_modules) == 1
67
+ ):
68
+ self.lora_target_modules = self.lora_target_modules[0]
69
+
70
+ if (
71
+ isinstance(self.lora_namespan_exclude, list)
72
+ and len(self.lora_namespan_exclude) == 1
73
+ ):
74
+ self.lora_namespan_exclude = self.lora_namespan_exclude[0]
75
+
76
+
77
+ @dataclass
78
+ class ModelConfig:
79
+ model_name_or_path: Optional[str] = None
80
+ model_revision: str = "main"
81
+ rm_head_type: str = "default"
82
+ rm_head_kwargs: Optional[dict] = None
83
+ output_dim: int = 1
84
+
85
+ use_special_tokens: bool = False
86
+
87
+ freeze_vision_tower: bool = field(default=False)
88
+ freeze_llm: bool = field(default=False)
89
+ tune_merger: bool = field(default=False)
90
+ trainable_visual_layers: Optional[int] = -1
91
+
92
+ torch_dtype: Optional[Literal["auto", "bfloat16", "float16", "float32"]] = None
93
+ trust_remote_code: bool = False
94
+ attn_implementation: Optional[str] = None
95
+ load_in_8bit: bool = False
96
+ load_in_4bit: bool = False
97
+ bnb_4bit_quant_type: Literal["fp4", "nf4"] = "nf4"
98
+ use_bnb_nested_quant: bool = False
99
+ reward_token: Literal["last", "mean", "special"] = "last"
100
+ loss_type: Literal["bt", "reg", "btt", "margin", "constant_margin", "scaled"] = (
101
+ "regular"
102
+ )
103
+ loss_hyperparameters: dict = field(default_factory=lambda: {})
104
+ checkpoint_path: Optional[str] = None
105
+
106
+ def __post_init__(self):
107
+ if self.load_in_8bit and self.load_in_4bit:
108
+ raise ValueError("You can't use 8 bit and 4 bit precision at the same time")
109
+
115
+
116
+
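The quantization fields above map naturally onto a transformers BitsAndBytesConfig; a sketch assuming that mapping (not code from this repo):

    import torch
    from transformers import BitsAndBytesConfig

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,                      # load_in_4bit
        bnb_4bit_quant_type="nf4",              # bnb_4bit_quant_type
        bnb_4bit_use_double_quant=False,        # use_bnb_nested_quant
        bnb_4bit_compute_dtype=torch.bfloat16,  # torch_dtype
    )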
117
+ ########## Argument parsing with YAML support ##########
118
+
119
+ def parse_args_with_yaml(
120
+ dataclass_types: Tuple[type, ...],
121
+ config_path: Optional[str] = None,
122
+ allow_extra_keys: bool = True,
123
+ is_train: bool = True,
124
+ ) -> Tuple[Any, ...]:
125
+ """
126
+ Parse arguments using HfArgumentParser with OmegaConf for YAML support.
127
+
128
+ Args:
129
+ dataclass_types: Tuple of dataclass types for HfArgumentParser
130
+ config_path: Path to the YAML config file (if None, taken from sys.argv[1])
131
+ allow_extra_keys: Whether to allow extra keys in the config
+ is_train: If False, the "deepspeed" entry is dropped from the config
132
+
133
+ Returns:
134
+ Tuple of parsed dataclass instances, plus the config path
135
+ """
136
+ # Fall back to the first command-line argument when no config path is given
+ if config_path is None:
+ import sys
+ config_path = sys.argv[1]
137
+ # Load the YAML config into a plain dict
138
+ args = OmegaConf.to_container(OmegaConf.load(config_path))
139
+ if not is_train:
140
+ args.pop('deepspeed', None)
141
+
142
+ # Parse with HfArgumentParser
143
+ parser = HfArgumentParser(dataclass_types)
144
+ return parser.parse_dict(args, allow_extra_keys=allow_extra_keys), config_path
145
+
146
+
147
+ if __name__ == "__main__":
148
+ (data_config, training_args, model_config, peft_lora_config), config_path = parse_args_with_yaml(
149
+ (DataConfig, TrainingConfig, ModelConfig, PEFTLoraConfig)
150
+ )
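A minimal usage sketch for the parser above (the YAML path is hypothetical; keys in the file are matched against the dataclass fields, with extra keys tolerated by default):

    configs, cfg_path = parse_args_with_yaml(
        (DataConfig, TrainingConfig, ModelConfig, PEFTLoraConfig),
        config_path="hpsv3/config/example.yaml",  # hypothetical path
    )
    data_config, training_args, model_config, peft_lora_config = configs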
hpsv3/utils/training_utils.py ADDED
@@ -0,0 +1,158 @@
1
+ import torch
2
+ import os
3
+ import glob
4
+ import safetensors.torch  # the torch submodule must be imported explicitly for load_file
+ from typing import Dict
5
+
6
+
7
+ def maybe_zero_3(param, ignore_status=False, name=None):
8
+ from deepspeed import zero
9
+ from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
10
+
11
+ if hasattr(param, "ds_id"):
12
+ if param.ds_status == ZeroParamStatus.NOT_AVAILABLE:
13
+ if not ignore_status:
14
+ print(
15
+ f"Parameter {name} is not available in ZeRO-3, please check the ZeRO-3 status."
16
+ )
17
+ with zero.GatheredParameters([param]):
18
+ param = param.data.detach().cpu().clone()
19
+ else:
20
+ param = param.detach().cpu().clone()
21
+ return param
22
+
23
+
24
+ # Borrowed from peft.utils.get_peft_model_state_dict
25
+ def get_peft_state_maybe_zero_3(named_params, bias):
26
+ if bias == "none":
27
+ to_return = {k: t for k, t in named_params if "lora_" in k}
28
+ elif bias == "all":
29
+ to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
30
+ elif bias == "lora_only":
31
+ to_return = {}
32
+ maybe_lora_bias = {}
33
+ lora_bias_names = set()
34
+ for k, t in named_params:
35
+ if "lora_" in k:
36
+ to_return[k] = t
37
+ bias_name = k.split("lora_")[0] + "bias"
38
+ lora_bias_names.add(bias_name)
39
+ elif "bias" in k:
40
+ maybe_lora_bias[k] = t
41
+ for k, t in maybe_lora_bias:
42
+ if bias_name in lora_bias_names:
43
+ to_return[bias_name] = t
44
+ else:
45
+ raise NotImplementedError
46
+ to_return = {k: maybe_zero_3(v, ignore_status=True) for k, v in to_return.items()}
47
+ return to_return
48
+
49
+
50
+ def get_peft_state_non_lora_maybe_zero_3(named_params, require_grad_only=True):
51
+ to_return = {k: t for k, t in named_params if "lora_" not in k}
52
+ if require_grad_only:
53
+ to_return = {k: t for k, t in to_return.items() if t.requires_grad}
54
+ to_return = {
55
+ k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in to_return.items()
56
+ }
57
+ return to_return
58
+
59
+
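Sketch of typical save-time usage of the two helpers above (the `model` handle is illustrative; the file name matches the loader further down):

    lora_sd = get_peft_state_maybe_zero_3(model.named_parameters(), bias="none")
    non_lora_sd = get_peft_state_non_lora_maybe_zero_3(model.named_parameters())
    torch.save(non_lora_sd, "non_lora_state_dict.pth")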
60
+ def _insert_adapter_name_into_state_dict(
61
+ state_dict: Dict[str, torch.Tensor], adapter_name: str, parameter_prefix: str
62
+ ) -> Dict[str, torch.Tensor]:
63
+ """Utility function to remap the state_dict keys to fit the PEFT model by inserting the adapter name."""
64
+ peft_model_state_dict = {}
65
+ for key, val in state_dict.items():
66
+ if parameter_prefix in key:
67
+ suffix = key.split(parameter_prefix)[1]
68
+ if "." in suffix:
69
+ suffix_to_replace = ".".join(suffix.split(".")[1:])
70
+ key = key.replace(
71
+ suffix_to_replace, f"{adapter_name}.{suffix_to_replace}"
72
+ )
73
+ else:
74
+ key = f"{key}.{adapter_name}"
75
+ peft_model_state_dict[key] = val
76
+ else:
77
+ peft_model_state_dict[key] = val
78
+ return peft_model_state_dict
79
+
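For example (illustrative key, adapter_name="default"), the remapping above turns "model.q_proj.lora_A.weight" into "model.q_proj.lora_A.default.weight".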
80
+
81
+ def save_video(tensor, path):
82
+ from torchvision.io import write_video
83
+
84
+ # Expects a float tensor in [0, 1] with shape (T, C, H, W)
+ tensor = tensor * 255.0
85
+ tensor = tensor.permute(0, 2, 3, 1)  # write_video expects (T, H, W, C)
86
+ tensor = tensor.clamp(0, 255).byte()
87
+ write_video(path, tensor, 4, video_codec="h264")  # fixed 4 fps
88
+
89
+
90
+ def load_model_from_checkpoint(model, checkpoint_dir, checkpoint_step):
91
+ checkpoint_paths = glob.glob(os.path.join(checkpoint_dir, "checkpoint-*"))
92
+ checkpoint_paths.sort(key=lambda x: int(x.split("-")[-1]), reverse=True)
+ if not checkpoint_paths:
+ raise FileNotFoundError(f"No checkpoint-* directories found in {checkpoint_dir}")
93
+
94
+ if checkpoint_step is None or checkpoint_step == -1:
95
+ # get the latest checkpoint
96
+ checkpoint_path = checkpoint_paths[0]
97
+ print(
98
+ f"===> Checkpoint step is not provided, using the latest checkpoint: {checkpoint_path}"
99
+ )
100
+ else:
101
+ checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint-{checkpoint_step}")
102
+ if checkpoint_path not in checkpoint_paths:
103
+ checkpoint_path = checkpoint_paths[0]
104
+ print(
105
+ f"===> Checkpoint step {checkpoint_step} not found, using the latest checkpoint: {checkpoint_path}"
106
+ )
107
+ else:
108
+ print(
109
+ f"===> Checkpoint step {checkpoint_step} found, using the specified checkpoint: {checkpoint_path}"
110
+ )
111
+
112
+ checkpoint_step = checkpoint_path.split("checkpoint-")[-1].split("/")[0]
113
+
114
+ full_ckpt = os.path.join(checkpoint_path, "model.pth")
115
+ lora_ckpt = os.path.join(checkpoint_path, "adapter_model.safetensors")
116
+ non_lora_ckpt = os.path.join(checkpoint_path, "non_lora_state_dict.pth")
117
+ if os.path.exists(full_ckpt):
118
+ model_state_dict = torch.load(full_ckpt, map_location="cpu")
119
+ model.load_state_dict(model_state_dict)
120
+ else:
121
+ lora_state_dict = safetensors.torch.load_file(lora_ckpt)
122
+ non_lora_state_dict = torch.load(non_lora_ckpt, map_location="cpu")
123
+
124
+ lora_state_dict = _insert_adapter_name_into_state_dict(
125
+ lora_state_dict, adapter_name="default", parameter_prefix="lora_"
126
+ )
127
+
128
+ model_state_dict = model.state_dict()
129
+ model_state_dict.update(non_lora_state_dict)
130
+ model_state_dict.update(lora_state_dict)
131
+ model.load_state_dict(model_state_dict)
132
+
133
+ return model, checkpoint_step
134
+
135
+
136
+ def find_target_linear_names(
137
+ model, num_lora_modules=-1, lora_namespan_exclude=None, verbose=False
138
+ ):
139
+ """
140
+ Find the target linear modules for LoRA.
141
+ """
142
+ lora_namespan_exclude = lora_namespan_exclude or []  # avoid a mutable default argument
+ linear_cls = torch.nn.Linear
143
+ embedding_cls = torch.nn.Embedding
144
+ lora_module_names = []
145
+
146
+ for name, module in model.named_modules():
147
+ if any(ex_keyword in name for ex_keyword in lora_namespan_exclude):
148
+ # print(f"Excluding module: {name}")
149
+ continue
150
+
151
+ if isinstance(module, (linear_cls, embedding_cls)):
152
+ lora_module_names.append(name)
153
+
154
+ if num_lora_modules > 0:
155
+ lora_module_names = lora_module_names[-num_lora_modules:]
156
+ if verbose:
157
+ print(f"Found {len(lora_module_names)} lora modules: {lora_module_names}")
158
+ return lora_module_names
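A short usage sketch pairing the helper above with PEFT (the exclusion list is illustrative; r/alpha/dropout mirror the PEFTLoraConfig defaults):

    from peft import LoraConfig, get_peft_model

    target_modules = find_target_linear_names(model, lora_namespan_exclude=["visual"])
    lora_config = LoraConfig(
        r=16, lora_alpha=32, lora_dropout=0.05,
        target_modules=target_modules, task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)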
pretrained_models/download_pretrained_models.sh ADDED
@@ -0,0 +1,63 @@
1
+ #!/bin/bash
2
+
3
+ # Create pretrained_models directory
4
+ mkdir -p pretrained_models
5
+
6
+ # Model list (using array instead of associative array to avoid ordering issues)
7
+ models=(
8
+ "black-forest-labs/FLUX.1-dev:FLUX1-dev"
9
+ "stabilityai/stable-diffusion-3-medium-diffusers:SD3-medium"
10
+ "stabilityai/stable-diffusion-xl-base-1.0:SDXL-base"
11
+ "Kwai-Kolors/Kolors-diffusers:Kolors"
12
+ "THUDM/CogView4-6B:CogView4"
13
+ "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS:PixArt"
14
+ "Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers:HunyuanDiT"
15
+ "FoundationVision/Infinity"
16
+ "google/flan-t5-xl"
17
+ "Qwen/Qwen2-VL-7B-Instruct: Qwen2-VL-7B-Instruct"
18
+ "Qwen/Qwen2-VL-2B-Instruct: Qwen2-VL-2B-Instruct"
19
+ )
20
+
21
+ # Create tmux session and set up the first window
22
+ model_info="${models[0]}"
23
+ model_path="${model_info%:*}"
24
+ window_name="${model_info#*:}"
25
+ local_dir="${model_path##*/}"
26
+
27
+ # Set the first window name directly when creating session
28
+ tmux new-session -d -s download_pretrained_model -n "$window_name"
29
+
30
+ # Give tmux some time to initialize
31
+ sleep 0.5
32
+
33
+ # Set commands for the first window
34
+ tmux send-keys -t download_pretrained_model:"$window_name" "conda activate hpsv3" Enter
35
+ tmux send-keys -t download_pretrained_model:"$window_name" "export HF_ENDPOINT=https://alpha.hf-mirror.com" Enter
36
+ tmux send-keys -t download_pretrained_model:"$window_name" "cd pretrained_models" Enter
37
+ tmux send-keys -t download_pretrained_model:"$window_name" "while true; do huggingface-cli download $model_path --local-dir $local_dir && break || sleep 60; done" Enter
38
+
39
+ # Create new windows for remaining models
40
+ for i in $(seq 1 $((${#models[@]} - 1))); do
41
+ model_info="${models[$i]}"
42
+ model_path="${model_info%:*}"
43
+ window_name="${model_info#*:}"
44
+ local_dir="${model_path##*/}"
45
+
46
+ # Create new window
47
+ tmux new-window -t download_pretrained_model -n "$window_name"
48
+ # Add small delay to ensure window creation is complete
49
+ sleep 0.2
50
+ tmux send-keys -t download_pretrained_model:"$window_name" "conda activate hpsv3" Enter
51
+ tmux send-keys -t download_pretrained_model:"$window_name" "export HF_ENDPOINT=https://alpha.hf-mirror.com" Enter
52
+ tmux send-keys -t download_pretrained_model:"$window_name" "cd pretrained_models" Enter
53
+ tmux send-keys -t download_pretrained_model:"$window_name" "while true; do huggingface-cli download $model_path --local-dir $local_dir && break || sleep 60; done" Enter
54
+ done
55
+ # Switch to the first window (using the first model's window name)
56
+ first_window_name="${models[0]#*:}"
57
+ tmux select-window -t download_pretrained_model:"$first_window_name"
58
+
59
+ echo "Created tmux session 'download_pretrained_model' and started downloading all models"
60
+ echo "Use 'tmux attach -t download_pretrained_model' to view download progress"
61
+ echo "Use Ctrl+B then press number keys to switch between different download windows"
62
+ echo "Use 'tmux list-windows -t download_pretrained_model' to view all windows"
63
+ echo "Use 'tmux kill-session -t download_pretrained_model' to end the session"
pyproject.toml ADDED
@@ -0,0 +1,64 @@
1
+ [build-system]
2
+ requires = ["setuptools>=45", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "hpsv3"
7
+ version = "1.0.0"
8
+ description = "HPSv3: Towards Wide-Spectrum Human Preference Score - A VLM-based preference model for image quality assessment"
9
+ readme = "README.md"
10
+ requires-python = ">=3.8"
11
+ license = {text = "MIT"}
12
+ authors = [
13
+ {name = "Yunhao Shui"},
14
+ {name = "Yuhang Ma"},
15
+ ]
16
+ keywords = ["machine learning", "computer vision", "human preference", "image quality", "VLM", "multimodal"]
17
+ classifiers = [
18
+ "Development Status :: 4 - Beta",
19
+ "Intended Audience :: Developers",
20
+ "Intended Audience :: Science/Research",
21
+ "License :: OSI Approved :: MIT License",
22
+ "Operating System :: OS Independent",
23
+ "Programming Language :: Python :: 3",
24
+ "Programming Language :: Python :: 3.8",
25
+ "Programming Language :: Python :: 3.9",
26
+ "Programming Language :: Python :: 3.10",
27
+ "Programming Language :: Python :: 3.11",
28
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
29
+ "Topic :: Software Development :: Libraries :: Python Modules",
30
+ ]
31
+ dependencies = [
32
+ "torch>=2.0.0",
33
+ "torchvision>=0.15.0",
34
+ "transformers==4.45.2",
35
+ "accelerate>=0.20.0",
36
+ "datasets>=2.10.0",
37
+ "diffusers>=0.20.0",
38
+ "Pillow>=9.0.0",
39
+ "numpy>=1.20.0",
40
+ "tqdm>=4.60.0",
41
+ "pyyaml>=6.0",
42
+ "omegaconf>=2.3.0",
43
+ "opencv-python>=4.5.0",
44
+ "safetensors>=0.3.0",
45
+ "einops>=0.6.0",
46
+ "qwen-vl-utils>=0.0.8",
47
+ "timm>=0.9.0",
48
+ "deepspeed>=0.12.0",
49
+ "peft>=0.8.0",
50
+ "trl>=0.7.0",
51
+ "fire>=0.7.0"
52
+ ]
53
+
54
+ [project.urls]
55
+ Homepage = "https://mizzenai.github.io/HPSv3/"
56
+ Source = "https://github.com/MizzenAI/HPSv3"
57
+ Documentation = "https://github.com/MizzenAI/HPSv3/blob/main/README.md"
58
+ Paper = "https://arxiv.org/abs/2411.07232"
59
+
60
+ [tool.setuptools.packages.find]
61
+ include = ["hpsv3*", "generate*", "evaluate*"]
62
+
63
+ [tool.setuptools.package-data]
64
+ hpsv3 = ["config/*.yaml", "config/ds_config/*.json"]
requirements.txt CHANGED
@@ -190,6 +190,6 @@ widgetsnbextension==4.0.14
  xxhash==3.5.0
  yarl==1.20.1
  zipp==3.22.0
+ # flash-attn==2.7.4.post1
  hpsv3
- hpsv2
- # flash-attn==2.7.4.post1
+ hpsv2