Spaces:

sdsdgwe
/

HPSv3

Runtime error

File size: 21,077 Bytes

122ed9f
dcbdbe2
 
 
 
 
6493431
dcbdbe2
122ed9f
dcbdbe2
 
 
 
 
 
 
 
 
122ed9f
dcbdbe2
 
 
 
 
 
 
 
 
 
 
 
 
 
6493431
dcbdbe2
 
 
 
 
 
 
 
 
6493431
dcbdbe2
 
 
 
6493431
dcbdbe2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6493431
dcbdbe2
 
6493431
dcbdbe2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6493431
 
dcbdbe2

import gradio as gr
import torch
import os
import sys
from PIL import Image
import uuid
import huggingface_hub
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from hpsv3.inference import HPSv3RewardInferencer
try:
    import ImageReward as RM
    from hpsv2.src.open_clip import create_model_and_transforms, get_tokenizer
except:
    RM = None
    create_model_and_transforms = None
    get_tokenizer = None
    print("ImageReward or HPSv2 dependencies not found. Skipping those models.")

from transformers import AutoProcessor, AutoModel

# --- Configuration ---
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
DTYPE = torch.bfloat16 if DEVICE == 'cuda' else torch.float32

# --- Model Configuration ---
MODEL_CONFIGS = {
    "HPSv3_7B": {
        "name": "HPSv3 7B",
        "type": "hpsv3"
    },
    "HPSv2": {
        "name": "HPSv2",
        "checkpoint_path": "xswu/HPSv2/HPS_v2.1_compressed.pt",
        "type": "hpsv2"
    },
    "ImageReward": {
        "name": "ImageReward v1.0",
        "checkpoint_path": "ImageReward-v1.0",
        "type": "imagereward"
    },
    "PickScore": {
        "name": "PickScore",
        "checkpoint_path": "yuvalkirstain/PickScore_v1",
        "type": "pickscore"
    },
    "CLIP": {
        "name": "CLIP ViT-H-14",
        "checkpoint_path": "laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
        "type": "clip"
    }
}

# --- Global Model Storage ---
current_models = {}
current_model_name = None

# --- Dynamic Model Loading Functions ---
def load_model(model_key, update_status_fn=None):
    """Load the specified model based on the model key."""
    global current_models, current_model_name
    
    if model_key == current_model_name and model_key in current_models:
        return current_models[model_key]
    
    if update_status_fn:
        update_status_fn(f"🔄 Loading {MODEL_CONFIGS[model_key]['name']}...")
    
    # Clear previous models to save memory
    current_models.clear()
    torch.cuda.empty_cache()
    
    config = MODEL_CONFIGS[model_key]
    
    try:
        if config["type"] == "hpsv3":
            checkpoint_path = huggingface_hub.hf_hub_download("MizzenAI/HPSv3", 'HPSv3.safetensors', repo_type='model')
            model = HPSv3RewardInferencer(
                device=DEVICE, 
                checkpoint_path=checkpoint_path
            )
        elif config["type"] == "hpsv2":
            model_obj, preprocess_train, preprocess_val = create_model_and_transforms(
                'ViT-H-14',
                'laion2B-s32B-b79K',
                precision='amp',
                device=DEVICE,
                jit=False,
                force_quick_gelu=False,
                force_custom_text=False,
                force_patch_dropout=False,
                force_image_size=None,
                pretrained_image=False,
                image_mean=None,
                image_std=None,
                light_augmentation=True,
                aug_cfg={},
                output_dict=True,
                with_score_predictor=False,
                with_region_predictor=False
            )
            checkpoint_path = huggingface_hub.hf_hub_download("xswu/HPSv2", 'HPS_v2.1_compressed.pt', repo_type='model')
            checkpoint = torch.load(checkpoint_path, map_location=DEVICE, weights_only=False)
            model_obj.load_state_dict(checkpoint['state_dict'])
            model_obj = model_obj.to(DEVICE).eval()
            tokenizer = get_tokenizer('ViT-H-14')
            model = {"model": model_obj, "preprocess_val": preprocess_val, "tokenizer": tokenizer}
        elif config["type"] == "imagereward":
            model = RM.load(config["checkpoint_path"])
        elif config["type"] == "pickscore":
            processor = AutoProcessor.from_pretrained('/preflab/models/CLIP-ViT-H-14-laion2B-s32B-b79K')
            model_obj = AutoModel.from_pretrained(config["checkpoint_path"]).eval().to(DEVICE)
            model = {"model": model_obj, "processor": processor}
        elif config["type"] == "clip":
            model_obj = AutoModel.from_pretrained(config["checkpoint_path"]).to(DEVICE)
            processor = AutoProcessor.from_pretrained(config["checkpoint_path"])
            model = {"model": model_obj, "processor": processor}
        else:
            raise ValueError(f"Unknown model type: {config['type']}")
        
        current_models[model_key] = model
        current_model_name = model_key
        
        if update_status_fn:
            update_status_fn(f"✅ {MODEL_CONFIGS[model_key]['name']} loaded successfully!")
        
        return model
    except Exception as e:
        error_msg = f"Error loading model {model_key}: {e}"
        print(error_msg)
        if update_status_fn:
            update_status_fn(f"❌ {error_msg}")
        return None

def score_with_model(model_key, image_paths, prompts):
    """Score images using the specified model."""
    model = load_model(model_key)
    if model is None:
        raise ValueError(f"Failed to load model {model_key}")
    
    config = MODEL_CONFIGS[model_key]
    
    if config["type"] == "hpsv3":
        rewards = model.reward(image_paths, prompts)
        return [reward[0].item() for reward in rewards]  # HPSv3 returns tensor with multiple values, take first
    elif config["type"] == "hpsv2":
        return score_hpsv2_batch(model, image_paths, prompts)
    elif config["type"] == "imagereward":
        return [model.score(prompt, image_path) for prompt, image_path in zip(prompts, image_paths)]
    elif config["type"] == "pickscore":
        return score_pickscore_batch(prompts, image_paths, model["model"], model["processor"])
    elif config["type"] == "clip":
        return score_clip_batch(model["model"], model["processor"], image_paths, prompts)
    else:
        raise ValueError(f"Unknown model type: {config['type']}")

def score_hpsv2_batch(model_dict, image_paths, prompts):
    """Score using HPSv2 model."""
    model = model_dict['model']
    preprocess_val = model_dict['preprocess_val']
    tokenizer = model_dict['tokenizer']

    # 批量处理图片
    images = [preprocess_val(Image.open(p)).unsqueeze(0)[:,:3,:,:] for p in image_paths]
    images = torch.cat(images, dim=0).to(device=DEVICE)
    texts = tokenizer(prompts).to(device=DEVICE)
    with torch.no_grad():
        outputs = model(images, texts)
        image_features, text_features = outputs["image_features"], outputs["text_features"]
        logits_per_image = image_features @ text_features.T
        hps_scores = torch.diagonal(logits_per_image).cpu()
    return [score.item() for score in hps_scores]

def score_pickscore_batch(prompts, image_paths, model, processor):
    """Score using PickScore model."""
    pil_images = [Image.open(p) for p in image_paths]
    image_inputs = processor(
        images=pil_images,
        padding=True,
        truncation=True,
        max_length=77,
        return_tensors="pt",
    ).to(DEVICE)
    
    text_inputs = processor(
        text=prompts,
        padding=True,
        truncation=True,
        max_length=77,
        return_tensors="pt",
    ).to(DEVICE)

    with torch.no_grad():
        image_embs = model.get_image_features(**image_inputs)
        image_embs = image_embs / torch.norm(image_embs, dim=-1, keepdim=True)
        text_embs = model.get_text_features(**text_inputs)
        text_embs = text_embs / torch.norm(text_embs, dim=-1, keepdim=True)
        scores = model.logit_scale.exp() * (text_embs @ image_embs.T)
        return [scores[i, i].cpu().item() for i in range(len(prompts))]

def score_clip_batch(model, processor, image_paths, prompts):
    """Score using CLIP model."""
    pil_images = [Image.open(p) for p in image_paths]
    image_inputs = processor(
        images=pil_images,
        padding=True,
        truncation=True,
        max_length=77,
        return_tensors="pt",
    ).to(DEVICE)
    
    text_inputs = processor(
        text=prompts,
        padding=True,
        truncation=True,
        max_length=77,
        return_tensors="pt",
    ).to(DEVICE)

    with torch.no_grad():
        image_embs = model.get_image_features(**image_inputs)
        image_embs = image_embs / torch.norm(image_embs, dim=-1, keepdim=True)
        text_embs = model.get_text_features(**text_inputs)
        text_embs = text_embs / torch.norm(text_embs, dim=-1, keepdim=True)
        scores = image_embs @ text_embs.T
        return [scores[i, i].cpu().item() for i in range(len(prompts))]

# Load default model
print("Loading default HPSv3 model...")
load_model("HPSv3_7B")
print("Model loaded successfully.")

# --- Helper Functions ---
def get_score_interpretation(score):
    """Returns a color-coded qualitative interpretation of the score."""
    if score is None:
        return ""
    
    if score < 0:
        color = "#ef4444"  # Modern red
        bg_color = "rgba(239, 68, 68, 0.1)"
        icon = "❌"
        feedback = "Poor Quality"
        comment = "The image has significant quality issues or doesn't match the prompt well."
    elif score < 5:
        color = "#f59e0b"  # Modern amber
        bg_color = "rgba(245, 158, 11, 0.1)"
        icon = "⚠️"
        feedback = "Needs Improvement"
        comment = "The image is acceptable but could be enhanced in quality or prompt alignment."
    elif score < 10:
        color = "#10b981"  # Modern emerald
        bg_color = "rgba(16, 185, 129, 0.1)"
        icon = "✅"
        feedback = "Good Quality"
        comment = "A well-crafted image that aligns nicely with the given prompt."
    else:  # score >= 10
        color = "#06d6a0"  # Vibrant teal
        bg_color = "rgba(6, 214, 160, 0.1)"
        icon = "⭐"
        feedback = "Excellent!"
        comment = "Outstanding quality and perfect alignment with the prompt."
    
    return f"""
    <div style='
        background: {bg_color};
        border: 2px solid {color};
        border-radius: 16px;
        padding: 20px;
        text-align: center;
        margin: 10px 0;
    '>
        <div style='font-size: 2rem; margin-bottom: 8px;'>{icon}</div>
        <h3 style='color: {color}; font-size: 1.4rem; font-weight: 700; margin: 8px 0;'>{feedback}</h3>
        <p style='color: #666; font-size: 0.95rem; margin: 0; line-height: 1.4;'>{comment}</p>
    </div>
    """

# --- Model Change Handler ---
def handle_model_change(model_key):
    """Handle model selection change."""
    global current_model_name
    
    if model_key != current_model_name:
        # Show loading status
        yield f"🔄 Loading {MODEL_CONFIGS[model_key]['name']}..."
        
        # Load the new model
        model = load_model(model_key)
        
        if model is not None:
            yield f"✅ Current model: {MODEL_CONFIGS[model_key]['name']}"
        else:
            yield f"❌ Failed to load {MODEL_CONFIGS[model_key]['name']}"
    else:
        yield f"✅ Current model: {MODEL_CONFIGS[model_key]['name']}"

# --- Prediction Function ---
def predict_score(image, prompt, model_name):
    """Takes Gradio inputs and returns the score, interpretation, and status."""
    if image is None:
        return None, "", "❌ Error: Please upload an image."
    if not prompt or not prompt.strip():
        return None, "", "❌ Error: Please enter a prompt."

    temp_dir = "temp_images_for_gradio"
    os.makedirs(temp_dir, exist_ok=True)
    temp_path = os.path.join(temp_dir, f"{uuid.uuid4()}.png")
    
    try:
        Image.fromarray(image).save(temp_path)
        scores = score_with_model(model_name, [temp_path], [prompt])
        score = round(scores[0], 4)
        interpretation = get_score_interpretation(score)
        return score, interpretation, "✅ Analysis completed successfully!"
    except Exception as e:
        print(f"An error occurred during inference: {e}")
        return None, "", f"❌ Processing error: {e}"
    finally:
        if os.path.exists(temp_path):
            os.remove(temp_path)

# --- Image Comparison Function ---
def compare_images(image1, image2, prompt, model_name):
    """Compare two images and determine which one is better based on the prompt."""
    if image1 is None or image2 is None:
        return None, None, "", "❌ Error: Please upload both images."
    if not prompt or not prompt.strip():
        return None, None, "", "❌ Error: Please enter a prompt."

    temp_dir = "temp_images_for_gradio"
    os.makedirs(temp_dir, exist_ok=True)
    temp_path1 = os.path.join(temp_dir, f"{uuid.uuid4()}_img1.png")
    temp_path2 = os.path.join(temp_dir, f"{uuid.uuid4()}_img2.png")
    
    try:
        Image.fromarray(image1).save(temp_path1)
        Image.fromarray(image2).save(temp_path2)
        
        # Get scores for both images
        scores = score_with_model(model_name, [temp_path1, temp_path2], [prompt, prompt])
        score1 = round(scores[0], 4)
        score2 = round(scores[1], 4)
        
        # Determine winner
        if score1 > score2:
            winner_text = f"🏆 **Image 1 is better!**\n\nImage 1 Score: **{score1}**\nImage 2 Score: **{score2}**\n\nDifference: **+{round(score1-score2, 4)}**"
        elif score2 > score1:
            winner_text = f"🏆 **Image 2 is better!**\n\nImage 1 Score: **{score1}**\nImage 2 Score: **{score2}**\n\nDifference: **+{round(score2-score1, 4)}**"
        else:
            winner_text = f"🤝 **It's a tie!**\n\nBoth images scored: **{score1}**"
        
        return score1, score2, winner_text, "✅ Comparison completed successfully!"
        
    except Exception as e:
        print(f"An error occurred during comparison: {e}")
        return None, None, "", f"❌ Processing error: {e}"
    finally:
        if os.path.exists(temp_path1):
            os.remove(temp_path1)
        if os.path.exists(temp_path2):
            os.remove(temp_path2)

# --- Gradio Interface ---
with gr.Blocks(theme=gr.themes.Soft(), title="HPSv3 - Human Preference Score v3") as demo:
    gr.HTML(f"""
    <div style="text-align: center; margin-bottom: 20px;">
        <h1>🎨 HPSv3: Human Preference Score v3</h1>
        <p>Evaluate image quality and alignment with prompts with multiple models.</p>
        <p><a href="https://mizzenai.github.io/HPSv3.project/" target="_blank">🌐 Project Website</a> | 
            <a href="https://huggingface.co/papers/2508.03789" target="_blank">📄 Paper</a> | 
            <a href="https://github.com/MizzenAI/HPSv3" target="_blank">💻 Code</a></p>
    </div>
    """)
    
    # Global model selector
    with gr.Row():
        model_selector = gr.Dropdown(
            choices=[(config["name"], key) for key, config in MODEL_CONFIGS.items()],
            value="HPSv3_7B",
            label="🤖 Select Model",
        )
        model_status = gr.Textbox(
            label="Model Status",
            value=f"✅ Current model: {MODEL_CONFIGS['HPSv3_7B']['name']}",
            interactive=False,
            scale=2
        )
    
    with gr.Tabs():
        # Tab 1: Single Image Scoring
        with gr.TabItem("📊 Image Scoring"):
            with gr.Row(equal_height=False):
                with gr.Column(scale=2):
                    with gr.Group():
                        gr.Markdown("### 🖼️ **Upload & Describe**")
                        image_input = gr.Image(
                            type="numpy", 
                            label="Upload Image", 
                            height=450
                        )
                        prompt_input = gr.Textbox(
                            label="Prompt Description", 
                            placeholder="Describe what the image should represent...",
                            lines=3,
                            max_lines=5
                        )
                
                with gr.Column(scale=1):
                    with gr.Group():
                        gr.Markdown("### 🎯 **Quality Assessment**")
                        score_output = gr.Number(
                            label="Score", 
                            elem_id="score-output",
                            precision=4
                        )
                        interpretation_output = gr.Markdown(label="")
                        status_output = gr.Textbox(
                            label="Status", 
                            interactive=False
                        )
            submit_button = gr.Button(
                "🚀 Run Evaluation", 
                variant="primary",
                size="lg"
            )
            
            submit_button.click(
                fn=predict_score,
                inputs=[image_input, prompt_input, model_selector],
                outputs=[score_output, interpretation_output, status_output]
            )

            with gr.Group():
                gr.Examples(
                    examples=[
                        ["assets/example1.png", "cute chibi anime cartoon fox, smiling wagging tail with a small cartoon heart above sticker, high resolution, vibrant colors"],
                        ["assets/example2.png", "cute chibi anime cartoon fox, smiling wagging tail with a small cartoon heart above sticker, high resolution, vibrant colors"],
                    ],
                    inputs=[image_input, prompt_input],
                    outputs=[score_output, interpretation_output, status_output],
                    fn=lambda img, prompt: predict_score(img, prompt, "HPSv3_7B"),
                    cache_examples=False
                )
        
        # Tab 2: Image Comparison
        with gr.TabItem("⚖️ Image Comparison"):
            with gr.Row(equal_height=False):
                with gr.Column(scale=2):
                    with gr.Group():
                        gr.Markdown("### 🖼️ **Upload Images & Prompt**")
                        with gr.Row():
                            image1_input = gr.Image(
                                type="numpy", 
                                label="Image 1", 
                                height=300
                            )
                            image2_input = gr.Image(
                                type="numpy", 
                                label="Image 2", 
                                height=300
                            )
                        prompt_compare_input = gr.Textbox(
                            label="Prompt Description", 
                            placeholder="Describe what the images should represent...",
                            lines=3,
                            max_lines=5
                        )
                
                with gr.Column(scale=1):
                    with gr.Group():
                        gr.Markdown("### 🎯 **Comparison Results**")
                        score1_output = gr.Number(
                            label="Image 1 Score", 
                            precision=4
                        )
                        score2_output = gr.Number(
                            label="Image 2 Score", 
                            precision=4
                        )
                        comparison_result = gr.Markdown(label="Winner")
                        status_compare_output = gr.Textbox(
                            label="Status", 
                            interactive=False
                        )
            
            compare_button = gr.Button(
                "⚖️ Compare Images", 
                variant="primary",
                size="lg"
            )
            
            compare_button.click(
                fn=compare_images,
                inputs=[image1_input, image2_input, prompt_compare_input, model_selector],
                outputs=[score1_output, score2_output, comparison_result, status_compare_output]
            )

            with gr.Group():
                gr.Examples(
                    examples=[
                        ["assets/example1.png", "assets/example2.png", "cute chibi anime cartoon fox, smiling wagging tail with a small cartoon heart above sticker, high resolution, vibrant colors"],
                        ["assets/example2.png", "assets/example1.png", "cute chibi anime cartoon fox, smiling wagging tail with a small cartoon heart above sticker, high resolution, vibrant colors"],
                    ],
                    inputs=[image1_input, image2_input, prompt_compare_input],
                    outputs=[score1_output, score2_output, comparison_result, status_compare_output],
                    fn=lambda img1, img2, prompt: compare_images(img1, img2, prompt, "HPSv3_7B"),
                    cache_examples=False
                )

    # Model change handler
    model_selector.change(
        fn=handle_model_change,
        inputs=[model_selector],
        outputs=[model_status]
    )

def main():
    """Main function to launch the demo."""
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        favicon_path=None,
        show_error=True,
    )

if __name__ == "__main__":
    main()