Spaces:
Sleeping
Sleeping
import os
from urllib.parse import urljoin
# This module stores the constants used by the leaderboard.
# Model-info / summary column names for the image-to-video (I2V) tab.
MODEL_INFO_TAB_I2V = [
    "Model Name",
    "Resolution",
    "Duration",
    "FPS",
    "Total Score",
    "I2V Score",
    "Quality Score",
    "Selected Score",
]

# Model-info / summary column names for the text-to-image (T2I) tab.
MODEL_INFO_TAB_T2I = [
    "Model Name",
    "Resolution",
    "Total Score",
    "Selected Score",
]

# Model-info / summary column names for the image-to-image (I2I) tab.
MODEL_INFO_TAB_I2I = [
    "Model Name",
    "Total Score",
    "Selected Score",
]
# Model-info / summary column names for the text-to-video (T2V) tab.
MODEL_INFO_TAB_T2V = [
    "Model Name",
    "Resolution",
    "Duration",
    "FPS",
    "Total Score",
    "Selected Score",
]

# Category-level score columns shown for T2V.
DIM_INFO_DEFAULT_T2V = [
    "Creativity Score",
    "Commonsense Score",
    "Controllability Score",
    "Human Fidelity Score",
    "Physics Score",
    "Text Rendering Score",
    "Quality Score",
]

# All non-task T2V columns: model info first, then the category scores.
MODEL_INFO_T2V = MODEL_INFO_TAB_T2V + DIM_INFO_DEFAULT_T2V
# Fine-grained T2V dimensions, in display order.
TASK_INFO_T2V = [
    "Human Anatomy",
    "Human Clothes",
    "Human Identity",
    "Composition",
    # "Diversity",
    "Mechanics",
    "Material",
    "Thermotics",
    "Multi-View Consistency",
    "Dynamic Spatial Relationship",
    "Dynamic Attribute",
    "Motion Order Understanding",
    "Human Interaction",
    "Complex Landscape",
    "Complex Plot",
    "Camera Motion",
    "Motion Rationality",
    "Instance Preservation",
    "OCR Score",
    "Text Attribute",
    "Dynamic Text",
    "Subject Consistency",
    "Background Consistency",
    "Motion Smoothness",
    "Dynamic Degree",
    "Aesthetic Quality",
    "Imaging Quality",
    "Temporal Flickering",
]
# I2V dimensions that compare the generated video against the input image.
I2V_LIST = [
    # "Video-Text Camera Motion",
    "Video-Image Subject Consistency",
    "Video-Image Background Consistency",
]

# I2V dimensions that measure the quality of the generated video itself.
I2V_QUALITY_LIST = [
    "Subject Consistency",
    "Background Consistency",
    "Motion Smoothness",
    "Dynamic Degree",
    "Aesthetic Quality",
    "Imaging Quality",
    "Temporal Flickering",
]

# Every I2V dimension column: image-consistency dimensions first, then the
# quality dimensions. Built from the two lists above (a new list object) so
# the three constants cannot drift apart.
I2V_TAB = I2V_LIST + I2V_QUALITY_LIST
# Dimension columns of the text-to-image (T2I) tab.
T2I_TAB = [
    "Aesthetic",
    "Image Quality",
    "Prompt Semantic Alignment",
    "Text Rendering",
]
# Shared metric prefixes; private helpers used only to build I2I_TAB_DICT.
_I2I_BASE_METRICS = ['Aesthetic', 'Imaging', 'CLIP_Caption']
_I2I_REF_GEN_METRICS = _I2I_BASE_METRICS + ['CLIP_Reference']
_I2I_EDIT_METRICS = _I2I_BASE_METRICS + ['CLIP_Source', 'L1_Raw', 'VLLM_QA']

# Maps each I2I task name to the list of metric columns reported for it.
# Key order is kept as originally written, and every value is its own list
# object, so mutating one task's list never affects another task.
I2I_TAB_DICT = {
    # Reference generation: base metrics + reference similarity.
    'Face Reference Generation': _I2I_REF_GEN_METRICS + ['Face_Ref'],
    'Subject Reference Generation': _I2I_REF_GEN_METRICS + ['DINO_Ref'],
    'Style Reference Generation': _I2I_REF_GEN_METRICS + ['Style_Ref'],
    # Global and local editing tasks all share the same six metrics.
    **{
        task: list(_I2I_EDIT_METRICS)
        for task in (
            'Color Editing',
            'Face Editing',
            'Style Editing',
            'Texture Editing',
            'Motion Editing',
            'Scene Editing',
            'Subject Addition',
            'Subject Change',
            'Subject Removal',
            'Text Removal',
            'Text Render',
            'Composite Editing',
            'Inpainting',
            'Outpainting',
            'Local Subject Addition',
            'Local Subject Removal',
            'Local Text Render',
            'Local Text Removal',
        )
    },
    # Reference editing: editing metrics + one reference-similarity metric.
    'Virtual Try On': _I2I_EDIT_METRICS + ['DINO_Ref'],
    'Face Swap': _I2I_EDIT_METRICS + ['Face_Ref'],
    'Subject-guided Inpainting': _I2I_EDIT_METRICS + ['DINO_Ref'],
    'Style Reference Editing': _I2I_EDIT_METRICS + ['Style_Ref'],
    # Controllable generation: base metrics + a control-signal metric.
    'Pose-guided Generation': _I2I_BASE_METRICS + ['L1_Pose'],
    'Depth-guided Generation': _I2I_BASE_METRICS + ['L1_Depth'],
    'Edge-guided Generation': _I2I_BASE_METRICS + ['L1_Canny'],
    # Deblur has no CLIP_Caption metric, so it is spelled out in full.
    'Image Deblur': ['Aesthetic', 'Imaging', 'SSIM'],
    'Image Colorize': _I2I_BASE_METRICS + ['L1_Colorize', 'Colorfulness'],
}
# Maps lower-case I2I metric keys to the metric column names used in
# I2I_TAB_DICT.
DIM_KEY_TO_COL_I2I = {
    "aesthetic": "Aesthetic",
    "imaging": "Imaging",
    "clip_cap": "CLIP_Caption",
    "clip_src": "CLIP_Source",
    "clip_ref": "CLIP_Reference",
    "dino_ref": "DINO_Ref",
    "style_ref": "Style_Ref",
    "face_ref": "Face_Ref",
    "l1_raw": "L1_Raw",
    "l1_pose": "L1_Pose",
    "l1_depth": "L1_Depth",
    "l1_canny": "L1_Canny",
    "l1_colorize": "L1_Colorize",
    "ssim": "SSIM",
    "colorfulness": "Colorfulness",
    "vllmqa": "VLLM_QA",
}
# I2I task categories mapped to the tasks that belong to each category.
TASK_INFO_I2I = {
    'Controllable Generation': [
        'Image Colorize',
        'Image Deblur',
        'Pose-guided Generation',
        'Depth-guided Generation',
        'Edge-guided Generation',
    ],
    'Global Editing': [
        'Color Editing',
        'Face Editing',
        'Motion Editing',
        'Texture Editing',
        'Subject Addition',
        'Subject Removal',
        'Subject Change',
        'Style Editing',
        'Scene Editing',
        'Text Render',
        'Text Removal',
        'Composite Editing',
    ],
    'Local Editing': [
        'Inpainting',
        'Outpainting',
        'Local Subject Addition',
        'Local Subject Removal',
        'Local Text Render',
        'Local Text Removal',
    ],
    'Reference Generation': [
        'Face Reference Generation',
        'Subject Reference Generation',
        'Style Reference Generation',
    ],
    'Reference Editing': [
        'Face Swap',
        'Subject-guided Inpainting',
        'Virtual Try On',
        'Style Reference Editing',
    ],
}

# Category names in display order; taken from the dict keys so the two
# cannot diverge (dicts preserve insertion order).
CATEGORY_I2I = list(TASK_INFO_I2I)

# Flat list of all I2I task columns, in display order. Kept as an explicit
# literal because the 'Reference Editing' tasks are ordered differently here
# than in TASK_INFO_I2I ('Virtual Try On' comes first).
TASK_I2I = [
    # Controllable Generation
    'Image Colorize',
    'Image Deblur',
    'Pose-guided Generation',
    'Depth-guided Generation',
    'Edge-guided Generation',
    # Global Editing
    'Color Editing',
    'Face Editing',
    'Motion Editing',
    'Texture Editing',
    'Subject Addition',
    'Subject Removal',
    'Subject Change',
    'Style Editing',
    'Scene Editing',
    'Text Render',
    'Text Removal',
    'Composite Editing',
    # Local Editing
    'Inpainting',
    'Outpainting',
    'Local Subject Addition',
    'Local Subject Removal',
    'Local Text Render',
    'Local Text Removal',
    # Reference Generation
    'Face Reference Generation',
    'Subject Reference Generation',
    'Style Reference Generation',
    # Reference Editing
    'Virtual Try On',
    'Face Swap',
    'Subject-guided Inpainting',
    'Style Reference Editing',
]
# Raw T2I subject identifiers.
T2I_Subject_TAB = [
    "Cultural knowledge",
    "Physical Knowledge",
    "Chemistry",
    "people: portraits",
    "people: groups-and-activities",
    "nature-and-landscapes",
    "general-and-photorealistic",
    "physical-spaces",
    "Food and Drink",
    "Animals",
    "style",
    "3D rendering",
    "fantasy-and-mythical",
    "cartoon-and-illustration",
    "vintage-and-retro",
    "futuristic-and-sci-fi",
    "graphic-design-and-digital-rendering",
    "ui-ux-design",
    "text-and-typography",
    "text-and-typography-cn",
    "2d_spatial_relation",
    "3d_spatial_relation",
    "numeracy",
    "shape",
    "color",
]

# Maps each display label to its raw subject identifier from T2I_Subject_TAB.
T2I_SUBJECT_REVERSE_MAP = {
    # All (no filtering)
    "🌐 All": None,
    # Style & Rendering
    "🧊 3D Rendering": "3D rendering",
    "🐉 Fantasy & Mythical": "fantasy-and-mythical",
    "🖍️ Cartoon & Illustration": "cartoon-and-illustration",
    "📻 Vintage & Retro": "vintage-and-retro",
    "🚀 Futuristic & Sci-Fi": "futuristic-and-sci-fi",
    "🎨 Artistic Style": "style",
    # Design
    "🧩 Graphic & Digital Design": "graphic-design-and-digital-rendering",
    "🖥️ UI / UX Design": "ui-ux-design",
    "🔤 Text & Typography (EN)": "text-and-typography",
    "🔤 Text & Typography (CN)": "text-and-typography-cn",
    # People
    "🧍 Portraits": "people: portraits",
    "👥 Groups & Activities": "people: groups-and-activities",
    # World & Nature
    "🌄 Nature & Landscapes": "nature-and-landscapes",
    "📸 Photorealistic Scenes": "general-and-photorealistic",
    "🏠 Physical Spaces": "physical-spaces",
    "🍔 Food & Drink": "Food and Drink",
    "🐾 Animals": "Animals",
    # Knowledge & Reasoning
    "🏛️ Cultural Knowledge": "Cultural knowledge",
    "⚙️ Physical Knowledge": "Physical Knowledge",
    "🧪 Chemistry": "Chemistry",
    # Spatial & Attributes
    "📐 2D Spatial Relations": "2d_spatial_relation",
    "🧭 3D Spatial Relations": "3d_spatial_relation",
    "🔢 Numeracy": "numeracy",
    "🔺 Shape": "shape",
    "🎨 Color": "color",
}

# Display labels in order. Taken from the map keys: several labels contain
# an invisible emoji variation selector, so a hand-copied second list could
# silently stop matching the map.
T2I_SHOW_SUBJECT_TAB = list(T2I_SUBJECT_REVERSE_MAP)
# Per-dimension weights for the I2V tab.
DIM_WEIGHT_I2V = {
    "Video-Text Camera Motion": 0.1,
    "Video-Image Subject Consistency": 1,
    "Video-Image Background Consistency": 1,
    "Subject Consistency": 1,
    "Background Consistency": 1,
    "Motion Smoothness": 1,
    "Dynamic Degree": 0.5,
    "Aesthetic Quality": 1,
    "Imaging Quality": 1,
    "Temporal Flickering": 1,
}

# Per-dimension weights for the T2V quality dimensions.
DIM_WEIGHT_QUALITY_T2V = {
    "Subject Consistency": 1,
    "Background Consistency": 1,
    "Motion Smoothness": 1,
    "Dynamic Degree": 0.5,
    "Aesthetic Quality": 1,
    "Imaging Quality": 1,
    "Temporal Flickering": 1,
}

# Weights for the T2V total score. The last two keys ("Text Rendering",
# "Quality") are category names, not dimension names.
DIM_WEIGHT_T2V = {
    "Camera Motion": 1,
    "Complex Landscape": 1,
    "Complex Plot": 1,
    "Composition": 1,
    "Dynamic Attribute": 1,
    "Dynamic Spatial Relationship": 1,
    "Human Anatomy": 1,
    "Human Clothes": 1,
    "Human Identity": 1,
    "Human Interaction": 1,
    "Instance Preservation": 1,
    "Material": 1,
    "Mechanics": 1,
    "Motion Order Understanding": 1,
    "Motion Rationality": 1,
    "Multi-View Consistency": 1,
    "Thermotics": 1,
    "Text Rendering": 1.5,
    "Quality": 0.5,
}

# Per-dimension weights for the T2I tab.
DIM_WEIGHT_T2I = {
    "Aesthetic": 1,
    "Image Quality": 1,
    "Prompt Semantic Alignment": 1,
    "Text Rendering": 1,
}

# NOTE(review): presumably display scale factors for image scores — confirm
# against the code that reads this constant.
SHOW_DIM_WEIGHT_IMAGE = {
    "Aesthetic": 0.1,
    "Image Quality": 0.01,
}
# Component weights for combining semantic and quality scores.
SEMANTIC_WEIGHT = 1
QUALITY_WEIGHT = 4

# Component weights for the I2V score and the I2V quality score.
I2V_WEIGHT = 1.0
I2V_QUALITY_WEIGHT = 1.0
# Per-column datatype lists ('markdown' or 'number') for each tab's table.
# NOTE(review): presumably each list must line up one-to-one with the matching
# COLUMN_NAMES_* list — the lengths agree, but confirm against the UI code.

# T2V: 4 text columns, then one number per task, one per category score, and
# 2 more (Total Score, Selected Score).
T2V_TITLE_TYPE = ['markdown'] * 4 + ['number'] * (len(TASK_INFO_T2V) + len(DIM_INFO_DEFAULT_T2V) + 2)

# I2V: 4 text columns, then one number per dimension plus 4 summary scores.
I2V_TITLE_TYPE = ['markdown'] * 4 + ['number'] * (len(I2V_TAB) + 4)

# T2I: 2 text columns, then one number per dimension plus 2 summary scores.
# Derived from T2I_TAB like its siblings (previously a hard-coded 8-item list).
T2I_TITLE_TYPE = ['markdown'] * 2 + ['number'] * (len(T2I_TAB) + 2)

# I2I: 1 text column, 2 summary scores, then one number per task.
I2I_TITLE_TYPE = ['markdown'] + ['number'] * (len(TASK_I2I) + 2)
# Name of the dataset repository that holds the evaluation results.
SUBMISSION_NAME = "eval_results"

# Built with urljoin rather than os.path.join: os.path.join is a filesystem
# helper whose separator depends on the platform, so it is the wrong tool for
# URLs. The base ends with "/" so urljoin appends instead of replacing the
# last path segment.
SUBMISSION_URL = urljoin("https://huggingface.co/datasets/Liveme/", SUBMISSION_NAME)

# Result directories for each tab, relative to the working directory.
I2V_DIR = "./eval_results/eval_vbench_i2v_results_fix"
T2V_DIR = "./eval_results/eval_vbench_t2v_results_fix"
T2I_DIR = "./eval_results/eval_ibench_t2i_results_fix"
I2I_DIR = "./eval_results/eval_ibench_i2i_results_fix"
# Full column lists per tab: model-info / summary columns first, then the
# per-dimension (or per-task) columns.
COLUMN_NAMES_T2V = MODEL_INFO_T2V + TASK_INFO_T2V
COLUMN_NAMES_I2V = MODEL_INFO_TAB_I2V + I2V_TAB
COLUMN_NAMES_T2I = MODEL_INFO_TAB_T2I + T2I_TAB
COLUMN_NAMES_I2I = MODEL_INFO_TAB_I2I + TASK_I2I
# I2I data types
# Markdown/HTML header text for the top of the leaderboard page.
# NOTE: the identifier is spelled "LEADERBORAD" (sic); it is kept unchanged
# because it is this module's public name.
# Blank lines separate the blocks so that the markdown following each
# </div> is parsed as markdown instead of being absorbed into the HTML block.
LEADERBORAD_INTRODUCTION = """# VBench/IBench Leaderboard

*"Which Video/Image Generation Model is better?"*

🏆 Welcome to the leaderboard of the **VBench/IBench**! 🎦 *A Comprehensive Benchmark Suite for Video/Image Generative Models*

Our evaluation of the model's video generation capabilities is based on VBench.

<div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
<b>VBench:</b> <a href='https://github.com/Vchitect/VBench'><img src='https://img.shields.io/badge/VBench-green?style=social&logo=github'></a>
<a href='https://arxiv.org/abs/2311.17982'><img src='https://img.shields.io/badge/cs.CV-Paper-b31b1b?logo=arxiv&logoColor=red'></a>
<a href='https://vchitect.github.io/VBench-project/'><img src='https://img.shields.io/badge/VBench-Website-green?logo=googlechrome&logoColor=green'></a>
<a href='https://pypi.org/project/vbench/'><img src='https://img.shields.io/pypi/v/vbench'></a>
<a href='https://www.youtube.com/watch?v=7IhCC8Qqn8Y'><img src='https://img.shields.io/badge/YouTube-Video-c4302b?logo=youtube&logoColor=red'></a>
</div>

While our evaluation of the model's image generation capabilities is based on ICE-Bench.

<div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
<b>ICE-Bench:</b> <a href='https://github.com/ali-vilab/ICE-Bench'><img src='https://img.shields.io/badge/ICE_Bench-green?style=social&logo=github'></a>
<a href='https://arxiv.org/pdf/2503.14482'><img src='https://img.shields.io/badge/cs.CV-Paper-b31b1b?logo=arxiv&logoColor=red'></a>
<a href='https://ali-vilab.github.io/ICE-Bench-Page/'><img src='https://img.shields.io/badge/ICE_Bench-Website-red?logo=googlechrome&logoColor=red'></a>
</div>

Taking into account both model sampling costs and evaluation accuracy, we optimized the original process as follows:

- **Test Set Selection:** While covering all test points, we streamlined the evaluation set. This streamlining is effective, reducing sampling costs while still allowing for effective evaluation of the model's generation capabilities (with minimal deviation from the official leaderboard).
- **Manual Review of Results:** We manually reviewed each sampled evaluation result, improving the accuracy of the evaluation and reducing illusions during the model evaluation process.
- **Dimension Optimization:** We optimized the evaluation logic for certain dimensions, improving the corresponding accuracy.

**Sampled results**: The sampled videos/images for the models in the leaderboard were obtained by calling the official API and are all real-world test data. What specific details does the model sample from the video/image? See [HERE](https://huggingface.co/datasets/Liveme/model_sample_results/tree/main/sample_results)
"""
# Intentionally blank (a single newline).
TABLE_INTRODUCTION = """
"""
# Markdown "about" text describing every tab, score and dimension.
# NOTE: the identifier is spelled "LEADERBORAD" (sic); it is kept unchanged
# because it is this module's public name.
# Blank lines separate headings, paragraphs, lists and rules; without one, a
# "---" directly under a paragraph line is parsed as a heading underline
# rather than as a horizontal rule.
LEADERBORAD_INFO = """
## About this leaderboard

This demo provides a unified view of **VBench 2.0** (video generation evaluation) and **ICE-Bench / IBench** (image generation & editing evaluation).
Each tab corresponds to a task setting and reports multiple **dimensions**. Higher is better unless otherwise stated.

---

## Tab 1 — Text to Video (VBench 2.0)

This tab evaluates **text-conditioned video generation** across fine-grained dimensions, and also summarizes them into category scores.

### Summary scores

- **Total Score**: Weighted average over selected components (core dimensions, Text Rendering, and Quality). Core/Text Rendering use mean scores; Quality is normalized then weighted by per-dimension weights. Final score = sum(component×weight)/sum(weights).
- **Creativity Score**: How well the model can produce **diverse, appealing, well-composed** videos beyond trivial or repetitive outputs.
- **Commonsense Score**: Whether the generated content follows **commonsense constraints** (e.g., plausible actions and continuity).
- **Controllability Score**: How accurately the model can follow prompts that require **control and precision**, including complex motion/scene requirements.
- **Human Fidelity Score**: How realistically the model renders **humans** (identity, anatomy, clothing), avoiding artifacts and inconsistencies.
- **Physics Score**: Whether the video obeys **physical plausibility** (materials, mechanics, thermotics, multi-view consistency).
- **Text Rendering Score**: How well the model can generate and maintain **correct, readable, and temporally consistent text** in videos, including spelling accuracy, text attributes, and dynamic text behaviors.
- **Quality Score**: Overall visual and temporal quality, aggregated from quality-related dimensions (e.g., consistency, smoothness, aesthetics, imaging quality, and flickering).

### Dimension definitions (shown as columns)

- **Human Anatomy**: Correct human body structure and pose without obvious deformities.
- **Human Clothes**: Clothing details, texture, and consistency over time.
- **Human Identity**: Identity consistency of people across frames (faces, attributes, overall appearance).
- **Composition**: Visual composition quality (layout, framing, aesthetics of the scene arrangement).
- **Mechanics**: Plausibility of mechanical behaviors (motion, forces, interactions of objects).
- **Material**: Realistic material appearance and response (e.g., reflectance, transparency, rigidity).
- **Thermotics**: Thermal/energy-related plausibility (e.g., fire/smoke/heat effects behaving reasonably).
- **Multi-View Consistency**: Consistency of objects/scenes under viewpoint changes or camera movement.
- **Dynamic Spatial Relationship**: Correct spatial relations while objects move (distance, relative positions).
- **Dynamic Attribute**: Correct evolution of dynamic attributes (size, state, pose changes that match the prompt).
- **Motion Order Understanding**: Correct ordering of multi-step actions/events (cause → effect, step sequences).
- **Human Interaction**: Plausible interactions between humans and other humans/objects.
- **Complex Landscape**: Ability to render complex environments (many elements, depth, structure) coherently.
- **Complex Plot**: Ability to follow multi-entity, multi-event prompt narratives with temporal coherence.
- **Camera Motion**: Plausibility and stability of camera movement (pan/tilt/zoom/track) consistent with the scene.
- **Motion Rationality**: Whether motions are physically and semantically reasonable (no jittery, impossible trajectories).
- **Instance Preservation**: Identity and attribute preservation for key instances/objects throughout the video.
- **OCR Score**: Text recognition-based score reflecting whether rendered text is **legible and correctly spelled** (higher indicates better readability and accuracy).
- **Text Attribute**: Correctness of **text attributes** such as font style, size, color, orientation, and placement as required by the prompt.
- **Dynamic Text**: Ability to render **dynamic in-video text whose content changes over time**, focusing on the **accuracy/correctness of the changing text** as it updates across frames.
- **Subject Consistency**: Temporal consistency of the main subject throughout the video (reduces identity drift and sudden changes).
- **Background Consistency**: Temporal consistency of the background (reduces collapsing structures and flickering scenery).
- **Motion Smoothness**: Motion continuity across frames (reduces jitter, stutter, and abrupt motion).
- **Dynamic Degree**: The amount of motion and dynamics (too static vs. appropriately animated).
- **Aesthetic Quality**: Overall visual appeal (composition, pleasantness, and style quality).
- **Imaging Quality**: Low-level image quality (sharpness, exposure, noise, and artifacts).
- **Temporal Flickering**: Temporal stability of appearance (lower flicker = better temporal coherence).

---

## Tab 2 — Image to Video

This tab evaluates **video generation conditioned on an input image** (I2V). It focuses on how well the video preserves the input image content while producing temporally coherent motion and good visual quality.

### Dimension definitions (shown as columns)

- **Video-Image Subject Consistency**: The main subject in the video stays consistent with the input image (identity, shape, key attributes).
- **Video-Image Background Consistency**: The background layout/style remains consistent with the input image when appropriate.
- **Subject Consistency**: Temporal consistency of the subject within the generated video (no identity drift or sudden changes).
- **Background Consistency**: Temporal consistency of the background (no collapsing structures or flickering scenery).
- **Motion Smoothness**: Motion continuity across frames (reduces jitter, stutter, and abrupt motion).
- **Dynamic Degree**: The amount of motion and dynamics (too static vs. appropriately animated).
- **Aesthetic Quality**: Overall visual appeal (composition, pleasantness, style quality).
- **Imaging Quality**: Low-level image quality (sharpness, exposure, noise, artifacts).
- **Temporal Flickering**: Temporal stability of appearance (lower flicker = better temporal coherence).

### Notes on scores displayed in the table

- **Total Score**: Weighted combination of quality-related and I2V-related dimensions.
- **I2V Score**: The I2V consistency-focused component.
- **Quality Score**: The visual/temporal quality-focused component.
- **Selected Score**: Score computed from the currently selected dimensions in the UI.

---

## Tab 3 — Text to Image

This tab evaluates **text-to-image generation** and reports four core dimensions. A **Subject** selector is provided to inspect performance on specific prompt groups.

### Dimension definitions (shown as columns)

- **Aesthetic**: Perceptual attractiveness and overall artistic appeal of the image.
- **Image Quality**: Technical fidelity such as sharpness, artifacts, noise, and overall realism.
- **Prompt Semantic Alignment**: Semantic alignment between the prompt and the generated image measured via VQA.
- **Text Rendering**: Quality of text rendering in the generated image, including spelling accuracy, text attributes, and visual integration.

### Subject filtering

The **Subject** option narrows evaluation to a subset of prompts (e.g., style, people, knowledge, spatial relations) to analyze strengths/weaknesses by content type.

---

## Tab 4 — Image to Image

This tab evaluates **image editing and conditional image generation** tasks. The table can be filtered by task categories (buttons) and by specific tasks (checkboxes).

### Task categories (buttons)

- **Controllable Generation**: Generate an image under a strong control signal (e.g., depth/pose/edge guidance) while meeting prompt intent.
- **Global Editing**: Edit the overall image content or style (large-region changes) while keeping coherence.
- **Local Editing**: Edit specific regions (e.g., inpainting/outpainting, local additions/removals) without harming untouched areas.
- **Reference Generation**: Generate images that match a reference attribute (e.g., face/subject/style identity).
- **Reference Editing**: Edit an image to match a reference (swap/try-on/subject-guided edits) while preserving non-target content.

### Sub-dimension (task) definitions (shown as checkboxes)

Below are the **30 I2I tasks** used as sub-dimensions. Each checkbox corresponds to one task, and the table reports the score on that task.

#### Controllable Generation

- **Image Colorize**: Colorize a grayscale/low-color image with plausible, consistent colors while matching the prompt.
- **Image Deblur**: Restore sharp details from a blurred image while avoiding artifacts and preserving content.
- **Pose-guided Generation**: Generate an image that follows a given human pose/control signal and aligns with the prompt.
- **Depth-guided Generation**: Generate an image consistent with a provided depth map (geometry/layout) and the prompt.
- **Edge-guided Generation**: Generate an image consistent with a provided edge map (structure/contours) and the prompt.

#### Global Editing

- **Color Editing**: Globally adjust color tone/lighting (e.g., warm/cool, day/night) while preserving structure and identity.
- **Face Editing**: Edit facial attributes (e.g., age/expression) while keeping identity and avoiding unnatural artifacts.
- **Motion Editing**: Edit motion-related or dynamic attributes described in the prompt while keeping the rest coherent.
- **Texture Editing**: Modify textures/material appearance (e.g., fabric, wood, metal) without breaking shapes/edges.
- **Subject Addition**: Add a new subject into the scene with correct perspective, lighting, and composition.
- **Subject Removal**: Remove a target subject and plausibly inpaint the background.
- **Subject Change**: Replace a subject with another while maintaining scene layout, lighting, and consistency.
- **Style Editing**: Change the overall artistic style (e.g., watercolor, anime) while preserving semantic content.
- **Scene Editing**: Change the scene environment/background (e.g., beach → city) while keeping key subjects consistent.
- **Text Render**: Add/render text with correct spelling, placement, and visual integration.
- **Text Removal**: Remove text/watermarks while keeping the underlying content natural.
- **Composite Editing**: Perform multiple edits jointly (e.g., subject + style + color) with consistent results.

#### Local Editing

- **Inpainting**: Fill in a masked region with content consistent with the context and the prompt.
- **Outpainting**: Extend the image beyond its original borders while keeping style and structure consistent.
- **Local Subject Addition**: Add a subject in a specified local region while preserving the rest of the image.
- **Local Subject Removal**: Remove a subject in a specified local region and reconstruct the occluded/background area.
- **Local Text Render**: Render text in a specified local region with correct integration and legibility.
- **Local Text Removal**: Remove text in a specified local region while preserving surrounding textures/structures.

#### Reference Generation

- **Face Reference Generation**: Generate an image that matches a provided face identity reference while following the prompt.
- **Subject Reference Generation**: Generate an image that matches a provided subject reference (object/identity) and the prompt.
- **Style Reference Generation**: Generate an image that matches a provided style reference while respecting the prompt content.

#### Reference Editing

- **Virtual Try On**: Dress a person with a referenced garment while keeping pose/body identity and realistic fit.
- **Face Swap**: Swap faces between images while maintaining natural blending, lighting, and expression consistency.
- **Subject-guided Inpainting**: Inpaint a region guided by a subject reference, inserting the referenced subject coherently.
- **Style Reference Editing**: Edit an image to a target style given by a style reference while preserving key content.

### Interpreting the table

- **Total Score**: Overall score across the full I2I task set.
- **Selected Score**: Score across the currently selected tasks in the UI.
- **Per-task scores**: Each task reflects success at the specific editing/generation objective under that setting.
"""
# Disclaimer explaining why `camera motion` is left out of the I2V total
# score. Adjacent literals are concatenated into one single-line string.
I2V_CLAIM_TEXT = (
    "Since the open-sourced SVD models do not accept text input during the I2V stage, "
    "we are unable to evaluate its `camera motion` in terms of `video-text consistency`. "
    "The total score is calculated based on all dimensions except `camera motion`."
)
# Per-dimension "Min"/"Max" bounds.
# NOTE(review): presumably used for min-max normalization of raw scores —
# confirm against the scoring code.
NORMALIZE_DIC = {
    "Video-Text Camera Motion": {"Min": 0.0, "Max": 1.0},
    "Video-Image Subject Consistency": {"Min": 0.1462, "Max": 1.0},
    "Video-Image Background Consistency": {"Min": 0.2615, "Max": 1.0},
    "Subject Consistency": {"Min": 0.1462, "Max": 1.0},
    "Background Consistency": {"Min": 0.2615, "Max": 1.0},
    "Motion Smoothness": {"Min": 0.7060, "Max": 0.9975},
    "Dynamic Degree": {"Min": 0.0, "Max": 1.0},
    "Aesthetic Quality": {"Min": 0.0, "Max": 1.0},
    "Imaging Quality": {"Min": 0.0, "Max": 1.0},
    "Temporal Flickering": {"Min": 0.6293, "Max": 1.0},
}
# Maps each T2V dimension to the category it belongs to.
DIM2CAT_T2V = {
    # Human Fidelity
    "Human Anatomy": "Human Fidelity",
    "Human Identity": "Human Fidelity",
    "Human Clothes": "Human Fidelity",
    # Creativity
    # "Diversity": "Creativity",
    "Composition": "Creativity",
    # Controllability
    "Dynamic Spatial Relationship": "Controllability",
    "Dynamic Attribute": "Controllability",
    "Motion Order Understanding": "Controllability",
    "Human Interaction": "Controllability",
    "Complex Landscape": "Controllability",
    "Complex Plot": "Controllability",
    "Camera Motion": "Controllability",
    # Commonsense
    "Motion Rationality": "Commonsense",
    "Instance Preservation": "Commonsense",
    # Physics
    "Mechanics": "Physics",
    "Thermotics": "Physics",
    "Material": "Physics",
    "Multi-View Consistency": "Physics",
    # Text Rendering
    "OCR Score": "Text Rendering",
    "Text Attribute": "Text Rendering",
    "Dynamic Text": "Text Rendering",
    # Quality
    "Subject Consistency": "Quality",
    "Background Consistency": "Quality",
    "Motion Smoothness": "Quality",
    "Dynamic Degree": "Quality",
    "Temporal Flickering": "Quality",
    "Aesthetic Quality": "Quality",
    "Imaging Quality": "Quality",
}
# Maps underscore-separated T2V dimension keys to display column names.
DIM_KEY_TO_COL = {
    "Human_Anatomy": "Human Anatomy",
    "Human_Clothes": "Human Clothes",
    "Human_Identity": "Human Identity",
    "Composition": "Composition",
    "Mechanics": "Mechanics",
    "Material": "Material",
    "Thermotics": "Thermotics",
    "Multi-View_Consistency": "Multi-View Consistency",
    "Dynamic_Spatial_Relationship": "Dynamic Spatial Relationship",
    "Dynamic_Attribute": "Dynamic Attribute",
    "Motion_Order_Understanding": "Motion Order Understanding",
    "Human_Interaction": "Human Interaction",
    "Complex_Landscape": "Complex Landscape",
    "Complex_Plot": "Complex Plot",
    "Camera_Motion": "Camera Motion",
    "Motion_Rationality": "Motion Rationality",
    "Instance_Preservation": "Instance Preservation",
}

# Same mapping for the text-rendering dimensions.
DIM_KEY_TO_COL_TEXT_RENDERING = {
    "OCR_Score": "OCR Score",
    "Text_Attribute": "Text Attribute",
    "Dynamic_Text": "Dynamic Text",
}

# Same mapping for the quality dimensions (these keys are lower-case).
DIM_KEY_TO_COL_QUALITY = {
    "subject_consistency": "Subject Consistency",
    "background_consistency": "Background Consistency",
    "motion_smoothness": "Motion Smoothness",
    "dynamic_degree": "Dynamic Degree",
    "aesthetic_quality": "Aesthetic Quality",
    "imaging_quality": "Imaging Quality",
    "temporal_flickering": "Temporal Flickering",
}

# Same mapping for the total score and the category-level scores.
FINAL_FIXED_KEY_TO_COL = {
    "Total_Score": "Total Score",
    "Creativity_Score": "Creativity Score",
    "Commonsense_Score": "Commonsense Score",
    "Controllability_Score": "Controllability Score",
    "Human_Fidelity_Score": "Human Fidelity Score",
    "Physics_Score": "Physics Score",
    "Text_Rendering_Score": "Text Rendering Score",
    "Quality_Score": "Quality Score",
}