# V-STaR leaderboard constants (Hugging Face Space).
| import os | |
| # this is .py for store constants | |
| MODEL_INFO = [ | |
| "Model Name (clickable)", | |
| "Sampled by", | |
| "Evaluated by", | |
| "Accessibility", | |
| "Date", | |
| "Total Score", | |
| "Quality Score", | |
| "Semantic Score", | |
| "Selected Score", | |
| ] | |
| MODEL_INFO_TAB_QUALITY = [ | |
| "Model Name (clickable)", | |
| "Quality Score", | |
| "Selected Score" | |
| ] | |
| MODEL_INFO_TAB_I2V = [ | |
| "Model Name (clickable)", | |
| "Sampled by", | |
| "Evaluated by", | |
| "Accessibility", | |
| "Date", | |
| "Total Score", | |
| "I2V Score", | |
| "Quality Score", | |
| "Selected Score" | |
| ] | |
# All evaluation dimensions reported on the main leaderboard tab.
TASK_INFO = [
    "subject consistency",
    "background consistency",
    "temporal flickering",
    "motion smoothness",
    "dynamic degree",
    "aesthetic quality",
    "imaging quality",
    "object class",
    "multiple objects",
    "human action",
    "color",
    "spatial relationship",
    "scene",
    "appearance style",
    "temporal style",
    "overall consistency",
]

# Dimensions selected by default in the UI — currently all of TASK_INFO.
# Derived as an independent copy (not an alias and not a duplicated literal)
# so the two lists cannot silently drift apart or share mutations.
DEFAULT_INFO = list(TASK_INFO)
# Dimensions contributing to the quality score.
QUALITY_LIST = [
    "subject consistency",
    "background consistency",
    "temporal flickering",
    "motion smoothness",
    "aesthetic quality",
    "imaging quality",
    "dynamic degree",
]

# Dimensions contributing to the semantic score.
SEMANTIC_LIST = [
    "object class",
    "multiple objects",
    "human action",
    "color",
    "spatial relationship",
    "scene",
    "appearance style",
    "temporal style",
    "overall consistency",
]

# Columns shown on the quality tab: everything in QUALITY_LIST except
# "temporal flickering". Derived from QUALITY_LIST (preserving order)
# instead of a duplicated literal so the two stay in sync.
QUALITY_TAB = [dim for dim in QUALITY_LIST if dim != "temporal flickering"]
# I2V-specific (video-vs-text / video-vs-image) consistency dimensions.
I2V_LIST = [
    "Video-Text Camera Motion",
    "Video-Image Subject Consistency",
    "Video-Image Background Consistency",
]

# Generic quality dimensions evaluated on the I2V tab.
I2V_QUALITY_LIST = [
    "Subject Consistency",
    "Background Consistency",
    "Motion Smoothness",
    "Dynamic Degree",
    "Aesthetic Quality",
    "Imaging Quality",
    # "Temporal Flickering"
]

# Full I2V tab column order is exactly the two lists concatenated;
# derive it instead of repeating the literals so they cannot drift.
I2V_TAB = I2V_LIST + I2V_QUALITY_LIST
# Per-dimension weights used when aggregating the total score on the
# main tab. "dynamic degree" is down-weighted to 0.5; all others are 1.
DIM_WEIGHT = {
    "subject consistency": 1,
    "background consistency": 1,
    "temporal flickering": 1,
    "motion smoothness": 1,
    "aesthetic quality": 1,
    "imaging quality": 1,
    "dynamic degree": 0.5,
    "object class": 1,
    "multiple objects": 1,
    "human action": 1,
    "color": 1,
    "spatial relationship": 1,
    "scene": 1,
    "appearance style": 1,
    "temporal style": 1,
    "overall consistency": 1,
}

# Per-dimension weights for the I2V tab aggregation.
DIM_WEIGHT_I2V = {
    "Video-Text Camera Motion": 0.1,
    "Video-Image Subject Consistency": 1,
    "Video-Image Background Consistency": 1,
    "Subject Consistency": 1,
    "Background Consistency": 1,
    "Motion Smoothness": 1,
    "Dynamic Degree": 0.5,
    "Aesthetic Quality": 1,
    "Imaging Quality": 1,
    "Temporal Flickering": 1,
}

# Relative weights of the semantic vs. quality sub-scores in the total.
SEMANTIC_WEIGHT = 1
QUALITY_WEIGHT = 4

# Relative weights for the I2V tab sub-scores.
I2V_WEIGHT = 1.0
I2V_QUALITY_WEIGHT = 1.0
# Gradio Dataframe `datatype` values: one entry per column, "markdown" for
# the five model-metadata columns and "number" for every score column.
# Built with list arithmetic instead of a 25-element literal so the counts
# are auditable at a glance.
# NOTE(review): "TITILE" is a long-standing misspelling of "TITLE"; the old
# names are kept because other modules import them.
DATA_TITILE_TYPE = ["markdown"] * 5 + ["number"] * 20
I2V_TITILE_TYPE = ["markdown"] + ["number"] * 20

# Correctly spelled aliases (backward-compatible additions).
DATA_TITLE_TYPE = DATA_TITILE_TYPE
I2V_TITLE_TYPE = I2V_TITILE_TYPE
# Name of the Hugging Face dataset repo holding leaderboard submissions.
SUBMISSION_NAME = "vstar_leaderboard_submission"
# BUGFIX: the URL was built with os.path.join, which inserts "\" on
# Windows and would corrupt the URL. URLs always use "/", so build it
# with an f-string instead.
SUBMISSION_URL = f"https://huggingface.co/datasets/V-STaR-Bench/{SUBMISSION_NAME}"

# Local CSV files (checked-out copy of the submission repo).
CSV_DIR = "./vstar_leaderboard_submission/results.csv"
QUALITY_DIR = "./vstar_leaderboard_submission/quality.csv"
I2V_DIR = "./vstar_leaderboard_submission/i2v_results.csv"
LONG_DIR = "./vstar_leaderboard_submission/long_debug.csv"
INFO_DIR = "./vstar_leaderboard_submission/model_info.csv"
# Complete column sets per tab: model-metadata columns followed by the
# per-dimension score columns shown on that tab.
COLUMN_NAMES = MODEL_INFO + TASK_INFO
COLUMN_NAMES_QUALITY = MODEL_INFO_TAB_QUALITY + QUALITY_TAB
COLUMN_NAMES_I2V = MODEL_INFO_TAB_I2V + I2V_TAB
# ---- Page copy (Markdown/HTML fragments rendered by the Gradio app) ----
# NOTE(review): "LEADERBORAD" is a misspelling of "LEADERBOARD" kept for
# backward compatibility — other modules import these names.
LEADERBORAD_INTRODUCTION = """# V-STaR Leaderboard
*"Can Video-LLMs “reason through a sequential spatio-temporal logic” in videos?"*
🏆 Welcome to the leaderboard of the **V-STaR**! 🎦 *A spatio-temporal reasoning benchmark for Video-LLMs* [](https://github.com/V-STaR-Bench/V-STaR)
<div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
<a href=''><img src='https://img.shields.io/badge/cs.CV-Paper-b31b1b?logo=arxiv&logoColor=red'></a>
<a href='https://v-star-bench.github.io/'><img src='https://img.shields.io/badge/VBench-Website-green?logo=googlechrome&logoColor=green'></a>
</div>
- **Comprehensive Dimensions:** We evaluate Video-LLM’s spatio-temporal reasoning ability in answering questions explicitly in the context of “when”, “where”, and “what”.
- **Human Alignment:** We conducted extensive experiments and human annotations to validate robustness of V-STaR.
- **New Metrics:** We proposed to use Arithmetic Mean (AM) and modified logarithmic Geometric Mean (LGM) to measure the spatio-temporal reasoning capability of Video-LLMs. We calculate AM and LGM from the "Accuracy" of VQA, "m_tIoU" of Temporal grounding and "m_vIoU" of Spatial Grounding, and we get the mean AM (mAM) and mean LGM (mLGM) from the results of our proposed 2 RSTR question chains.
- **Valuable Insights:** V-STaR reveals a fundamental weakness in existing Video-LLMs regarding causal spatio-temporal reasoning.
**Join Leaderboard**: Please contact us to update your results.
**Credits**: This leaderboard is updated and maintained by the team of [V-STaR Contributors]().
"""

# Shown on the submission tab.
SUBMIT_INTRODUCTION = """# Submit on V-STaR Benchmark Introduction
## 🎈
⚠️ Please note that you need to obtain the file `results/*.json` by running V-STaR in Github. You may conduct an [Offline Eval](https://github.com/V-STaR-Bench/V-STaR) before submitting.
⚠️ Then, please contact us to update your results via [email1](mailto:zixu.cheng@qmul.ac.uk) or [email2](mailto:hu.jian@qmul.ac.uk).
"""

# Intentionally blank placeholder rendered above the results table.
TABLE_INTRODUCTION = """
"""

# Longer benchmark description shown in the "About" section.
LEADERBORAD_INFO = """
V-STaR, a comprehensive spatio-temporal reasoning benchmark for video large language models (Video-LLMs). We construct a fine-grained reasoning dataset with coarse-to-fine CoT questions, enabling a structured evaluation of spatio-temporal reasoning. Specifically, we introduce a Reverse Spatio-Temporal Reasoning (RSTR) task to quantify models’ spatio-temporal reasoning ability. For each dimension and each content category, we carefully design a Prompt Suite as test cases, and sample Generated Videos from a set of video generation models. Experiments on V-STaR reveal although many models perform well on “what”, some struggle to ground their answers in time and location. This finding highlights a fundamental weakness in existing Video-LLMs regarding causal spatio-temporal reasoning and inspires research in improving trustworthy spatio-temporal understanding in future Video-LLMs.
"""

# BibTeX citation offered via the "copy citation" button.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@misc{cheng2025vstarbenchmarkingvideollmsvideo,
      title={V-STaR: Benchmarking Video-LLMs on Video Spatio-Temporal Reasoning}, 
      author={Zixu Cheng and Jian Hu and Ziquan Liu and Chenyang Si and Wei Li and Shaogang Gong},
      year={2025},
      eprint={2503.11495},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2503.11495}, 
}"""

# Footnotes shown under specific tabs.
QUALITY_CLAIM_TEXT = "We use all the videos on Sora website (https://openai.com/sora) for a preliminary evaluation, including the failure case videos Sora provided."
I2V_CLAIM_TEXT = "Since the open-sourced SVD models do not accept text input during the I2V stage, we are unable to evaluate its `camera motion` in terms of `video-text consistency`. The total score is calculated based on all dimensions except `camera motion`."
LONG_CLAIM_TEXT = ""
# Empirical min/max bounds used to normalize each dimension's raw score
# into [0, 1] on the main tab.
NORMALIZE_DIC = {
    "subject consistency": {"Min": 0.1462, "Max": 1.0},
    "background consistency": {"Min": 0.2615, "Max": 1.0},
    "temporal flickering": {"Min": 0.6293, "Max": 1.0},
    "motion smoothness": {"Min": 0.706, "Max": 0.9975},
    "dynamic degree": {"Min": 0.0, "Max": 1.0},
    "aesthetic quality": {"Min": 0.0, "Max": 1.0},
    "imaging quality": {"Min": 0.0, "Max": 1.0},
    "object class": {"Min": 0.0, "Max": 1.0},
    "multiple objects": {"Min": 0.0, "Max": 1.0},
    "human action": {"Min": 0.0, "Max": 1.0},
    "color": {"Min": 0.0, "Max": 1.0},
    "spatial relationship": {"Min": 0.0, "Max": 1.0},
    "scene": {"Min": 0.0, "Max": 0.8222},
    "appearance style": {"Min": 0.0009, "Max": 0.2855},
    "temporal style": {"Min": 0.0, "Max": 0.364},
    "overall consistency": {"Min": 0.0, "Max": 0.364},
}

# Normalization bounds for the I2V tab dimensions.
NORMALIZE_DIC_I2V = {
    "Video-Text Camera Motion": {"Min": 0.0, "Max": 1.0},
    "Video-Image Subject Consistency": {"Min": 0.1462, "Max": 1.0},
    "Video-Image Background Consistency": {"Min": 0.2615, "Max": 1.0},
    "Subject Consistency": {"Min": 0.1462, "Max": 1.0},
    "Background Consistency": {"Min": 0.2615, "Max": 1.0},
    "Motion Smoothness": {"Min": 0.7060, "Max": 0.9975},
    "Dynamic Degree": {"Min": 0.0, "Max": 1.0},
    "Aesthetic Quality": {"Min": 0.0, "Max": 1.0},
    "Imaging Quality": {"Min": 0.0, "Max": 1.0},
    "Temporal Flickering": {"Min": 0.6293, "Max": 1.0},
}