Anas Awadalla
committed on
Commit
·
2dbb46e
1
Parent(s):
2a7516c
try streaming
Browse files
- README.md +13 -5
- src/streamlit_app.py +64 -20
README.md
CHANGED
|
@@ -17,7 +17,7 @@ A Streamlit application for visualizing model performance on grounding benchmark
|
|
| 17 |
|
| 18 |
## Features
|
| 19 |
|
| 20 |
-
- **Real-time Data**:
|
| 21 |
- **Interactive Visualizations**: Bar charts comparing model performance across different metrics
|
| 22 |
- **Baseline Comparisons**: Shows baseline models (Qwen2-VL, UI-TARS) alongside evaluated models
|
| 23 |
- **UI Type Breakdown**: For ScreenSpot datasets, shows performance by:
|
|
@@ -25,7 +25,7 @@ A Streamlit application for visualizing model performance on grounding benchmark
|
|
| 25 |
- Text vs Icon elements
|
| 26 |
- Overall averages
|
| 27 |
- **Model Details**: View training loss, checkpoint steps, and evaluation timestamps
|
| 28 |
-
- **
|
| 29 |
|
| 30 |
## Installation
|
| 31 |
|
|
@@ -57,14 +57,22 @@ The app will open in your browser at `http://localhost:8501`
|
|
| 57 |
4. **Explore Details**:
|
| 58 |
- Expand "Model Details" to see training metadata
|
| 59 |
- Expand "Detailed UI Type Breakdown" for a comprehensive table
|
| 60 |
-
- Expand "
|
| 61 |
|
| 62 |
## Data Source
|
| 63 |
|
| 64 |
-
The app
|
| 65 |
- Repository: `mlfoundations-cua-dev/leaderboard`
|
| 66 |
- Path: `grounding/[dataset_name]/[model_results].json`
|
| 67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
## Supported Datasets
|
| 69 |
|
| 70 |
- **ScreenSpot-v2**: Web and desktop UI element grounding
|
|
@@ -82,4 +90,4 @@ For ScreenSpot-v2, the following baselines are included:
|
|
| 82 |
|
| 83 |
## Caching
|
| 84 |
|
| 85 |
-
Results are cached for 5 minutes to improve performance. The cache automatically refreshes to show new evaluation results.
|
|
|
|
| 17 |
|
| 18 |
## Features
|
| 19 |
|
| 20 |
+
- **Real-time Data**: Streams results directly from the HuggingFace leaderboard repository without local storage
|
| 21 |
- **Interactive Visualizations**: Bar charts comparing model performance across different metrics
|
| 22 |
- **Baseline Comparisons**: Shows baseline models (Qwen2-VL, UI-TARS) alongside evaluated models
|
| 23 |
- **UI Type Breakdown**: For ScreenSpot datasets, shows performance by:
|
|
|
|
| 25 |
- Text vs Icon elements
|
| 26 |
- Overall averages
|
| 27 |
- **Model Details**: View training loss, checkpoint steps, and evaluation timestamps
|
| 28 |
+
- **Sample Results**: Inspect the first 5 evaluation samples for each model
|
| 29 |
|
| 30 |
## Installation
|
| 31 |
|
|
|
|
| 57 |
4. **Explore Details**:
|
| 58 |
- Expand "Model Details" to see training metadata
|
| 59 |
- Expand "Detailed UI Type Breakdown" for a comprehensive table
|
| 60 |
+
- Expand "Sample Results" to see the first 5 evaluation samples
|
| 61 |
|
| 62 |
## Data Source
|
| 63 |
|
| 64 |
+
The app streams data directly from the HuggingFace dataset repository:
|
| 65 |
- Repository: `mlfoundations-cua-dev/leaderboard`
|
| 66 |
- Path: `grounding/[dataset_name]/[model_results].json`
|
| 67 |
|
| 68 |
+
## Streaming Approach
|
| 69 |
+
|
| 70 |
+
To minimize local storage requirements, the app:
|
| 71 |
+
- Streams JSON files directly from HuggingFace Hub
|
| 72 |
+
- Extracts only the necessary data for visualization
|
| 73 |
+
- Discards the full JSON after processing
|
| 74 |
+
- Caches the extracted data in memory for 5 minutes
|
| 75 |
+
|
| 76 |
## Supported Datasets
|
| 77 |
|
| 78 |
- **ScreenSpot-v2**: Web and desktop UI element grounding
|
|
|
|
| 90 |
|
| 91 |
## Caching
|
| 92 |
|
| 93 |
+
Results are cached in memory for 5 minutes to improve performance. The cache automatically refreshes to show new evaluation results.
|
src/streamlit_app.py
CHANGED
|
@@ -5,7 +5,7 @@ os.environ["HF_HOME"] = "src/data_cache"
|
|
| 5 |
import streamlit as st
|
| 6 |
import pandas as pd
|
| 7 |
import altair as alt
|
| 8 |
-
from huggingface_hub import HfApi,
|
| 9 |
import json
|
| 10 |
from pathlib import Path
|
| 11 |
from typing import Dict, List, Optional
|
|
@@ -58,8 +58,9 @@ BASELINES = {
|
|
| 58 |
|
| 59 |
@st.cache_data(ttl=300) # Cache for 5 minutes
|
| 60 |
def fetch_leaderboard_data():
|
| 61 |
-
"""Fetch all grounding results from HuggingFace leaderboard."""
|
| 62 |
api = HfApi()
|
|
|
|
| 63 |
|
| 64 |
try:
|
| 65 |
# List all files in the grounding directory
|
|
@@ -67,19 +68,26 @@ def fetch_leaderboard_data():
|
|
| 67 |
grounding_files = [f for f in files if f.startswith(f"{GROUNDING_PATH}/") and f.endswith(".json")]
|
| 68 |
|
| 69 |
results = []
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
try:
|
| 72 |
-
#
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
repo_type="dataset"
|
| 77 |
-
)
|
| 78 |
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
data = json.load(f)
|
| 81 |
|
| 82 |
-
# Extract
|
| 83 |
metadata = data.get("metadata", {})
|
| 84 |
metrics = data.get("metrics", {})
|
| 85 |
detailed_results = data.get("detailed_results", {})
|
|
@@ -89,18 +97,30 @@ def fetch_leaderboard_data():
|
|
| 89 |
dataset_name = path_parts[1] if len(path_parts) > 1 else "unknown"
|
| 90 |
|
| 91 |
# Get model name from metadata or path
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
| 93 |
if not model_name and len(path_parts) > 2:
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
# Extract UI type results if available
|
| 97 |
ui_type_results = detailed_results.get("by_ui_type", {})
|
| 98 |
dataset_type_results = detailed_results.get("by_dataset_type", {})
|
| 99 |
|
| 100 |
-
|
|
|
|
| 101 |
"dataset": dataset_name,
|
| 102 |
"model": model_name,
|
| 103 |
-
"model_path":
|
| 104 |
"overall_accuracy": metrics.get("accuracy", 0) * 100, # Convert to percentage
|
| 105 |
"total_samples": metrics.get("total", 0),
|
| 106 |
"timestamp": metadata.get("evaluation_timestamp", ""),
|
|
@@ -108,13 +128,23 @@ def fetch_leaderboard_data():
|
|
| 108 |
"training_loss": metadata.get("training_loss"),
|
| 109 |
"ui_type_results": ui_type_results,
|
| 110 |
"dataset_type_results": dataset_type_results,
|
| 111 |
-
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
except Exception as e:
|
| 115 |
st.warning(f"Error loading {file_path}: {str(e)}")
|
| 116 |
continue
|
| 117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
return pd.DataFrame(results)
|
| 119 |
|
| 120 |
except Exception as e:
|
|
@@ -347,11 +377,25 @@ def main():
|
|
| 347 |
st.dataframe(display_df, use_container_width=True)
|
| 348 |
|
| 349 |
# Raw data viewer
|
| 350 |
-
with st.expander("
|
| 351 |
if selected_model != 'All' and len(filtered_df) == 1:
|
| 352 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 353 |
else:
|
| 354 |
-
st.info("Select a specific model to view
|
| 355 |
|
| 356 |
if __name__ == "__main__":
|
| 357 |
main()
|
|
|
|
| 5 |
import streamlit as st
|
| 6 |
import pandas as pd
|
| 7 |
import altair as alt
|
| 8 |
+
from huggingface_hub import HfApi, HfFileSystem
|
| 9 |
import json
|
| 10 |
from pathlib import Path
|
| 11 |
from typing import Dict, List, Optional
|
|
|
|
| 58 |
|
| 59 |
@st.cache_data(ttl=300) # Cache for 5 minutes
|
| 60 |
def fetch_leaderboard_data():
|
| 61 |
+
"""Fetch all grounding results from HuggingFace leaderboard by streaming JSON files."""
|
| 62 |
api = HfApi()
|
| 63 |
+
fs = HfFileSystem()
|
| 64 |
|
| 65 |
try:
|
| 66 |
# List all files in the grounding directory
|
|
|
|
| 68 |
grounding_files = [f for f in files if f.startswith(f"{GROUNDING_PATH}/") and f.endswith(".json")]
|
| 69 |
|
| 70 |
results = []
|
| 71 |
+
|
| 72 |
+
# Create progress bar for loading
|
| 73 |
+
progress_bar = st.progress(0)
|
| 74 |
+
status_text = st.empty()
|
| 75 |
+
|
| 76 |
+
for idx, file_path in enumerate(grounding_files):
|
| 77 |
try:
|
| 78 |
+
# Update progress
|
| 79 |
+
progress = (idx + 1) / len(grounding_files)
|
| 80 |
+
progress_bar.progress(progress)
|
| 81 |
+
status_text.text(f"Loading {idx + 1}/{len(grounding_files)} files...")
|
|
|
|
|
|
|
| 82 |
|
| 83 |
+
# Stream the JSON file content directly from HuggingFace
|
| 84 |
+
file_url = f"datasets/{REPO_ID}/{file_path}"
|
| 85 |
+
|
| 86 |
+
# Read the file content directly without downloading
|
| 87 |
+
with fs.open(file_url, 'r') as f:
|
| 88 |
data = json.load(f)
|
| 89 |
|
| 90 |
+
# Extract only the necessary information
|
| 91 |
metadata = data.get("metadata", {})
|
| 92 |
metrics = data.get("metrics", {})
|
| 93 |
detailed_results = data.get("detailed_results", {})
|
|
|
|
| 97 |
dataset_name = path_parts[1] if len(path_parts) > 1 else "unknown"
|
| 98 |
|
| 99 |
# Get model name from metadata or path
|
| 100 |
+
model_checkpoint = metadata.get("model_checkpoint", "")
|
| 101 |
+
model_name = model_checkpoint.split('/')[-1]
|
| 102 |
+
|
| 103 |
+
# Handle checkpoint names
|
| 104 |
if not model_name and len(path_parts) > 2:
|
| 105 |
+
# Check if it's a checkpoint subdirectory structure
|
| 106 |
+
if len(path_parts) > 3 and path_parts[2] != path_parts[3]:
|
| 107 |
+
# Format: grounding/dataset/base_model/checkpoint.json
|
| 108 |
+
base_model = path_parts[2]
|
| 109 |
+
checkpoint_file = path_parts[3].replace(".json", "")
|
| 110 |
+
model_name = f"{base_model}/{checkpoint_file}"
|
| 111 |
+
else:
|
| 112 |
+
# Regular format: grounding/dataset/results_modelname.json
|
| 113 |
+
model_name = path_parts[2].replace("results_", "").replace(".json", "")
|
| 114 |
|
| 115 |
# Extract UI type results if available
|
| 116 |
ui_type_results = detailed_results.get("by_ui_type", {})
|
| 117 |
dataset_type_results = detailed_results.get("by_dataset_type", {})
|
| 118 |
|
| 119 |
+
# Create a compact result entry (only keep what we need for visualization)
|
| 120 |
+
result_entry = {
|
| 121 |
"dataset": dataset_name,
|
| 122 |
"model": model_name,
|
| 123 |
+
"model_path": model_checkpoint,
|
| 124 |
"overall_accuracy": metrics.get("accuracy", 0) * 100, # Convert to percentage
|
| 125 |
"total_samples": metrics.get("total", 0),
|
| 126 |
"timestamp": metadata.get("evaluation_timestamp", ""),
|
|
|
|
| 128 |
"training_loss": metadata.get("training_loss"),
|
| 129 |
"ui_type_results": ui_type_results,
|
| 130 |
"dataset_type_results": dataset_type_results,
|
| 131 |
+
# Store minimal sample results for inspection
|
| 132 |
+
"sample_results_summary": {
|
| 133 |
+
"total_samples": len(data.get("sample_results", [])),
|
| 134 |
+
"first_5_samples": data.get("sample_results", [])[:5]
|
| 135 |
+
}
|
| 136 |
+
}
|
| 137 |
+
|
| 138 |
+
results.append(result_entry)
|
| 139 |
|
| 140 |
except Exception as e:
|
| 141 |
st.warning(f"Error loading {file_path}: {str(e)}")
|
| 142 |
continue
|
| 143 |
|
| 144 |
+
# Clear progress indicators
|
| 145 |
+
progress_bar.empty()
|
| 146 |
+
status_text.empty()
|
| 147 |
+
|
| 148 |
return pd.DataFrame(results)
|
| 149 |
|
| 150 |
except Exception as e:
|
|
|
|
| 377 |
st.dataframe(display_df, use_container_width=True)
|
| 378 |
|
| 379 |
# Raw data viewer
|
| 380 |
+
with st.expander("Sample Results"):
|
| 381 |
if selected_model != 'All' and len(filtered_df) == 1:
|
| 382 |
+
summary = filtered_df.iloc[0]['sample_results_summary']
|
| 383 |
+
st.write(f"**Total evaluation samples:** {summary['total_samples']}")
|
| 384 |
+
st.write("**First 5 sample results:**")
|
| 385 |
+
for i, sample in enumerate(summary['first_5_samples'], 1):
|
| 386 |
+
st.write(f"\n**Sample {i}:**")
|
| 387 |
+
col1, col2 = st.columns([1, 2])
|
| 388 |
+
with col1:
|
| 389 |
+
st.write(f"- **Correct:** {'✅' if sample.get('is_correct') else '❌'}")
|
| 390 |
+
st.write(f"- **Image:** {sample.get('img_filename', 'N/A')}")
|
| 391 |
+
with col2:
|
| 392 |
+
st.write(f"- **Instruction:** {sample.get('instruction', 'N/A')}")
|
| 393 |
+
if sample.get('predicted_click'):
|
| 394 |
+
st.write(f"- **Predicted Click:** {sample['predicted_click']}")
|
| 395 |
+
if sample.get('error_msg'):
|
| 396 |
+
st.write(f"- **Error:** {sample['error_msg']}")
|
| 397 |
else:
|
| 398 |
+
st.info("Select a specific model to view sample results")
|
| 399 |
|
| 400 |
if __name__ == "__main__":
|
| 401 |
main()
|