from dash import Dash, html, dcc, Input, Output, State
from dash import Dash, html, dcc, Input, Output, State
import pandas as pd
import dash_mantine_components as dmc
import duckdb
import time
from graphs.leaderboard import (
button_style,
get_top_n_leaderboard,
render_table_content,
)
from dash_iconify import DashIconify
# Initialize the app
# Single Dash application instance; every callback below is registered on it.
app = Dash()
# Underlying server object re-exported at module level
# (presumably so an external WSGI host can import it -- confirm deployment setup).
server = app.server
def load_parquet_to_duckdb(con, parquet_url, view_name):
    """
    Expose a remote parquet file as a DuckDB view and report its time span.

    Args:
        con: Open DuckDB connection on which the view is created.
        parquet_url: URL of the parquet file to read.
        view_name: Name of the view to create (replaced if it already exists).

    Returns:
        Tuple ``(start_dt, end_dt)`` holding the minimum and maximum of the
        view's ``time`` column, each passed through ``pd.to_datetime``.
    """
    # httpfs must be installed and loaded before read_parquet() can use a URL
    for extension_statement in ("INSTALL httpfs;", "LOAD httpfs;"):
        con.execute(extension_statement)

    # The view simply selects everything from the remote file
    create_view_sql = f"""
CREATE OR REPLACE VIEW {view_name} AS
SELECT * FROM read_parquet('{parquet_url}')
"""
    con.execute(create_view_sql)

    # One aggregate query yields both ends of the time range
    bounds_sql = f"SELECT MIN(time) as min_time, MAX(time) as max_time FROM {view_name}"
    bounds = con.execute(bounds_sql).fetchdf()
    earliest = pd.to_datetime(bounds["min_time"].iloc[0])
    latest = pd.to_datetime(bounds["max_time"].iloc[0])
    return earliest, latest
# DuckDB connection (global)
# In-memory database; the query helper further down reads from this same object.
con = duckdb.connect(database=":memory:", read_only=False)
# Load parquet files from Hugging Face using DuckDB
# NOTE: HF_DATASET_ID is only used in the log message below; the URLs repeat it literally.
HF_DATASET_ID = "emsesc/open_model_evolution_data"
hf_parquet_url_1 = "https://huggingface.co/datasets/emsesc/open_model_evolution_data/resolve/main/all_downloads_with_annotations.parquet"
hf_parquet_url_2 = "https://huggingface.co/datasets/emsesc/open_model_evolution_data/resolve/main/one_year_rolling.parquet"
print(f"Attempting to connect to dataset from Hugging Face Hub: {HF_DATASET_ID}")
try:
    overall_start_time = time.time()
    # Load both parquet files as views
    # The "all_downloads" view supplies the time range used for the slider below.
    start_dt, end_dt = load_parquet_to_duckdb(con, hf_parquet_url_1, "all_downloads")
    # Second view, selectable through the "Filtered Downloads" option.
    # Its time range (start_dt2, end_dt2) is not used anywhere in the visible code.
    start_dt2, end_dt2 = load_parquet_to_duckdb(con, hf_parquet_url_2, "one_year_rolling")
    msg = (
        f"Successfully connected to datasets in {time.time() - overall_start_time:.2f}s."
    )
    print(msg)
except Exception as e:
    # Log the failure, then re-raise: the module cannot continue without the views.
    err_msg = f"Failed to load dataset(s). Error: {e}"
    print(err_msg)
    raise
# Slider bounds as integer Unix timestamps (seconds), taken from the "all_downloads" range
start_ts = int(start_dt.timestamp())
end_ts = int(end_dt.timestamp())
def ordinal(n):
    """Return ``n`` followed by its English ordinal suffix, e.g. 1st, 2nd, 3rd, 11th."""
    last_two_digits = n % 100
    # Everything from 10 through 20 takes "th" (covers 11th, 12th, 13th)
    if last_two_digits >= 10 and last_two_digits <= 20:
        return f"{n}th"
    # Otherwise the final digit decides; anything but 1/2/3 falls back to "th"
    return f"{n}" + {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
def format_date(dt):
    """Render a datetime as abbreviated month, ordinal day and year, e.g. "Oct 8th, 2025"."""
    month_abbrev = dt.strftime("%b")
    day_with_suffix = ordinal(dt.day)
    return f"{month_abbrev} {day_with_suffix}, {dt.year}"
# Slider marks: a "Mon YYYY" label at each end plus a bare year at every
# January 1st in between.
# January 1st of the first year is never given its own tick, and neither is a
# January 1st that coincides with the end of the range.
_first_jan_ts = int(pd.Timestamp(year=start_dt.year, month=1, day=1).timestamp())
_year_ticks = [
    (int(pd.Timestamp(year=_year, month=1, day=1).timestamp()), _year)
    for _year in range(start_dt.year, end_dt.year + 1)
]
marks = [
    # Start label (e.g. "Jan 2020")
    {"value": start_ts, "label": start_dt.strftime("%b %Y")},
    # Yearly labels (e.g. "2021", "2022")
    *(
        {"value": _tick_ts, "label": str(_year)}
        for _tick_ts, _year in _year_ticks
        if _tick_ts not in (_first_jan_ts, end_ts)
    ),
    # End label (e.g. "Dec 2024")
    {"value": end_ts, "label": end_dt.strftime("%b %Y")},
]
def get_thumb_labels(values):
    """
    Build the floating date labels attached to the two slider thumbs.

    Args:
        values: Sequence whose first two items are the thumb positions as
            Unix timestamps in seconds.

    Returns:
        List of two ``html.Div`` elements, one per thumb, each showing the
        formatted date. When the thumbs are less than roughly four months
        apart the first label is placed above the track, otherwise both sit
        below it.
    """
    lower_ts, upper_ts = values[0], values[1]
    four_months_in_seconds = 4 * 30 * 86400
    thumbs_are_close = abs(upper_ts - lower_ts) < four_months_in_seconds

    base_style = {
        "background": "#fff",
        "color": "#082030",
        "fontWeight": "bold",
        "fontSize": "13px",
        "borderRadius": "8px",
        "padding": "2px 8px",
        "boxShadow": "0 1px 4px rgba(8,32,48,0.10)",
        "position": "absolute",
        "left": "50%",
        "transform": "translateX(-50%)",
        "whiteSpace": "nowrap",
        "zIndex": 100,
    }

    # Only the first label ever moves; the second always stays below the track
    below_track = "14px"
    above_track = "-38px"
    top_offsets = (above_track if thumbs_are_close else below_track, below_track)

    return [
        html.Div(
            format_date(pd.to_datetime(thumb_ts, unit="s")),
            style={**base_style, "top": top_offset},
        )
        for thumb_ts, top_offset in zip((lower_ts, upper_ts), top_offsets)
    ]
# Range slider (dash-mantine-components) for selecting the time window.
# Positions are Unix timestamps in seconds, spanning the "all_downloads" time range.
time_slider = dmc.RangeSlider(
    id="time-slider",
    min=start_ts,
    max=end_ts,
    # Initially the whole range is selected
    value=[
        start_ts,
        end_ts,
    ],
    step=24 * 60 * 60,  # one day, in seconds
    color="#AC482A",
    size="md",
    radius="xl",
    marks=marks,
    style={"width": "95%", "paddingLeft": "60px"},
    # Built-in value label is switched off; dates are drawn through thumbChildren
    # instead (presumably to allow custom formatting -- confirm).
    label=None,
    showLabelOnHover=False,
    labelTransitionProps={"transition": "fade", "duration": 150},
    # Initial thumb labels; the update_thumb_labels callback replaces them on change
    thumbChildren=get_thumb_labels([start_ts, end_ts]),
)
# ---------------------------------------------------------------------------
# Page layout
# ---------------------------------------------------------------------------
# Style fragments that several components share.
_FILTER_HEADING_STYLE = {"fontWeight": "700", "marginBottom": 8, "fontSize": 14}
_FILTER_HELP_STYLE = {"fontSize": 13, "color": "#555", "marginBottom": "12px"}
_TIP_EMPHASIS_STYLE = {"fontWeight": "600", "color": "#AC482A"}
_TAB_STYLE = {
    "backgroundColor": "transparent",
    "border": "none",
    "padding": "10px 18px",
    "color": "#6B7280",
    "fontWeight": "500",
}
_TAB_SELECTED_STYLE = {
    "backgroundColor": "transparent",
    "border": "none",
    "padding": "10px 18px",
    "fontWeight": "700",
    "borderBottom": "3px solid #082030",
}
# Introductory sentence repeated at the top of every leaderboard tab.
_TAB_BLURB = "The model leaderboard assesses concentrations of power across three hierarchies: countries, developers, and models. Explore how downloads are distributed among these groups and identify key players shaping the open model ecosystem on Hugging Face."


def _filter_heading(text):
    """Bold caption displayed above a filter control."""
    return html.Div(text, style=dict(_FILTER_HEADING_STYLE))


def _filter_help(text):
    """Muted explanatory sentence displayed under a filter control."""
    return html.Div(text, style=dict(_FILTER_HELP_STYLE))


def _tip_emphasis(text):
    """Accent-coloured inline span used inside the tip box."""
    return html.Span(text, style=dict(_TIP_EMPHASIS_STYLE))


def _leaderboard_tab(label, slug):
    """
    Build one leaderboard tab.

    ``label`` is used as both the visible tab label and the tab value.
    ``slug`` is inserted into the component ids ("loading-<slug>",
    "top_<slug>-table", "top_<slug>-toggle") that the callbacks target.
    """
    return dcc.Tab(
        label=label,
        value=label,
        style=dict(_TAB_STYLE),
        selected_style=dict(_TAB_SELECTED_STYLE),
        children=[
            html.Div(
                children=_TAB_BLURB,
                style={
                    "fontSize": 14,
                    "marginTop": 18,
                    "marginBottom": 12,
                    "textAlign": "left",
                },
            ),
            # Spinner wrapper around the table container
            dcc.Loading(
                id=f"loading-{slug}",
                type="circle",
                color="#AC482A",
                children=html.Div(id=f"top_{slug}-table"),
            ),
            # Button whose clicks drive the show-more callback for this tab
            html.Button(
                id=f"top_{slug}-toggle",
                children="▼ Show Top 50",
                n_clicks=0,
                style={**button_style, "border": "none"},
            ),
        ],
    )


# --- Header: title block on the left, external links on the right -----------
_header_titles = html.Div(
    [
        html.Div(
            children="Economies of Open Intelligence",
            style={
                "fontSize": 22,
                "fontWeight": "700",
                "lineHeight": "1.1",
            },
        ),
        html.Div(
            children="Tracing Power & Participation in the Model Ecosystem",
            style={
                "fontSize": 13,
                "marginTop": 6,
                "opacity": 0.9,
            },
        ),
    ],
    style={
        "display": "flex",
        "flexDirection": "column",
        "justifyContent": "center",
    },
)

_dpi_link = html.A(
    children=[
        html.Img(
            src="assets/images/dpi.svg",
            style={
                "height": "28px",
                "verticalAlign": "middle",
                "paddingRight": "8px",
            },
        ),
        "Data Provenance Initiative",
    ],
    href="https://www.dataprovenance.org/",
    target="_blank",
    className="no-bg-link header-link",
    # No background is set here; it is left to the CSS classes above
    style={
        "display": "inline-block",
        "padding": "6px 14px",
        "fontSize": 13,
        "color": "#FFFFFF",
        "borderRadius": "18px",
        "fontWeight": "700",
        "textDecoration": "none",
        "marginRight": "12px",
    },
)

_hf_link = html.A(
    children=[
        html.Img(
            src="assets/images/hf.svg",
            style={
                "height": "30px",
                "verticalAlign": "middle",
            },
        ),
        html.Span(
            "Hugging Face",
            className="hf-brand-text",
        ),
    ],
    href="https://huggingface.co/",
    target="_blank",
    className="no-bg-link header-link",
    style={
        "display": "inline-flex",
        "padding": "6px 14px",
        "alignItems": "center",
        "color": "#FFFFFF",
        "borderRadius": "18px",
        "textDecoration": "none",
        "marginRight": "12px",
    },
)

# NOTE(review): the href below looks like a placeholder for the real paper URL -- confirm.
_paper_link = html.A(
    children=[
        html.Span(
            "Read the paper",
            className="paper-text",
        ),
    ],
    href="https://www.google.com/",
    target="_blank",
    className="no-bg-link header-link paper-link",
    style={
        "display": "inline-flex",
        "alignItems": "center",
        "padding": "6px 12px",
        "fontSize": 14,
        "margin": "0 auto",
        "backgroundColor": "#AC482A",
        "color": "#FFFFFF",
        "borderRadius": "5px",
        "textDecoration": "none",
        "fontWeight": "700",
    },
)

_header = html.Div(
    [
        html.Div(
            [
                _header_titles,
                html.Div(
                    [_dpi_link, _hf_link, _paper_link],
                    style={"display": "flex", "alignItems": "center"},
                ),
            ],
            style={
                "marginLeft": "50px",
                "marginRight": "50px",
                "display": "flex",
                "justifyContent": "space-between",
                "alignItems": "center",
                "padding": "18px 24px",
                "gap": "24px",
            },
        ),
    ],
    style={
        "backgroundColor": "#082030",
        "color": "white",
        "width": "100%",
    },
)

# --- Page title and introduction --------------------------------------------
_page_title = html.Div(
    children="The Open Model Leaderboard",
    style={
        "fontSize": 40,
        "fontWeight": "700",
        "textAlign": "center",
        "marginTop": 20,
        "marginBottom": 20,
    },
)

_page_intro = html.Div(
    children="This leaderboard assesses concentrations of power in the open model ecosystem across three hierarchies: countries, developers, and models. Explore how downloads are distributed among these groups and identify key players shaping the open model ecosystem on Hugging Face.",
    style={
        "fontSize": 14,
        "marginTop": 18,
        "marginBottom": 12,
        "marginLeft": 100,
        "marginRight": 100,
        "textAlign": "center",
    },
)

# --- Filter panel, left column: download view + author type -----------------
_view_and_author_filters = html.Div(
    [
        _filter_heading("Select Download View"),
        dmc.SegmentedControl(
            id="segmented",
            value="all-downloads",
            color="#AC482A",
            transitionDuration=200,
            data=[
                {
                    "value": "all-downloads",
                    "label": "All Downloads",
                },
                {
                    "value": "filtered-downloads",
                    "label": "Filtered Downloads",
                },
            ],
            mb=10,
        ),
        _filter_help(
            "Choose whether to view all downloads or only those within one year of the model's creation date."
        ),
        html.Div(
            [
                _filter_heading("Select Author Type"),
                dmc.Switch(
                    id="derived-author-switch",
                    color="#AC482A",
                    label="Derived Authors",
                    checked=True,
                    mb=10,
                ),
                _filter_help(
                    "Toggle between viewing downloads by original authors or derived authors (those who forked or adapted models)."
                ),
            ],
            style={"marginTop": "10px"},
        ),
        html.Span(
            id="global-toggle-status",
            style={
                "marginLeft": "8px",
                "display": "inline-block",
                "marginTop": 6,
            },
        ),
    ],
    style={"flex": 1, "minWidth": "220px"},
)

# --- Filter panel, right column: time range + usage tip ---------------------
_tip_box = html.Div(
    [
        html.Div(
            [
                DashIconify(
                    icon="mdi:lightbulb-on-outline",
                    width=20,
                    height=20,
                    style={"marginRight": "8px", "color": "#082030"},
                ),
                html.Span("Tip"),
            ],
            style={
                "fontWeight": "700",
                "fontSize": 15,
                "marginBottom": "6px",
                "color": "#082030",
                "display": "flex",
                "alignItems": "center",
            },
        ),
        html.Div(
            [
                "Try switching between ",
                _tip_emphasis("All Downloads"),
                " and ",
                _tip_emphasis("Filtered Downloads"),
                " to compare overall popularity versus early interest after model release. ",
                "You can also toggle ON ",
                _tip_emphasis("Derived Authors"),
                " to see how derivative works contribute to developer influence.",
            ],
            style={
                "fontSize": 13,
                "color": "#082030",
                "lineHeight": "1.6",
            },
        ),
    ],
    style={
        "backgroundColor": "#F5ECE6",
        "borderRadius": "14px",
        "padding": "18px 20px",
        "marginTop": "28px",
        "boxShadow": "0 1px 4px rgba(8,32,48,0.04)",
        "border": "1px solid #f0e3d6",
    },
)

_time_range_filter = html.Div(
    [
        _filter_heading("Select Time Range"),
        time_slider,
        html.Div(
            "Adjust the time range to filter leaderboard results by model download times.",
            style={
                "fontSize": 13,
                "color": "#555",
                "marginTop": "32px",
            },
        ),
        _tip_box,
    ],
    style={
        "flex": 2,
        "minWidth": "320px",
        "display": "flex",
        "flexDirection": "column",
        "justifyContent": "center",
        "height": "100%",
    },
)

_filters_panel = html.Div(
    children=[
        _view_and_author_filters,
        _time_range_filter,
    ],
    style={
        "display": "flex",
        "gap": "24px",
        "padding": "32px",
        "alignItems": "flex-start",
        "marginLeft": "100px",
        "marginRight": "100px",
        "backgroundColor": "#FFFBF9",
        "borderRadius": "18px",
    },
)

# --- Leaderboard tabs --------------------------------------------------------
_leaderboard_section = html.Div(
    [
        dcc.Tabs(
            id="leaderboard-tabs",
            value="Countries",
            children=[
                _leaderboard_tab("Countries", "countries"),
                _leaderboard_tab("Developers", "developers"),
                _leaderboard_tab("Models", "models"),
            ],
        ),
    ],
    style={
        "borderRadius": "18px",
        "padding": "32px",
        "marginTop": "12px",
        "marginBottom": "12px",
        "marginLeft": "50px",
        "marginRight": "50px",
    },
)

# --- Root ---------------------------------------------------------------------
app.layout = dmc.MantineProvider(
    theme={
        "colorScheme": "light",
        "primaryColor": "blue",
        "fontFamily": "Inter, sans-serif",
    },
    children=[
        # Name of the DuckDB view to query ("all_downloads" or "one_year_rolling")
        dcc.Store(id="selected-view", data="all_downloads"),
        # Mirror of the "Derived Authors" switch state
        dcc.Store(id="derived-author-toggle", data=True),
        html.Div(
            [
                _header,
                _page_title,
                _page_intro,
                _filters_panel,
                _leaderboard_section,
            ],
            style={
                "fontFamily": "Inter",
                "backgroundColor": "#ffffff",
                "minHeight": "100vh",
            },
        ),
    ],
)
# Callbacks for interactivity
# -- helper utilities to consolidate duplicated callback logic --
def _get_filtered_top_n_from_duckdb(slider_value, group_col, top_n, view="all_downloads"):
"""
Query DuckDB directly to get top N entries with metadata
This minimizes data transfer by doing aggregation in DuckDB
"""
# Build time filter clause
time_clause = ""
if slider_value and len(slider_value) == 2:
start = pd.to_datetime(slider_value[0], unit="s")
end = pd.to_datetime(slider_value[1], unit="s")
time_clause = f"WHERE time >= '{start}' AND time <= '{end}'"
# Build the aggregation query to get top N with all needed metadata
# This query groups by the target column and aggregates downloads
# while collecting all metadata we need for chips
query = f"""
WITH base_data AS (
SELECT
{group_col},
CASE
WHEN org_country_single IN ('HF', 'United States of America') THEN 'United States of America'
WHEN org_country_single IN ('International', 'Online') THEN 'International/Online'
ELSE org_country_single
END AS org_country_single,
author,
derived_author,
merged_country_groups_single,
merged_modality,
downloads,
model
FROM {view}
{time_clause}
),
-- Compute the total downloads for all rows in the time range
total_downloads_cte AS (
SELECT SUM(downloads) AS total_downloads_all
FROM base_data
),
-- Compute per-group totals and their percentage of all downloads
top_items AS (
SELECT
b.{group_col} AS name,
SUM(b.downloads) AS total_downloads,
ROUND(SUM(b.downloads) * 100.0 / t.total_downloads_all, 2) AS percent_of_total,
-- Pick first non-null metadata values for reference
ANY_VALUE(b.org_country_single) AS org_country_single,
ANY_VALUE(b.author) AS author,
ANY_VALUE(b.derived_author) AS derived_author,
ANY_VALUE(b.merged_country_groups_single) AS merged_country_groups_single,
ANY_VALUE(b.merged_modality) AS merged_modality,
ANY_VALUE(b.model) AS model
FROM base_data b
CROSS JOIN total_downloads_cte t
GROUP BY b.{group_col}, t.total_downloads_all
)
SELECT *
FROM top_items
ORDER BY total_downloads DESC
LIMIT {top_n};
"""
return con.execute(query).fetchdf()
# (rows shown, label offering the next step) for each position in the
# three-step show-more cycle, indexed by click count modulo 3.
_TOP_N_CYCLE = (
    (10, "▼ Show Top 50"),
    (50, "▼ Show Top 100"),
    (100, "▲ Show Less"),
)


def _leaderboard_callback_logic(
    n_clicks,
    slider_value,
    current_label,
    group_col,
    filename,
    default_label="▼ Show Top 50",
    chip_color="#F0F9FF",
    view="all_downloads",
    derived_author_toggle=True,
):
    """
    Shared body of the three leaderboard callbacks.

    Args:
        n_clicks: Click count of the tab's toggle button (``None`` is treated as 0).
        slider_value: Selected time range, forwarded to the DuckDB query.
        current_label: Label currently shown on the toggle button.
        group_col: Column the leaderboard is grouped by.
        filename: Forwarded to ``render_table_content``.
        default_label: Label used before the first click when ``current_label``
            is ``None``.
        chip_color: Forwarded to ``render_table_content``.
        view: Name of the DuckDB view to query.
        derived_author_toggle: Forwarded to ``get_top_n_leaderboard``.

    Returns:
        Tuple ``(table_content, new_label)`` for the table container and the
        toggle button.
    """
    # The row count is derived from the click count, not from the label.
    # This callback also fires for slider/view/toggle changes; reading the
    # label there advanced 10 -> 50 -> 100 even though no click had happened.
    clicks = n_clicks or 0
    top_n, new_label = _TOP_N_CYCLE[clicks % len(_TOP_N_CYCLE)]
    if clicks == 0:
        # Before the first click, keep whatever label is already displayed
        new_label = default_label if current_label is None else current_label

    # Get filtered and aggregated data directly from DuckDB
    df_filtered = _get_filtered_top_n_from_duckdb(slider_value, group_col, top_n, view=view)

    # Post-process the already-aggregated rows for display
    df, download_df = get_top_n_leaderboard(
        df_filtered, group_col, top_n, derived_author_toggle=derived_author_toggle
    )
    table_content = render_table_content(
        df, download_df, chip_color=chip_color, filename=filename
    )
    return table_content, new_label
# -- end helpers --
# --- Callback to store derived author toggle state ---
@app.callback(
    Output("derived-author-toggle", "data"),
    Input("derived-author-switch", "checked"),
)
def update_derived_author_toggle(checked):
    """Copy the "Derived Authors" switch state into the ``derived-author-toggle`` store."""
    # The leaderboard callbacks take the store, not the switch, as their input
    return checked
# Callbacks for interactivity (modularized)
@app.callback(
    Output("top_countries-table", "children"),
    Output("top_countries-toggle", "children"),
    Input("top_countries-toggle", "n_clicks"),
    Input("time-slider", "value"),
    Input("selected-view", "data"),
    Input("derived-author-toggle", "data"),
    State("top_countries-toggle", "children"),
)
def update_top_countries(n_clicks, slider_value, selected_view, derived_author_toggle, current_label):
    """
    Refresh the Countries leaderboard table and its toggle-button label.

    Fires when the toggle button is clicked or when the time slider, the
    selected view, or the derived-author store changes. The button's current
    label is read as state only. Rows are grouped by ``org_country_single``.
    """
    return _leaderboard_callback_logic(
        n_clicks,
        slider_value,
        current_label,
        group_col="org_country_single",
        filename="top_countries",
        default_label="▼ Show Top 50",
        chip_color="#F0F9FF",
        view=selected_view,
        derived_author_toggle=derived_author_toggle,
    )
@app.callback(
    Output("top_developers-table", "children"),
    Output("top_developers-toggle", "children"),
    Input("top_developers-toggle", "n_clicks"),
    Input("time-slider", "value"),
    Input("selected-view", "data"),
    Input("derived-author-toggle", "data"),
    State("top_developers-toggle", "children"),
)
def update_top_developers(n_clicks, slider_value, selected_view, derived_author_toggle, current_label):
    """
    Refresh the Developers leaderboard table and its toggle-button label.

    Fires when the toggle button is clicked or when the time slider, the
    selected view, or the derived-author store changes. The button's current
    label is read as state only.
    """
    # Use derived_author if toggle is True, else author
    group_col = "derived_author" if derived_author_toggle else "author"
    return _leaderboard_callback_logic(
        n_clicks,
        slider_value,
        current_label,
        group_col=group_col,
        filename="top_developers",
        default_label="▼ Show Top 50",
        chip_color="#F0F9FF",
        view=selected_view,
        derived_author_toggle=derived_author_toggle,
    )
@app.callback(
    Output("top_models-table", "children"),
    Output("top_models-toggle", "children"),
    Input("top_models-toggle", "n_clicks"),
    Input("time-slider", "value"),
    Input("selected-view", "data"),
    Input("derived-author-toggle", "data"),
    State("top_models-toggle", "children"),
)
def update_top_models(n_clicks, slider_value, selected_view, derived_author_toggle, current_label):
    """
    Refresh the Models leaderboard table and its toggle-button label.

    Fires when the toggle button is clicked or when the time slider, the
    selected view, or the derived-author store changes. The button's current
    label is read as state only. Rows are grouped by ``model``.
    """
    return _leaderboard_callback_logic(
        n_clicks,
        slider_value,
        current_label,
        group_col="model",
        filename="top_models",
        # Same fallback as the button's initial text in the layout and as the
        # other two tabs; the former "▼ Show More" matched neither.
        default_label="▼ Show Top 50",
        chip_color="#F0F9FF",
        view=selected_view,
        derived_author_toggle=derived_author_toggle,
    )
@app.callback(
    Output("time-slider", "thumbChildren"),
    Input("time-slider", "value"),
)
def update_thumb_labels(values):
    """Rebuild the two thumb date labels whenever the slider's selected range changes."""
    return get_thumb_labels(values)
# --- Add callback to update selected view based on segmented control ---
@app.callback(
    Output("selected-view", "data"),
    Input("segmented", "value"),
)
def update_selected_view(seg_value):
    """
    Translate the segmented-control choice into the name of the view to query.

    "filtered-downloads" selects "one_year_rolling"; any other value selects
    "all_downloads".
    """
    return "one_year_rolling" if seg_value == "filtered-downloads" else "all_downloads"
# Run the app
# Only when executed as a script; importing the module (e.g. to obtain
# `server`) does not start anything.
if __name__ == "__main__":
    # NOTE(review): debug mode is hard-coded on -- confirm this entry point is
    # used for local development only.
    app.run(debug=True)