"""Dash application for the Open Model Leaderboard.

At import time this module registers two remote parquet files as views in an
in-memory DuckDB connection, builds the page layout (header, filters, time
range slider and three leaderboard tabs) and registers the callbacks that
render the country, developer and model leaderboards.
"""

from dash import Dash, html, dcc, Input, Output, State, callback_context
from dash.exceptions import MissingCallbackContextException
import pandas as pd
import dash_mantine_components as dmc
import duckdb
import time
from graphs.leaderboard import (
    button_style,
    get_top_n_leaderboard,
    render_table_content,
)
from dash_iconify import DashIconify

# Initialize the app
app = Dash()
server = app.server

# Names of the DuckDB views created below. The view name used by the
# leaderboard query arrives through a dcc.Store callback input, i.e. from the
# browser, so it is checked against this list before being put into SQL.
_ALLOWED_VIEWS = ("all_downloads", "one_year_rolling")

# Labels cycled through by the "show more" buttons under each leaderboard.
_LABEL_SHOW_50 = "▼ Show Top 50"
_LABEL_SHOW_100 = "▼ Show Top 100"
_LABEL_SHOW_LESS = "▲ Show Less"


def load_parquet_to_duckdb(con, parquet_url, view_name):
    """
    Loads a parquet file from a remote URL into DuckDB as a view.

    Args:
        con: DuckDB connection on which the view is created.
        parquet_url: URL of the parquet file passed to ``read_parquet``.
        view_name: Name of the view to create or replace.

    Returns:
        (start_dt, end_dt): the MIN and MAX of the view's 'time' column,
        converted with ``pd.to_datetime``.

    Raises:
        ValueError: if ``view_name`` is not a plain identifier.
    """
    # The view name is interpolated into SQL, so only accept an identifier.
    if not isinstance(view_name, str) or not view_name.isidentifier():
        raise ValueError(f"Invalid view name: {view_name!r}")
    # Double any single quote so the URL cannot terminate the SQL literal.
    quoted_url = str(parquet_url).replace("'", "''")

    # Install and load httpfs extension for remote file access
    con.execute("INSTALL httpfs;")
    con.execute("LOAD httpfs;")

    # Create a view that references the remote parquet file
    con.execute(f"""
        CREATE OR REPLACE VIEW {view_name} AS
        SELECT * FROM read_parquet('{quoted_url}')
    """)

    # Get time range for slider
    time_range = con.execute(
        f"SELECT MIN(time) as min_time, MAX(time) as max_time FROM {view_name}"
    ).fetchdf()
    start_dt = pd.to_datetime(time_range["min_time"].iloc[0])
    end_dt = pd.to_datetime(time_range["max_time"].iloc[0])
    return start_dt, end_dt


# DuckDB connection (global)
con = duckdb.connect(database=":memory:", read_only=False)

# Load parquet files from Hugging Face using DuckDB
HF_DATASET_ID = "emsesc/open_model_evolution_data"
hf_parquet_url_1 = "https://huggingface.co/datasets/emsesc/open_model_evolution_data/resolve/main/all_downloads_with_annotations.parquet"
hf_parquet_url_2 = "https://huggingface.co/datasets/emsesc/open_model_evolution_data/resolve/main/one_year_rolling.parquet"

print(f"Attempting to connect to dataset from Hugging Face Hub: {HF_DATASET_ID}")
try:
    overall_start_time = time.time()

    # Load both parquet files as views. Only the first view's time range
    # (start_dt, end_dt) is used for the slider below.
    start_dt, end_dt = load_parquet_to_duckdb(con, hf_parquet_url_1, "all_downloads")
    start_dt2, end_dt2 = load_parquet_to_duckdb(
        con, hf_parquet_url_2, "one_year_rolling"
    )

    msg = (
        f"Successfully connected to datasets in {time.time() - overall_start_time:.2f}s."
    )
    print(msg)
except Exception as e:
    err_msg = f"Failed to load dataset(s). Error: {e}"
    print(err_msg)
    raise

# Slider bounds as integer epoch seconds
start_ts = int(start_dt.timestamp())
end_ts = int(end_dt.timestamp())


def ordinal(n):
    """Return ``n`` followed by its English ordinal suffix (1st, 2nd, 11th...)."""
    # 10-20 (and 110-120, ...) always take "th", e.g. 11th, 12th, 13th.
    if 10 <= n % 100 <= 20:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")
    return f"{n}{suffix}"


def format_date(dt):
    """Format a date as e.g. "Oct 8th, 2025"."""
    return dt.strftime("%b") + f" {ordinal(dt.day)}, {dt.year}"


marks = []
# Add start label (e.g. "Jan 2020")
marks.append({"value": start_ts, "label": start_dt.strftime("%b %Y")})
# Add yearly marks between start and end (e.g. "2021", "2022").
# Jan 1st of the first year is skipped (the start label stands in for it), as
# is a Jan 1st that coincides with the end of the range.
start_yr = int(pd.Timestamp(year=start_dt.year, month=1, day=1).timestamp())
for yr in range(start_dt.year, end_dt.year + 1):
    yr_ts = int(pd.Timestamp(year=yr, month=1, day=1).timestamp())
    if yr_ts != start_yr and yr_ts != end_ts:
        marks.append({"value": yr_ts, "label": str(yr)})
# Add end label (e.g. "Dec 2024")
marks.append({"value": end_ts, "label": end_dt.strftime("%b %Y")})


def get_thumb_labels(values):
    """Return the two date labels rendered on the range slider thumbs.

    Args:
        values: two epoch-second values, one per thumb.

    Returns:
        A list of two ``html.Div`` components. When the thumbs are less than
        about four months apart, the first label is placed above the slider
        instead of below it.
    """
    distance = abs(values[1] - values[0])
    close = distance < 4 * 30 * 86400  # 4 months
    label_style = {
        "background": "#fff",
        "color": "#082030",
        "fontWeight": "bold",
        "fontSize": "13px",
        "borderRadius": "8px",
        "padding": "2px 8px",
        "boxShadow": "0 1px 4px rgba(8,32,48,0.10)",
        "position": "absolute",
        "left": "50%",
        "transform": "translateX(-50%)",
        "whiteSpace": "nowrap",
        "zIndex": 100,
    }
    # Only the first label moves; the second always sits just below the slider.
    first_top = "-38px" if close else "14px"
    return [
        html.Div(
            format_date(pd.to_datetime(values[0], unit="s")),
            style={**label_style, "top": first_top},
        ),
        html.Div(
            format_date(pd.to_datetime(values[1], unit="s")),
            style={**label_style, "top": "14px"},
        ),
    ]


# Range slider for time range selection (one-day steps, yearly marks)
time_slider = dmc.RangeSlider(
    id="time-slider",
    min=start_ts,
    max=end_ts,
    value=[
        start_ts,
        end_ts,
    ],
    step=24 * 60 * 60,
    color="#AC482A",
    size="md",
    radius="xl",
    marks=marks,
    style={"width": "95%", "paddingLeft": "60px"},
    label=None,
    showLabelOnHover=False,
    labelTransitionProps={"transition": "fade", "duration": 150},
    thumbChildren=get_thumb_labels([start_ts, end_ts]),
)


# -- layout builders --
def _section_heading(text):
    """Bold heading shown above each filter control."""
    return html.Div(
        text,
        style={
            "fontWeight": "700",
            "marginBottom": 8,
            "fontSize": 14,
        },
    )


def _help_text(text):
    """Small grey explanatory text shown below a filter control."""
    return html.Div(
        text,
        style={
            "fontSize": 13,
            "color": "#555",
            "marginBottom": "12px",
        },
    )


def _highlight(text):
    """Accent-coloured span used inside the Tip box."""
    return html.Span(text, style={"fontWeight": "600", "color": "#AC482A"})


def _build_header():
    """Dark page header: title block on the left, external links on the right."""
    title_block = html.Div(
        [
            html.Div(
                children="Economies of Open Intelligence",
                style={
                    "fontSize": 22,
                    "fontWeight": "700",
                    "lineHeight": "1.1",
                },
            ),
            html.Div(
                children="Tracing Power & Participation in the Model Ecosystem",
                style={
                    "fontSize": 13,
                    "marginTop": 6,
                    "opacity": 0.9,
                },
            ),
        ],
        style={
            "display": "flex",
            "flexDirection": "column",
            "justifyContent": "center",
        },
    )

    dpi_link = html.A(
        children=[
            html.Img(
                src="assets/images/dpi.svg",
                style={
                    "height": "28px",
                    "verticalAlign": "middle",
                    "paddingRight": "8px",
                },
            ),
            "Data Provenance Initiative",
        ],
        href="https://www.dataprovenance.org/",
        target="_blank",
        className="no-bg-link header-link",
        style={
            "display": "inline-block",
            "padding": "6px 14px",
            "fontSize": 13,
            "color": "#FFFFFF",  # white on dark header
            # no background here so CSS controls it
            "borderRadius": "18px",
            "fontWeight": "700",
            "textDecoration": "none",
            "marginRight": "12px",
        },
    )

    hf_link = html.A(
        children=[
            html.Img(
                src="assets/images/hf.svg",
                style={
                    "height": "30px",
                    "verticalAlign": "middle",
                },
            ),
            html.Span(
                "Hugging Face",
                className="hf-brand-text",
            ),
        ],
        href="https://huggingface.co/",
        target="_blank",
        className="no-bg-link header-link",
        style={
            "display": "inline-flex",
            "padding": "6px 14px",
            "alignItems": "center",
            "color": "#FFFFFF",
            "borderRadius": "18px",
            "textDecoration": "none",
            "marginRight": "12px",
        },
    )

    # NOTE(review): href looks like a placeholder - confirm the paper URL.
    paper_link = html.A(
        children=[
            html.Span(
                "Read the paper",
                className="paper-text",
            ),
        ],
        href="https://www.google.com/",
        target="_blank",
        className="no-bg-link header-link paper-link",
        style={
            "display": "inline-flex",
            "alignItems": "center",
            "padding": "6px 12px",
            "fontSize": 14,
            "margin": "0 auto",
            "backgroundColor": "#AC482A",
            "color": "#FFFFFF",
            "borderRadius": "5px",
            "textDecoration": "none",
            "fontWeight": "700",
        },
    )

    return html.Div(
        [
            html.Div(
                [
                    title_block,
                    html.Div(
                        [dpi_link, hf_link, paper_link],
                        style={"display": "flex", "alignItems": "center"},
                    ),
                ],
                style={
                    "marginLeft": "50px",
                    "marginRight": "50px",
                    "display": "flex",
                    "justifyContent": "space-between",
                    "alignItems": "center",
                    "padding": "18px 24px",
                    "gap": "24px",
                },
            ),
        ],
        style={
            "backgroundColor": "#082030",
            "color": "white",
            "width": "100%",
        },
    )


def _build_view_controls():
    """Left filter column: download-view selector and derived-author switch."""
    return html.Div(
        [
            _section_heading("Select Download View"),
            dmc.SegmentedControl(
                id="segmented",
                value="all-downloads",
                color="#AC482A",
                transitionDuration=200,
                data=[
                    {
                        "value": "all-downloads",
                        "label": "All Downloads",
                    },
                    {
                        "value": "filtered-downloads",
                        "label": "Filtered Downloads",
                    },
                ],
                mb=10,
            ),
            _help_text(
                "Choose whether to view all downloads or only those within one year of the model's creation date."
            ),
            # Author-type switch below the segmented control
            html.Div(
                [
                    _section_heading("Select Author Type"),
                    dmc.Switch(
                        id="derived-author-switch",
                        color="#AC482A",
                        label="Derived Authors",
                        checked=True,
                        mb=10,
                    ),
                    _help_text(
                        "Toggle between viewing downloads by original authors or derived authors (those who forked or adapted models)."
                    ),
                ],
                style={"marginTop": "10px"},
            ),
            html.Span(
                id="global-toggle-status",
                style={
                    "marginLeft": "8px",
                    "display": "inline-block",
                    "marginTop": 6,
                },
            ),
        ],
        style={"flex": 1, "minWidth": "220px"},
    )


def _build_tip_box():
    """Tip box shown under the time slider."""
    return html.Div(
        [
            html.Div(
                [
                    DashIconify(
                        icon="mdi:lightbulb-on-outline",
                        width=20,
                        height=20,
                        style={"marginRight": "8px", "color": "#082030"},
                    ),
                    html.Span("Tip"),
                ],
                style={
                    "fontWeight": "700",
                    "fontSize": 15,
                    "marginBottom": "6px",
                    "color": "#082030",
                    "display": "flex",
                    "alignItems": "center",
                },
            ),
            html.Div(
                [
                    "Try switching between ",
                    _highlight("All Downloads"),
                    " and ",
                    _highlight("Filtered Downloads"),
                    " to compare overall popularity versus early interest after model release. ",
                    "You can also toggle ON ",
                    _highlight("Derived Authors"),
                    " to see how derivative works contribute to developer influence.",
                ],
                style={
                    "fontSize": 13,
                    "color": "#082030",
                    "lineHeight": "1.6",
                },
            ),
        ],
        style={
            "backgroundColor": "#F5ECE6",
            "borderRadius": "14px",
            "padding": "18px 20px",
            "marginTop": "28px",
            "boxShadow": "0 1px 4px rgba(8,32,48,0.04)",
            "border": "1px solid #f0e3d6",
        },
    )


def _build_time_controls():
    """Right filter column: time range slider, its help text and the Tip box."""
    return html.Div(
        [
            _section_heading("Select Time Range"),
            time_slider,
            html.Div(
                "Adjust the time range to filter leaderboard results by model download times.",
                style={
                    "fontSize": 13,
                    "color": "#555",
                    "marginTop": "32px",
                },
            ),
            # Tip section
            _build_tip_box(),
        ],
        style={
            "flex": 2,
            "minWidth": "320px",
            "display": "flex",
            "flexDirection": "column",
            "justifyContent": "center",
            "height": "100%",
        },
    )


def _leaderboard_tab(label):
    """Build one leaderboard tab.

    Component ids are derived from the lower-cased label, e.g. "Countries"
    gives ``loading-countries``, ``top_countries-table`` and
    ``top_countries-toggle``; the callbacks below target those ids.
    """
    key = label.lower()
    return dcc.Tab(
        label=label,
        value=label,
        style={
            "backgroundColor": "transparent",
            "border": "none",
            "padding": "10px 18px",
            "color": "#6B7280",
            "fontWeight": "500",
        },
        selected_style={
            "backgroundColor": "transparent",
            "border": "none",
            "padding": "10px 18px",
            "fontWeight": "700",
            "borderBottom": "3px solid #082030",
        },
        children=[
            html.Div(
                children="The model leaderboard assesses concentrations of power across three hierarchies: countries, developers, and models. Explore how downloads are distributed among these groups and identify key players shaping the open model ecosystem on Hugging Face.",
                style={
                    "fontSize": 14,
                    "marginTop": 18,
                    "marginBottom": 12,
                    "textAlign": "left",
                },
            ),
            dcc.Loading(
                id=f"loading-{key}",
                type="circle",
                color="#AC482A",
                children=html.Div(id=f"top_{key}-table"),
            ),
            html.Button(
                id=f"top_{key}-toggle",
                children=_LABEL_SHOW_50,
                n_clicks=0,
                style={**button_style, "border": "none"},
            ),
        ],
    )


# -- end layout builders --

# The two dcc.Store components hold the selected view (all_downloads or
# one_year_rolling) and the derived-author toggle state.
app.layout = dmc.MantineProvider(
    theme={
        "colorScheme": "light",
        "primaryColor": "blue",
        "fontFamily": "Inter, sans-serif",
    },
    children=[
        dcc.Store(id="selected-view", data="all_downloads"),
        dcc.Store(id="derived-author-toggle", data=True),  # Store for toggle state
        html.Div(
            [
                # Header
                _build_header(),
                # Title
                html.Div(
                    children="The Open Model Leaderboard",
                    style={
                        "fontSize": 40,
                        "fontWeight": "700",
                        "textAlign": "center",
                        "marginTop": 20,
                        "marginBottom": 20,
                    },
                ),
                # Intro / description below the title
                html.Div(
                    children="This leaderboard assesses concentrations of power in the open model ecosystem across three hierarchies: countries, developers, and models. Explore how downloads are distributed among these groups and identify key players shaping the open model ecosystem on Hugging Face.",
                    style={
                        "fontSize": 14,
                        "marginTop": 18,
                        "marginBottom": 12,
                        "marginLeft": 100,
                        "marginRight": 100,
                        "textAlign": "center",
                    },
                ),
                # Main content (filters)
                html.Div(
                    children=[
                        _build_view_controls(),
                        _build_time_controls(),
                    ],
                    style={
                        "display": "flex",
                        "gap": "24px",
                        "padding": "32px",
                        "alignItems": "flex-start",
                        "marginLeft": "100px",
                        "marginRight": "100px",
                        "backgroundColor": "#FFFBF9",
                        "borderRadius": "18px",
                    },
                ),
                # Leaderboard tabs
                html.Div(
                    [
                        dcc.Tabs(
                            id="leaderboard-tabs",
                            value="Countries",
                            children=[
                                _leaderboard_tab("Countries"),
                                _leaderboard_tab("Developers"),
                                _leaderboard_tab("Models"),
                            ],
                        ),
                    ],
                    style={
                        "borderRadius": "18px",
                        "padding": "32px",
                        "marginTop": "12px",
                        "marginBottom": "12px",
                        "marginLeft": "50px",
                        "marginRight": "50px",
                    },
                ),
            ],
            style={
                "fontFamily": "Inter",
                "backgroundColor": "#ffffff",
                "minHeight": "100vh",
            },
        )
    ],
)


# Callbacks for interactivity
# -- helper utilities to consolidate duplicated callback logic --
def _get_filtered_top_n_from_duckdb(slider_value, group_col, top_n, view="all_downloads"):
    """
    Query DuckDB directly to get top N entries with metadata.

    Aggregation is done in DuckDB so only the top rows are transferred.

    Args:
        slider_value: two epoch-second values bounding the 'time' column;
            when it is falsy or not of length 2 no time filter is applied.
        group_col: column to group by (must be a plain identifier).
        top_n: maximum number of rows to return.
        view: name of the DuckDB view to query; must be one of
            ``_ALLOWED_VIEWS``.

    Returns:
        DataFrame ordered by ``total_downloads`` descending.

    Raises:
        ValueError: if ``view`` is not an allowed view or ``group_col`` is not
            a plain identifier.
    """
    # `view` originates from a dcc.Store callback input, which the browser can
    # set to anything, and it is interpolated into the SQL text below.
    if view not in _ALLOWED_VIEWS:
        raise ValueError(f"Unknown view: {view!r}")
    if not isinstance(group_col, str) or not group_col.isidentifier():
        raise ValueError(f"Invalid group column: {group_col!r}")
    top_n = int(top_n)

    # Build time filter clause. The bounds are converted to Timestamps first,
    # so only a formatted timestamp ever reaches the SQL text.
    time_clause = ""
    if slider_value and len(slider_value) == 2:
        start = pd.to_datetime(slider_value[0], unit="s")
        end = pd.to_datetime(slider_value[1], unit="s")
        time_clause = f"WHERE time >= '{start}' AND time <= '{end}'"

    # Build the aggregation query to get top N with all needed metadata.
    # This query groups by the target column and aggregates downloads
    # while collecting the metadata columns alongside.
    # NOTE(review): when group_col is "org_country_single", base_data selects
    # that name twice (raw and normalized) - confirm which one DuckDB binds.
    query = f"""
    WITH base_data AS (
        SELECT
            {group_col},
            CASE
                WHEN org_country_single IN ('HF', 'United States of America') THEN 'United States of America'
                WHEN org_country_single IN ('International', 'Online') THEN 'International/Online'
                ELSE org_country_single
            END AS org_country_single,
            author,
            derived_author,
            merged_country_groups_single,
            merged_modality,
            downloads,
            model
        FROM {view}
        {time_clause}
    ),

    -- Compute the total downloads for all rows in the time range
    total_downloads_cte AS (
        SELECT SUM(downloads) AS total_downloads_all
        FROM base_data
    ),

    -- Compute per-group totals and their percentage of all downloads
    top_items AS (
        SELECT
            b.{group_col} AS name,
            SUM(b.downloads) AS total_downloads,
            ROUND(SUM(b.downloads) * 100.0 / t.total_downloads_all, 2) AS percent_of_total,

            -- Pick first non-null metadata values for reference
            ANY_VALUE(b.org_country_single) AS org_country_single,
            ANY_VALUE(b.author) AS author,
            ANY_VALUE(b.derived_author) AS derived_author,
            ANY_VALUE(b.merged_country_groups_single) AS merged_country_groups_single,
            ANY_VALUE(b.merged_modality) AS merged_modality,
            ANY_VALUE(b.model) AS model
        FROM base_data b
        CROSS JOIN total_downloads_cte t
        GROUP BY b.{group_col}, t.total_downloads_all
    )

    SELECT *
    FROM top_items
    ORDER BY total_downloads DESC
    LIMIT {top_n};
    """

    return con.execute(query).fetchdf()


def _toggle_button_triggered():
    """Tell whether the current callback was fired by an ``n_clicks`` input.

    Returns:
        True/False inside a Dash callback; None when called outside of one
        (no callback context is available).
    """
    try:
        triggered = callback_context.triggered
    except MissingCallbackContextException:
        return None
    return any(
        str(item.get("prop_id", "")).endswith(".n_clicks") for item in triggered
    )


def _advance_label(current_label):
    """Return (top_n, next_label) for a click on a button showing ``current_label``."""
    if "Show Top 50" in current_label:
        return 50, _LABEL_SHOW_100
    if "Show Top 100" in current_label:
        return 100, _LABEL_SHOW_LESS
    return 10, _LABEL_SHOW_50


def _displayed_top_n(current_label):
    """Return the table size that goes with a button showing ``current_label``."""
    if "Show Top 100" in current_label:
        return 50
    if "Show Less" in current_label:
        return 100
    return 10


def _leaderboard_callback_logic(
    n_clicks,
    slider_value,
    current_label,
    group_col,
    filename,
    default_label="▼ Show Top 50",
    chip_color="#F0F9FF",
    view="all_downloads",
    derived_author_toggle=True,
):
    """Shared body of the three leaderboard callbacks.

    The table size cycles 10 -> 50 -> 100 -> 10 on each click of the toggle
    button. When the callback is fired by any other input (time slider, view,
    derived-author toggle) the current size and label are kept, so changing a
    filter no longer advances the cycle.

    Returns:
        (table content from ``render_table_content``, button label).
    """
    # Normalize label on first load
    if current_label is None:
        current_label = default_label

    clicked = _toggle_button_triggered()
    if clicked is None:
        # No callback context (direct call): fall back to the click count.
        clicked = bool(n_clicks)

    # Determine top_n and next label
    if clicked and n_clicks:
        top_n, new_label = _advance_label(current_label)
    else:
        top_n, new_label = _displayed_top_n(current_label), current_label

    # Get filtered and aggregated data directly from DuckDB
    df_filtered = _get_filtered_top_n_from_duckdb(
        slider_value, group_col, top_n, view=view
    )

    # Process the already-filtered data - pass derived_author_toggle
    df, download_df = get_top_n_leaderboard(
        df_filtered, group_col, top_n, derived_author_toggle=derived_author_toggle
    )
    return render_table_content(
        df, download_df, chip_color=chip_color, filename=filename
    ), new_label


# -- end helpers --


# --- Callback to store derived author toggle state ---
@app.callback(
    Output("derived-author-toggle", "data"),
    Input("derived-author-switch", "checked"),
)
def update_derived_author_toggle(checked):
    """Mirror the switch state into the ``derived-author-toggle`` store."""
    return checked


# Callbacks for interactivity (modularized)
@app.callback(
    Output("top_countries-table", "children"),
    Output("top_countries-toggle", "children"),
    Input("top_countries-toggle", "n_clicks"),
    Input("time-slider", "value"),
    Input("selected-view", "data"),
    Input("derived-author-toggle", "data"),
    State("top_countries-toggle", "children"),
)
def update_top_countries(n_clicks, slider_value, selected_view, derived_author_toggle, current_label):
    """Render the countries leaderboard and its toggle label."""
    return _leaderboard_callback_logic(
        n_clicks,
        slider_value,
        current_label,
        group_col="org_country_single",
        filename="top_countries",
        default_label=_LABEL_SHOW_50,
        chip_color="#F0F9FF",
        view=selected_view,
        derived_author_toggle=derived_author_toggle,
    )


@app.callback(
    Output("top_developers-table", "children"),
    Output("top_developers-toggle", "children"),
    Input("top_developers-toggle", "n_clicks"),
    Input("time-slider", "value"),
    Input("selected-view", "data"),
    Input("derived-author-toggle", "data"),
    State("top_developers-toggle", "children"),
)
def update_top_developers(n_clicks, slider_value, selected_view, derived_author_toggle, current_label):
    """Render the developers leaderboard and its toggle label."""
    # Use derived_author if toggle is True, else author
    group_col = "derived_author" if derived_author_toggle else "author"
    return _leaderboard_callback_logic(
        n_clicks,
        slider_value,
        current_label,
        group_col=group_col,
        filename="top_developers",
        default_label=_LABEL_SHOW_50,
        chip_color="#F0F9FF",
        view=selected_view,
        derived_author_toggle=derived_author_toggle,
    )


@app.callback(
    Output("top_models-table", "children"),
    Output("top_models-toggle", "children"),
    Input("top_models-toggle", "n_clicks"),
    Input("time-slider", "value"),
    Input("selected-view", "data"),
    Input("derived-author-toggle", "data"),
    State("top_models-toggle", "children"),
)
def update_top_models(n_clicks, slider_value, selected_view, derived_author_toggle, current_label):
    """Render the models leaderboard and its toggle label."""
    return _leaderboard_callback_logic(
        n_clicks,
        slider_value,
        current_label,
        group_col="model",
        filename="top_models",
        # Same default as the other tabs so the label cycle stays consistent.
        default_label=_LABEL_SHOW_50,
        chip_color="#F0F9FF",
        view=selected_view,
        derived_author_toggle=derived_author_toggle,
    )


@app.callback(
    Output("time-slider", "thumbChildren"),
    Input("time-slider", "value"),
)
def update_thumb_labels(values):
    """Refresh the date labels on the slider thumbs when the range changes."""
    return get_thumb_labels(values)


# --- Callback to update selected view based on segmented control ---
@app.callback(
    Output("selected-view", "data"),
    Input("segmented", "value"),
)
def update_selected_view(seg_value):
    """Map the segmented control value to the DuckDB view name."""
    if seg_value == "filtered-downloads":
        return "one_year_rolling"
    return "all_downloads"


# Run the app
if __name__ == "__main__":
    app.run(debug=True)