|
|
from dash import Dash, html, dcc, Input, Output, State |
|
|
import pandas as pd |
|
|
import dash_mantine_components as dmc |
|
|
import duckdb |
|
|
import time |
|
|
from graphs.leaderboard import button_style, get_top_n_leaderboard, render_table_content |
|
|
from dash_iconify import DashIconify |
|
|
|
|
|
|
|
|
# Dash application instance; every @app.callback below registers against it.
app = Dash()

# Underlying server object exposed at module level
# (presumably so a WSGI host can import it -- confirm against deployment config).
server = app.server
|
|
|
|
|
|
|
|
|
|
|
def get_last_updated():
    """Return the newest ``time`` value in ``all_downloads`` as display text.

    The value is formatted like ``"Jan 05, 2024"``. The string ``"N/A"`` is
    returned when the maximum is null or when the lookup fails for any
    reason (this is a best-effort header label, so errors are not raised).
    """
    fallback = "N/A"
    try:
        frame = con.execute(
            "SELECT MAX(time) as max_time FROM all_downloads"
        ).fetchdf()
        latest = frame["max_time"].iloc[0]
        if not pd.isnull(latest):
            return pd.to_datetime(latest).strftime("%b %d, %Y")
        return fallback
    except Exception:
        return fallback
|
|
|
|
|
|
|
|
def load_parquet_to_duckdb(con, parquet_url, view_name):
    """Expose a remote parquet file as a DuckDB view and report its time span.

    Args:
        con: Connection object providing ``execute(sql)``.
        parquet_url: URL handed to ``read_parquet``.
        view_name: Name of the view to create or replace.

    Returns:
        ``(start_dt, end_dt)``: the minimum and maximum of the view's
        ``time`` column, each passed through ``pd.to_datetime``.
    """
    # The httpfs extension is installed and loaded before the remote read.
    for statement in ("INSTALL httpfs;", "LOAD httpfs;"):
        con.execute(statement)

    con.execute(f"""
        CREATE OR REPLACE VIEW {view_name} AS
        SELECT * FROM read_parquet('{parquet_url}')
    """)

    bounds = con.execute(
        f"SELECT MIN(time) as min_time, MAX(time) as max_time FROM {view_name}"
    ).fetchdf()
    start_dt, end_dt = (
        pd.to_datetime(bounds[column].iloc[0])
        for column in ("min_time", "max_time")
    )
    return start_dt, end_dt
|
|
|
|
|
|
|
|
|
|
|
# Single in-memory DuckDB connection shared by the whole module.
con = duckdb.connect(database=":memory:", read_only=False)

# Switch off both caches on this connection.
for _cache_setting in ("enable_http_metadata_cache", "enable_object_cache"):
    con.execute(f"SET {_cache_setting} = false;")

# Dataset location on the Hugging Face Hub.
HF_DATASET_ID = "mmpr/open_model_evolution_data"
hf_parquet_url_1 = "https://huggingface.co/datasets/mmpr/open_model_evolution_data/resolve/main/all_downloads_with_annotations.parquet"
hf_parquet_url_2 = "https://huggingface.co/datasets/mmpr/open_model_evolution_data/resolve/main/one_year_rolling.parquet"

print(f"Attempting to connect to dataset from Hugging Face Hub: {HF_DATASET_ID}")
try:
    overall_start_time = time.time()

    # Register both parquet files as views; the first view's time span
    # (start_dt / end_dt) is what the date slider is built from below.
    start_dt, end_dt = load_parquet_to_duckdb(
        con, hf_parquet_url_1, "all_downloads"
    )
    start_dt2, end_dt2 = load_parquet_to_duckdb(
        con, hf_parquet_url_2, "one_year_rolling"
    )

    msg = f"Successfully connected to datasets in {time.time() - overall_start_time:.2f}s."
    print(msg)
except Exception as e:
    # Report the failure, then re-raise so module import fails.
    err_msg = f"Failed to load dataset(s). Error: {e}"
    print(err_msg)
    raise

# Slider bounds as whole epoch seconds.
start_ts = int(start_dt.timestamp())
end_ts = int(end_dt.timestamp())
|
|
|
|
|
|
|
|
def ordinal(n):
    """Return ``n`` followed by its English ordinal suffix (1st, 2nd, 11th...)."""
    # Values whose last two digits fall in 10..20 always take "th"
    # (this covers 11th, 12th and 13th).
    in_teens = 10 <= n % 100 <= 20
    suffix = "th" if in_teens else {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")
    return f"{n}{suffix}"
|
|
|
|
|
|
|
|
def format_date(dt):
    """Format ``dt`` as abbreviated month, ordinal day and year, e.g. ``Jan 1st, 2024``."""
    month_abbr = dt.strftime("%b")
    day_with_suffix = ordinal(dt.day)
    return f"{month_abbr} {day_with_suffix}, {dt.year}"
|
|
|
|
|
|
|
|
def _jan_first_ts(year):
    """Epoch seconds of 1 January of ``year``."""
    return int(pd.Timestamp(year=year, month=1, day=1).timestamp())


# Slider marks: the data's first month, one label per intermediate
# 1 January, and the data's last month. The 1 January of the starting year
# and any 1 January equal to the end bound are skipped.
_skipped_year_marks = (_jan_first_ts(start_dt.year), end_ts)
marks = [
    {"value": start_ts, "label": start_dt.strftime("%b %Y")},
    *(
        {"value": _jan_first_ts(yr), "label": str(yr)}
        for yr in range(start_dt.year, end_dt.year + 1)
        if _jan_first_ts(yr) not in _skipped_year_marks
    ),
    {"value": end_ts, "label": end_dt.strftime("%b %Y")},
]
|
|
|
|
|
|
|
|
def get_thumb_labels(values):
    """Build the two date labels attached to the range-slider thumbs.

    Args:
        values: Sequence whose first two items are epoch-second timestamps.

    Returns:
        A list of two ``html.Div`` elements, one per thumb, each showing the
        formatted date for its timestamp.
    """
    first_ts, second_ts = values[0], values[1]

    # Thumbs closer than 4 * 30 days are treated as overlapping; in that
    # case the first label moves above the track instead of below it.
    thumbs_overlap = abs(second_ts - first_ts) < 4 * 30 * 86400
    label_tops = ("-38px" if thumbs_overlap else "14px", "14px")

    shared_style = {
        "background": "#fff",
        "color": "#082030",
        "fontWeight": "bold",
        "fontSize": "13px",
        "borderRadius": "8px",
        "padding": "2px 8px",
        "boxShadow": "0 1px 4px rgba(8,32,48,0.10)",
        "position": "absolute",
        "left": "50%",
        "transform": "translateX(-50%)",
        "whiteSpace": "nowrap",
        "zIndex": 100,
    }

    return [
        html.Div(
            format_date(pd.to_datetime(ts, unit="s")),
            style={**shared_style, "top": top},
        )
        for ts, top in zip((first_ts, second_ts), label_tops)
    ]
|
|
|
|
|
|
|
|
|
|
|
# Date-range slider over the full span of the data, moving in one-day steps.
_SLIDER_STEP_SECONDS = 24 * 60 * 60
_full_range = [start_ts, end_ts]

time_slider = dmc.RangeSlider(
    id="time-slider",
    # Range and initial selection.
    min=start_ts,
    max=end_ts,
    value=_full_range,
    step=_SLIDER_STEP_SECONDS,
    marks=marks,
    # Appearance.
    color="#AC482A",
    size="md",
    radius="xl",
    style={"width": "95%", "paddingLeft": "60px"},
    # The built-in label is disabled; the thumbs carry custom date labels.
    label=None,
    showLabelOnHover=False,
    labelTransitionProps={"transition": "fade", "duration": 150},
    thumbChildren=get_thumb_labels([start_ts, end_ts]),
)
|
|
|
|
|
|
|
|
# Look shared by the three leaderboard tabs (unselected / selected).
_TAB_STYLE = {
    "backgroundColor": "transparent",
    "border": "none",
    "padding": "10px 18px",
    "color": "#6B7280",
    "fontWeight": "500",
}
_TAB_SELECTED_STYLE = {
    "backgroundColor": "transparent",
    "border": "none",
    "padding": "10px 18px",
    "fontWeight": "700",
    "borderBottom": "3px solid #082030",
}

_PAPER_URL = "https://www.dataprovenance.org/economies-of-open-intelligence.pdf"


def _meta_var(text):
    """Inline span naming a metadata field inside a tab description."""
    return html.Span(text, className="meta-var")


def _tip_highlight(text):
    """Inline span emphasising a control name inside the tip box."""
    return html.Span(text, className="tip-highlight")


def _segmented(control_id, initial, options):
    """Segmented control in the dashboard accent colour.

    ``options`` is a sequence of ``(value, label)`` pairs.
    """
    return dmc.SegmentedControl(
        id=control_id,
        value=initial,
        color="#AC482A",
        transitionDuration=200,
        data=[{"value": value, "label": label} for value, label in options],
        mb=10,
    )


def _build_header():
    """Top bar: live indicator, last-updated stamp and external links."""
    status_row = html.Div(
        [
            html.Span(
                [
                    html.Span(className="live-dot"),
                    html.Span("LIVE", className="live-label"),
                ],
                className="live-row",
            ),
            html.Span(
                f"Last updated: {get_last_updated()}",
                className="last-updated",
            ),
        ],
        className="header-status-row",
    )

    links_row = html.Div(
        [
            html.A(
                children=[
                    html.Img(
                        src="assets/images/dpi.svg",
                        className="header-logo-img",
                    ),
                    "Data Provenance Initiative",
                ],
                href="https://www.dataprovenance.org/",
                target="_blank",
                className="no-bg-link header-link",
            ),
            html.A(
                children=[
                    html.Img(
                        src="assets/images/hf.svg",
                        className="header-logo-img",
                    ),
                    html.Span("Hugging Face", className="hf-brand-text"),
                ],
                href="https://huggingface.co/",
                target="_blank",
                className="no-bg-link header-link",
            ),
            html.A(
                children=[
                    html.Span("Read the paper", className="paper-text"),
                ],
                href=_PAPER_URL,
                target="_blank",
                className="no-bg-link header-link paper-link",
            ),
        ],
        className="header-links-row",
    )

    return html.Div(
        [status_row, links_row],
        style={
            "display": "flex",
            "justifyContent": "space-between",
            "alignItems": "center",
            "padding": "18px 24px",
            "gap": "24px",
            "backgroundColor": "#082030",
        },
        className="responsive-header",
    )


def _build_title_row():
    """Dismissible data-source note followed by the page title."""
    data_note = dmc.Alert(
        icon=DashIconify(
            icon="mdi:information-outline",
            width=18,
            height=18,
            style={"color": "#1A5F8D"},
        ),
        children=[
            "Note: This dashboard uses ",
            html.A(
                "public Hugging Face",
                href="https://huggingface.co/datasets/hfmlsoc/hub_weekly_snapshots",
                target="_blank",
                style={
                    "color": "#1A5F8D",
                    "fontWeight": "bold",
                    "textDecoration": "underline",
                },
            ),
            " download data, which is less precise than data analyzed in the paper.",
        ],
        color="blue",
        radius="md",
        variant="light",
        withCloseButton=True,
        style={
            "marginTop": "16px",
            "marginBottom": "8px",
            "fontSize": "15px",
            "fontWeight": "500",
            "marginLeft": "auto",
            "marginRight": "auto",
        },
    )

    title = html.Span(
        "The Open Model Leaderboard",
        style={
            "fontSize": 40,
            "fontWeight": "700",
            "textAlign": "center",
            "marginTop": "20px",
            "marginBottom": "20px",
        },
    )

    return html.Div(
        children=[data_note, title],
        style={
            "display": "flex",
            "flexDirection": "column",
            "alignItems": "center",
            "justifyContent": "center",
            "gap": "12px",
            "marginTop": "20px",
            "marginBottom": "20px",
        },
        className="responsive-title-row",
    )


def _build_intro():
    """Introductory paragraph with a link to the accompanying paper."""
    return html.Div(
        children=[
            "This leaderboard assesses concentrations of power in the open model ecosystem through ranking user downloads across three groups: countries, developers, and models. Explore how user downloads of models are distributed among these groups and identify key players shaping the open model ecosystem on Hugging Face. This dashboard accompanies the paper titled ",
            html.A(
                "Economies of Open Intelligence: Tracing Power & Participation in the Model Ecosystem.",
                href=_PAPER_URL,
                target="_blank",
                style={
                    "color": "#AC482A",
                    "fontWeight": "700",
                    "textDecoration": "underline",
                },
            ),
        ],
        style={
            "fontSize": 14,
            "marginTop": 18,
            "marginBottom": 12,
            "marginLeft": 100,
            "marginRight": 100,
            "textAlign": "center",
        },
        className="responsive-intro",
    )


def _build_filters_panel():
    """Left column: download-view and model-attribution toggles."""
    info_icon = DashIconify(
        icon="mdi:information-outline",
        width=16,
        height=16,
        style={
            "marginLeft": "6px",
            "color": "#AC482A",
            "verticalAlign": "middle",
        },
    )
    view_label = html.Div(
        html.Span(
            [
                "Download View",
                dmc.HoverCard(
                    width=260,
                    shadow="md",
                    position="top",
                    children=[
                        dmc.HoverCardTarget(
                            html.Span(info_icon, style={"cursor": "pointer"})
                        ),
                        dmc.HoverCardDropdown(
                            dmc.Text(
                                "We believe this filter isolates more authentic usage, mitigating the impact of automatic software downloads for older models.",
                                size="sm",
                                style={"maxWidth": "240px"},
                            )
                        ),
                    ],
                ),
            ],
            className="filter-label-row",
        ),
        className="filter-label-container",
    )

    view_toggle = html.Div(
        [
            _segmented(
                "segmented",
                "all-downloads",
                [
                    ("all-downloads", "All Downloads"),
                    ("filtered-downloads", html.Span(["Filtered Downloads"])),
                ],
            ),
        ],
        className="filter-segmented-row",
    )

    view_help = html.Div(
        "Choose whether to count all downloads, or only downloads up to one year from model creation.",
        className="filter-description",
    )

    attribution_section = html.Div(
        [
            html.Div("Model Attribution", className="filter-label"),
            _segmented(
                "model-attribution-segmented",
                "uploader",
                [
                    ("uploader", "Model Uploader"),
                    ("original_creator", "Original Model Creator"),
                ],
            ),
            html.Div(
                "Toggle between having downloads attributed to the account that uploaded the model, or the account that uploaded the model that this was originally derived from.",
                className="filter-description",
            ),
        ],
        style={"marginTop": "10px"},
    )

    toggle_status = html.Span(
        id="global-toggle-status",
        className="global-toggle-status",
    )

    return html.Div(
        [view_label, view_toggle, view_help, attribution_section, toggle_status],
        className="main-content-left",
    )


def _build_date_range_panel():
    """Right column: date-range slider plus the usage tip box."""
    tip_title = html.Div(
        [
            DashIconify(
                icon="mdi:lightbulb-on-outline",
                width=20,
                height=20,
                style={
                    "marginRight": "8px",
                    "color": "#082030",
                },
            ),
            html.Span("Tip"),
        ],
        className="tip-title",
    )
    tip_body = html.Div(
        [
            "Try switching between ",
            _tip_highlight("All Downloads"),
            " and ",
            _tip_highlight("Filtered Downloads"),
            " to compare net popularity (but many duplicate, unused downloads) versus more immediate interest as models are released. ",
            "You can also toggle between ",
            _tip_highlight("Model Uploader"),
            " and ",
            _tip_highlight("Original Model Creator"),
            " to see how attribution affects perceived popularity.",
        ],
        className="tip-description",
    )

    return html.Div(
        [
            html.Div("Download Date Range", className="filter-label"),
            time_slider,
            html.Div(
                "Adjust the time range to filter leaderboard results by when models were downloaded by users.",
                className="filter-description filter-description-margin",
            ),
            html.Div([tip_title, tip_body], className="tip-section"),
        ],
        className="main-content-right",
    )


def _leaderboard_tab(label, slug, description):
    """One leaderboard tab: description, loading-wrapped table and size toggle.

    ``slug`` feeds the component ids (``loading-<slug>``, ``top_<slug>-table``
    and ``top_<slug>-toggle``) that the callbacks in this module target.
    """
    return dcc.Tab(
        label=label,
        value=label,
        style=dict(_TAB_STYLE),
        selected_style=dict(_TAB_SELECTED_STYLE),
        children=[
            html.Div(children=description, className="tab-description"),
            html.Div(
                dcc.Loading(
                    id=f"loading-{slug}",
                    type="circle",
                    color="#AC482A",
                    children=html.Div(id=f"top_{slug}-table"),
                ),
                className="responsive-table-wrapper",
            ),
            html.Button(
                id=f"top_{slug}-toggle",
                children="▼ Show Top 50",
                n_clicks=0,
                style={**button_style, "border": "none"},
            ),
        ],
    )


def _build_tabs():
    """Tabbed area holding the country, developer and model leaderboards."""
    countries = _leaderboard_tab(
        "Countries",
        "countries",
        [
            "The country leaderboard shows how downloads are distributed across different nations, highlighting which countries are leading in model usage and adoption. The metadata includes the ",
            _meta_var("country"),
            " and number of ",
            _meta_var("user downloads"),
            ".",
        ],
    )
    developers = _leaderboard_tab(
        "Developers",
        "developers",
        [
            "The developer leaderboard highlights the most influential model creators on Hugging Face, showcasing which developers have garnered the highest download counts for their models. The metadata includes the ",
            _meta_var("developer"),
            ", number of ",
            _meta_var("user downloads"),
            ", and ",
            _meta_var("country"),
            ".",
        ],
    )
    models = _leaderboard_tab(
        "Models",
        "models",
        [
            "The model leaderboard ranks individual models based on their download counts, revealing which models are most popular among users on Hugging Face. The metadata includes the ",
            _meta_var("model name"),
            ", number of ",
            _meta_var("user downloads"),
            ", ",
            _meta_var("developer"),
            ", and ",
            _meta_var("modality"),
            " (the input and output types of the model).",
        ],
    )

    return html.Div(
        [
            dcc.Tabs(
                id="leaderboard-tabs",
                value="Countries",
                children=[countries, developers, models],
            ),
        ],
        style={
            "borderRadius": "18px",
            "padding": "32px",
            "marginTop": "12px",
            "marginBottom": "12px",
            "marginLeft": "50px",
            "marginRight": "50px",
        },
        className="responsive-tabs",
    )


# Page layout: two client-side stores that carry the selected view and
# attribution mode, followed by the visible page sections in display order.
app.layout = dmc.MantineProvider(
    theme={
        "colorScheme": "light",
        "primaryColor": "blue",
        "fontFamily": "Inter, sans-serif",
    },
    children=[
        dcc.Store(id="selected-view", data="all_downloads"),
        dcc.Store(id="model-attribution-type", data="uploader"),
        html.Div(
            [
                _build_header(),
                _build_title_row(),
                _build_intro(),
                html.Div(
                    children=[
                        _build_filters_panel(),
                        _build_date_range_panel(),
                    ],
                    style={
                        "display": "flex",
                        "gap": "24px",
                        "padding": "32px",
                        "alignItems": "flex-start",
                        "marginLeft": "100px",
                        "marginRight": "100px",
                        "backgroundColor": "#FFFBF9",
                        "borderRadius": "18px",
                    },
                    className="responsive-main-content",
                ),
                _build_tabs(),
            ],
            style={
                "fontFamily": "Inter",
                "backgroundColor": "#ffffff",
                "minHeight": "100vh",
            },
        ),
    ],
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Single source of truth for normalising raw ``org_country_single`` values.
# It is used both for the group key and for the country column so the two
# can never disagree about which raw values are merged.
_COUNTRY_CASE_SQL = """CASE
                WHEN org_country_single IN ('HF', 'United States of America') THEN 'United States of America'
                WHEN org_country_single IN ('International', 'Online', 'Online?') THEN 'International/Online'
                ELSE org_country_single
            END"""

# Views this module registers in DuckDB. ``view`` reaches the query from a
# browser-side dcc.Store, so it is checked against this list before being
# interpolated into SQL.
_ALLOWED_VIEWS = frozenset({"all_downloads", "one_year_rolling"})


def _build_top_n_query(slider_value, group_col, top_n, view):
    """Validate the inputs and return the top-N aggregation SQL text.

    Raises:
        ValueError: If ``view`` is not a known view, ``group_col`` is not a
            plain identifier, or ``top_n`` is not a non-negative integer.
    """
    if not (isinstance(view, str) and view in _ALLOWED_VIEWS):
        raise ValueError(f"Unknown view: {view!r}")
    if not (isinstance(group_col, str) and group_col.isidentifier()):
        raise ValueError(f"Invalid group column: {group_col!r}")
    try:
        limit = int(top_n)
    except (TypeError, ValueError):
        raise ValueError(f"Invalid top_n: {top_n!r}") from None
    if limit < 0:
        raise ValueError(f"Invalid top_n: {top_n!r}")

    # Optional time filter. The bounds are rendered from parsed Timestamps,
    # never from the raw slider payload, so the quoted literals cannot carry
    # arbitrary text.
    time_clause = ""
    if slider_value and len(slider_value) == 2:
        start = pd.to_datetime(slider_value[0], unit="s")
        end = pd.to_datetime(slider_value[1], unit="s")
        time_clause = f"WHERE time >= '{start}' AND time <= '{end}'"

    # Countries are grouped on the normalised name; every other grouping
    # uses the column as-is.
    if group_col == "org_country_single":
        group_expr = _COUNTRY_CASE_SQL
    else:
        group_expr = group_col

    if group_col == "derived_author":
        # For this grouping the displayed country comes from a per-author
        # lookup (the country on that author's highest-download row),
        # falling back to any country seen within the group.
        lookup_cte = """
        -- Create a lookup table for derived_author -> country
        author_country_lookup AS (
            SELECT DISTINCT
                author,
                FIRST_VALUE(org_country_single) OVER (PARTITION BY author ORDER BY downloads DESC) AS author_country
            FROM base_data
            WHERE author IS NOT NULL
        ),
        """
        country_expr = "COALESCE(acl.author_country, ANY_VALUE(b.org_country_single))"
        lookup_join = "LEFT JOIN author_country_lookup acl ON b.group_key = acl.author"
        group_by = "b.group_key, acl.author_country, t.total_downloads_all"
    else:
        lookup_cte = ""
        country_expr = "ANY_VALUE(b.org_country_single)"
        lookup_join = ""
        group_by = "b.group_key, t.total_downloads_all"

    return f"""
        WITH base_data AS (
            SELECT
                {group_expr} AS group_key,
                {_COUNTRY_CASE_SQL} AS org_country_single,
                author,
                derived_author,
                merged_country_groups_single,
                merged_modality,
                downloads,
                model
            FROM {view}
            {time_clause}
        ),
        {lookup_cte}
        total_downloads_cte AS (
            SELECT SUM(downloads) AS total_downloads_all
            FROM base_data
        ),

        top_items AS (
            SELECT
                b.group_key AS name,
                SUM(b.downloads) AS total_downloads,
                ROUND(SUM(b.downloads) * 100.0 / t.total_downloads_all, 2) AS percent_of_total,
                {country_expr} AS org_country_single,
                ANY_VALUE(b.author) AS author,
                ANY_VALUE(b.derived_author) AS derived_author,
                ANY_VALUE(b.merged_country_groups_single) AS merged_country_groups_single,
                ANY_VALUE(b.merged_modality) AS merged_modality,
                ANY_VALUE(b.model) AS model
            FROM base_data b
            CROSS JOIN total_downloads_cte t
            {lookup_join}
            GROUP BY {group_by}
        )

        SELECT *
        FROM top_items
        ORDER BY total_downloads DESC
        LIMIT {limit};
    """


def _get_filtered_top_n_from_duckdb(
    slider_value, group_col, top_n, view="all_downloads"
):
    """Query DuckDB for the top N groups by downloads, with metadata.

    Aggregation happens inside DuckDB so only the final rows are
    transferred to pandas.

    Args:
        slider_value: Optional two-item sequence of epoch-second bounds;
            when it is falsy or not of length two, no time filter is applied.
        group_col: Column to group by. ``"org_country_single"`` groups on the
            normalised country name and ``"derived_author"`` adds a
            per-author country lookup.
        top_n: Maximum number of rows to return.
        view: Name of the DuckDB view to read; must be one of the views this
            module registers.

    Returns:
        A DataFrame with columns ``name``, ``total_downloads``,
        ``percent_of_total`` and one representative value per metadata
        column, ordered by ``total_downloads`` descending.

    Raises:
        ValueError: If ``view``, ``group_col`` or ``top_n`` fail validation.
            These values are interpolated into SQL and ``view`` originates
            in the browser, so they are never passed through unchecked.
    """
    query = _build_top_n_query(slider_value, group_col, top_n, view)
    return con.execute(query).fetchdf()
|
|
|
|
|
|
|
|
def _toggle_button_triggered():
    """Return True when the running callback was fired by a "-toggle" button.

    Outside a Dash callback context (for example a direct call) there is no
    trigger information, so True is returned and every call is treated as a
    click.
    """
    from dash import ctx
    from dash.exceptions import MissingCallbackContextException

    try:
        triggered_id = ctx.triggered_id
    except MissingCallbackContextException:
        return True
    return isinstance(triggered_id, str) and triggered_id.endswith("-toggle")


def _no_data_message():
    """Placeholder element shown when a query yields no rows."""
    return html.Div(
        "No data found in this time range. Try broadening the download date range.",
        style={"padding": "18px", "fontSize": "16px", "color": "#082030"},
    )


def _leaderboard_callback_logic(
    n_clicks,
    slider_value,
    current_label,
    group_col,
    filename,
    default_label="▼ Show Top 50",
    chip_color="#F0F9FF",
    view="all_downloads",
    derived_author_toggle=True,
):
    """Shared body of the three leaderboard callbacks.

    The toggle button cycles the table through 10 -> 50 -> 100 -> 10 rows;
    its label always names the *next* step, so the label alone identifies
    the size currently on screen.

    Args:
        n_clicks: Click count of the toggle button (``None`` counts as 0).
        slider_value: Two epoch-second bounds from the date slider.
        current_label: Current text of the toggle button.
        group_col: Column the leaderboard groups by.
        filename: Passed through to ``render_table_content``.
        default_label: Label assumed when ``current_label`` is ``None``.
        chip_color: Passed through to ``render_table_content``.
        view: DuckDB view to query.
        derived_author_toggle: Passed through to ``get_top_n_leaderboard``.

    Returns:
        ``(table_content, new_button_label)``.
    """
    if current_label is None:
        current_label = default_label

    if not n_clicks:
        # Never clicked: initial size, label untouched.
        top_n, new_label = 10, current_label
    elif _toggle_button_triggered():
        # A real click on the toggle: advance one step through the cycle.
        if "Show Top 50" in current_label:
            top_n, new_label = 50, "▼ Show Top 100"
        elif "Show Top 100" in current_label:
            top_n, new_label = 100, "▲ Show Less"
        else:
            top_n, new_label = 10, "▼ Show Top 50"
    else:
        # Fired by the slider, view or attribution inputs: re-query at the
        # size already on screen instead of advancing the cycle.
        new_label = current_label
        if "Show Top 100" in current_label:
            top_n = 50
        elif "Show Less" in current_label:
            top_n = 100
        else:
            top_n = 10

    df_filtered = _get_filtered_top_n_from_duckdb(
        slider_value, group_col, top_n, view=view
    )
    if df_filtered is None or df_filtered.empty:
        return _no_data_message(), new_label

    df, download_df = get_top_n_leaderboard(
        df_filtered, group_col, top_n, derived_author_toggle=derived_author_toggle
    )
    if df is None or (hasattr(df, "empty") and df.empty):
        return _no_data_message(), new_label

    table = render_table_content(
        df, download_df, chip_color=chip_color, filename=filename
    )
    return table, new_label
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.callback(
    Output("model-attribution-type", "data"),
    Input("model-attribution-segmented", "value"),
)
def update_model_attribution_type(selected_value):
    """Copy the attribution control's value into the ``model-attribution-type`` store."""
    return selected_value
|
|
|
|
|
|
|
|
|
|
|
@app.callback(
    Output("top_countries-table", "children"),
    Output("top_countries-toggle", "children"),
    Input("top_countries-toggle", "n_clicks"),
    Input("time-slider", "value"),
    Input("selected-view", "data"),
    Input("model-attribution-type", "data"),
    State("top_countries-toggle", "children"),
)
def update_top_countries(
    n_clicks, slider_value, selected_view, attribution_type, current_label
):
    """Render the country leaderboard table and its toggle label."""
    uploader_mode = attribution_type == "uploader"
    return _leaderboard_callback_logic(
        n_clicks=n_clicks,
        slider_value=slider_value,
        current_label=current_label,
        group_col="org_country_single",
        filename="top_countries",
        default_label="▼ Show Top 50",
        chip_color="#F0F9FF",
        view=selected_view,
        derived_author_toggle=uploader_mode,
    )
|
|
|
|
|
|
|
|
@app.callback(
    Output("top_developers-table", "children"),
    Output("top_developers-toggle", "children"),
    Input("top_developers-toggle", "n_clicks"),
    Input("time-slider", "value"),
    Input("selected-view", "data"),
    Input("model-attribution-type", "data"),
    State("top_developers-toggle", "children"),
)
def update_top_developers(
    n_clicks, slider_value, selected_view, attribution_type, current_label
):
    """Render the developer leaderboard table and its toggle label.

    The grouping column follows the attribution mode: ``derived_author``
    for ``"uploader"``, ``author`` for any other value.
    """
    uploader_mode = attribution_type == "uploader"
    return _leaderboard_callback_logic(
        n_clicks=n_clicks,
        slider_value=slider_value,
        current_label=current_label,
        group_col="derived_author" if uploader_mode else "author",
        filename="top_developers",
        default_label="▼ Show Top 50",
        chip_color="#F0F9FF",
        view=selected_view,
        derived_author_toggle=uploader_mode,
    )
|
|
|
|
|
|
|
|
@app.callback(
    Output("top_models-table", "children"),
    Output("top_models-toggle", "children"),
    Input("top_models-toggle", "n_clicks"),
    Input("time-slider", "value"),
    Input("selected-view", "data"),
    Input("model-attribution-type", "data"),
    State("top_models-toggle", "children"),
)
def update_top_models(
    n_clicks, slider_value, selected_view, attribution_type, current_label
):
    """Render the model leaderboard table and its toggle label.

    Args:
        n_clicks: Click count of the ``top_models-toggle`` button.
        slider_value: Two epoch-second bounds from the date slider.
        selected_view: Name of the DuckDB view to query.
        attribution_type: Value of the ``model-attribution-type`` store.
        current_label: Current text of the toggle button.

    Returns:
        ``(table_content, new_button_label)``.
    """
    return _leaderboard_callback_logic(
        n_clicks,
        slider_value,
        current_label,
        group_col="model",
        filename="top_models",
        # Must match the button's initial label and the labels the toggle
        # cycle recognises; any other text would not be matched as
        # "Show Top 50" and the first click would be spent resetting it.
        default_label="▼ Show Top 50",
        chip_color="#F0F9FF",
        view=selected_view,
        derived_author_toggle=(attribution_type == "uploader"),
    )
|
|
|
|
|
|
|
|
@app.callback(
    Output("time-slider", "thumbChildren"),
    Input("time-slider", "value"),
)
def update_thumb_labels(values):
    """Rebuild the thumb date labels whenever the slider selection changes."""
    thumb_labels = get_thumb_labels(values)
    return thumb_labels
|
|
|
|
|
|
|
|
|
|
|
@app.callback(
    Output("selected-view", "data"),
    Input("segmented", "value"),
)
def update_selected_view(seg_value):
    """Map the download-view control to the DuckDB view name to query.

    ``"filtered-downloads"`` selects ``one_year_rolling``; any other value
    selects ``all_downloads``.
    """
    return "one_year_rolling" if seg_value == "filtered-downloads" else "all_downloads"
|
|
|
|
|
|
|
|
|
|
|
# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    # NOTE(review): debug=True is hard-coded here -- confirm this entry point
    # is only used for local development.
    app.run(debug=True)
|
|
|