fixed author attribution issue
Browse files
- data_utils.py +18 -81
- graphs/leaderboard.py +7 -10
- layout_components.py +0 -1
data_utils.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import duckdb
|
| 2 |
import pandas as pd
|
| 3 |
|
| 4 |
-
from config import
|
| 5 |
|
| 6 |
|
| 7 |
def create_fresh_duckdb_with_views(parquet_url_1: str = HF_PARQUET_URL_1, parquet_url_2: str = HF_PARQUET_URL_2):
|
|
@@ -62,6 +62,7 @@ def build_leaderboard_query(
|
|
| 62 |
end_str=None,
|
| 63 |
date_str=None,
|
| 64 |
view="all_downloads",
|
|
|
|
| 65 |
):
|
| 66 |
"""Build the SQL query string for leaderboard data."""
|
| 67 |
is_alltime = date_str is not None
|
|
@@ -84,89 +85,25 @@ def build_leaderboard_query(
|
|
| 84 |
- COALESCE(MAX(CASE WHEN time < '{start_str}' THEN downloadsAllTime END), 0)
|
| 85 |
AS total_downloads"""
|
| 86 |
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
merged_modality,
|
| 101 |
-
model,
|
| 102 |
-
time,
|
| 103 |
-
downloadsAllTime
|
| 104 |
-
FROM {view}
|
| 105 |
-
{base_where}
|
| 106 |
-
),
|
| 107 |
-
|
| 108 |
-
author_country_lookup AS (
|
| 109 |
-
SELECT DISTINCT
|
| 110 |
-
derived_author,
|
| 111 |
-
FIRST_VALUE(org_country_single) OVER (PARTITION BY derived_author ORDER BY downloadsAllTime DESC) AS derived_author_country
|
| 112 |
-
FROM base_data
|
| 113 |
-
WHERE derived_author IS NOT NULL
|
| 114 |
-
),
|
| 115 |
-
|
| 116 |
-
author_merged_country_lookup AS (
|
| 117 |
-
SELECT DISTINCT
|
| 118 |
-
derived_author,
|
| 119 |
-
FIRST_VALUE(merged_country_groups_single) OVER (PARTITION BY derived_author ORDER BY downloadsAllTime DESC) AS derived_author_merged_country
|
| 120 |
-
FROM base_data
|
| 121 |
-
WHERE derived_author IS NOT NULL
|
| 122 |
-
),
|
| 123 |
-
|
| 124 |
-
model_metrics AS (
|
| 125 |
-
SELECT
|
| 126 |
-
model,
|
| 127 |
-
group_key,
|
| 128 |
-
ANY_VALUE(org_country_single) AS org_country_single,
|
| 129 |
-
ANY_VALUE(author) AS author,
|
| 130 |
-
ANY_VALUE(derived_author) AS derived_author,
|
| 131 |
-
ANY_VALUE(merged_country_groups_single) AS merged_country_groups_single,
|
| 132 |
-
ANY_VALUE(merged_modality) AS merged_modality,
|
| 133 |
-
{downloads_calc}
|
| 134 |
-
FROM base_data
|
| 135 |
-
GROUP BY model, group_key
|
| 136 |
-
),
|
| 137 |
-
|
| 138 |
-
total_downloads_cte AS (
|
| 139 |
-
SELECT SUM(total_downloads) AS total_downloads_all FROM model_metrics
|
| 140 |
-
)
|
| 141 |
-
|
| 142 |
-
SELECT
|
| 143 |
-
mm.model,
|
| 144 |
-
mm.group_key,
|
| 145 |
-
acl.derived_author_country AS org_country_single,
|
| 146 |
-
amc.derived_author_merged_country AS merged_country_groups_single,
|
| 147 |
-
mm.author,
|
| 148 |
-
mm.derived_author,
|
| 149 |
-
mm.merged_modality,
|
| 150 |
-
mm.total_downloads,
|
| 151 |
-
CASE WHEN td.total_downloads_all = 0 THEN 0 ELSE ROUND(mm.total_downloads * 100.0 / td.total_downloads_all, 2) END AS percent_of_total
|
| 152 |
-
FROM model_metrics mm
|
| 153 |
-
LEFT JOIN author_country_lookup acl ON mm.group_key = acl.derived_author
|
| 154 |
-
LEFT JOIN author_merged_country_lookup amc ON mm.group_key = amc.derived_author
|
| 155 |
-
CROSS JOIN total_downloads_cte td
|
| 156 |
-
WHERE mm.total_downloads > 0
|
| 157 |
-
ORDER BY mm.total_downloads DESC
|
| 158 |
-
LIMIT {top_n * 10};
|
| 159 |
-
"""
|
| 160 |
|
| 161 |
return f"""
|
| 162 |
WITH base_data AS (
|
| 163 |
SELECT
|
| 164 |
{group_expr} AS group_key,
|
| 165 |
-
|
| 166 |
-
WHEN org_country_single IN ('HF', 'United States of America') THEN 'United States of America'
|
| 167 |
-
WHEN org_country_single IN ('International', 'Online', 'Online?') THEN 'International/Online'
|
| 168 |
-
ELSE org_country_single
|
| 169 |
-
END AS org_country_single,
|
| 170 |
author,
|
| 171 |
derived_author,
|
| 172 |
merged_country_groups_single,
|
|
@@ -214,7 +151,7 @@ def build_leaderboard_query(
|
|
| 214 |
"""
|
| 215 |
|
| 216 |
|
| 217 |
-
def get_top_n_from_duckdb(
|
| 218 |
"""Query DuckDB directly to get model-level rows with per-model total_downloads."""
|
| 219 |
if time_filter and len(time_filter) == 2:
|
| 220 |
start = pd.to_datetime(time_filter[0], unit="s")
|
|
@@ -225,7 +162,7 @@ def get_top_n_from_duckdb(con, group_col, top_n=10, time_filter=None, view="all_
|
|
| 225 |
|
| 226 |
start_str = str(start)
|
| 227 |
end_str = str(end)
|
| 228 |
-
query = build_leaderboard_query(group_col, top_n, start_str, end_str, view=view)
|
| 229 |
|
| 230 |
conn_local = create_fresh_duckdb_with_views()
|
| 231 |
try:
|
|
|
|
| 1 |
import duckdb
|
| 2 |
import pandas as pd
|
| 3 |
|
| 4 |
+
from config import HF_PARQUET_URL_1, HF_PARQUET_URL_2
|
| 5 |
|
| 6 |
|
| 7 |
def create_fresh_duckdb_with_views(parquet_url_1: str = HF_PARQUET_URL_1, parquet_url_2: str = HF_PARQUET_URL_2):
|
|
|
|
| 62 |
end_str=None,
|
| 63 |
date_str=None,
|
| 64 |
view="all_downloads",
|
| 65 |
+
derived_org_toggle=False,
|
| 66 |
):
|
| 67 |
"""Build the SQL query string for leaderboard data."""
|
| 68 |
is_alltime = date_str is not None
|
|
|
|
| 85 |
- COALESCE(MAX(CASE WHEN time < '{start_str}' THEN downloadsAllTime END), 0)
|
| 86 |
AS total_downloads"""
|
| 87 |
|
| 88 |
+
# Determine which org_country column to use
|
| 89 |
+
if derived_org_toggle:
|
| 90 |
+
org_country_case = """CASE
|
| 91 |
+
WHEN derived_org_country_single IN ('HF', 'United States of America') THEN 'United States of America'
|
| 92 |
+
WHEN derived_org_country_single IN ('International', 'Online', 'Online?') THEN 'International/Online'
|
| 93 |
+
ELSE derived_org_country_single
|
| 94 |
+
END"""
|
| 95 |
+
else:
|
| 96 |
+
org_country_case = """CASE
|
| 97 |
+
WHEN org_country_single IN ('HF', 'United States of America') THEN 'United States of America'
|
| 98 |
+
WHEN org_country_single IN ('International', 'Online', 'Online?') THEN 'International/Online'
|
| 99 |
+
ELSE org_country_single
|
| 100 |
+
END"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
return f"""
|
| 103 |
WITH base_data AS (
|
| 104 |
SELECT
|
| 105 |
{group_expr} AS group_key,
|
| 106 |
+
{org_country_case} AS org_country_single,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
author,
|
| 108 |
derived_author,
|
| 109 |
merged_country_groups_single,
|
|
|
|
| 151 |
"""
|
| 152 |
|
| 153 |
|
| 154 |
+
def get_top_n_from_duckdb(group_col, top_n=10, time_filter=None, view="all_downloads", derived_org_toggle=False):
|
| 155 |
"""Query DuckDB directly to get model-level rows with per-model total_downloads."""
|
| 156 |
if time_filter and len(time_filter) == 2:
|
| 157 |
start = pd.to_datetime(time_filter[0], unit="s")
|
|
|
|
| 162 |
|
| 163 |
start_str = str(start)
|
| 164 |
end_str = str(end)
|
| 165 |
+
query = build_leaderboard_query(group_col, top_n, start_str, end_str, view=view, derived_org_toggle=derived_org_toggle)
|
| 166 |
|
| 167 |
conn_local = create_fresh_duckdb_with_views()
|
| 168 |
try:
|
graphs/leaderboard.py
CHANGED
|
@@ -15,7 +15,7 @@ from helpers import format_large_number
|
|
| 15 |
# =============================
|
| 16 |
|
| 17 |
def get_filtered_top_n_from_duckdb(
|
| 18 |
-
slider_value, group_col, top_n, view="all_downloads"
|
| 19 |
):
|
| 20 |
"""
|
| 21 |
Query DuckDB to get model-level rows with per-model total_downloads (delta or full)
|
|
@@ -42,7 +42,7 @@ def get_filtered_top_n_from_duckdb(
|
|
| 42 |
end_str = str(end)
|
| 43 |
|
| 44 |
# Build query using shared function
|
| 45 |
-
query = build_leaderboard_query(group_col, top_n, start_str, end_str, view=view)
|
| 46 |
|
| 47 |
# execute using the fresh local connection
|
| 48 |
result_df = local_con.execute(query).fetchdf()
|
|
@@ -52,7 +52,7 @@ def get_filtered_top_n_from_duckdb(
|
|
| 52 |
|
| 53 |
|
| 54 |
def get_filtered_top_n_alltime_from_duckdb(
|
| 55 |
-
slider_value, group_col, top_n, view="all_downloads"
|
| 56 |
):
|
| 57 |
"""
|
| 58 |
Query DuckDB to get model-level rows with all-time (cumulative) total_downloads at a specific date.
|
|
@@ -76,7 +76,7 @@ def get_filtered_top_n_alltime_from_duckdb(
|
|
| 76 |
date_str = str(date)
|
| 77 |
|
| 78 |
# Build query using shared function for all-time
|
| 79 |
-
query = build_leaderboard_query(group_col, top_n, date_str=date_str, view=view)
|
| 80 |
|
| 81 |
# execute using the fresh local connection
|
| 82 |
result_df = local_con.execute(query).fetchdf()
|
|
@@ -120,11 +120,11 @@ def leaderboard_callback_logic(
|
|
| 120 |
# Use all-time query if is_alltime flag is True
|
| 121 |
if is_alltime:
|
| 122 |
df_filtered = get_filtered_top_n_alltime_from_duckdb(
|
| 123 |
-
slider_value, group_col, top_n, view=view
|
| 124 |
)
|
| 125 |
else:
|
| 126 |
df_filtered = get_filtered_top_n_from_duckdb(
|
| 127 |
-
slider_value, group_col, top_n, view=view
|
| 128 |
)
|
| 129 |
|
| 130 |
# If the SQL query returned no rows, ask user to broaden date range
|
|
@@ -557,11 +557,8 @@ def get_top_n_leaderboard(filtered_df, group_col, top_n=10, derived_author_toggl
|
|
| 557 |
|
| 558 |
return display_for_render, download_top
|
| 559 |
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
|
| 563 |
def get_top_n_from_duckdb(
|
| 564 |
-
|
| 565 |
):
|
| 566 |
"""
|
| 567 |
Query DuckDB directly to get model-level rows with per-model total_downloads (delta or full)
|
|
|
|
| 15 |
# =============================
|
| 16 |
|
| 17 |
def get_filtered_top_n_from_duckdb(
|
| 18 |
+
slider_value, group_col, top_n, view="all_downloads", derived_org_toggle=False
|
| 19 |
):
|
| 20 |
"""
|
| 21 |
Query DuckDB to get model-level rows with per-model total_downloads (delta or full)
|
|
|
|
| 42 |
end_str = str(end)
|
| 43 |
|
| 44 |
# Build query using shared function
|
| 45 |
+
query = build_leaderboard_query(group_col, top_n, start_str, end_str, view=view, derived_org_toggle=derived_org_toggle)
|
| 46 |
|
| 47 |
# execute using the fresh local connection
|
| 48 |
result_df = local_con.execute(query).fetchdf()
|
|
|
|
| 52 |
|
| 53 |
|
| 54 |
def get_filtered_top_n_alltime_from_duckdb(
|
| 55 |
+
slider_value, group_col, top_n, view="all_downloads", derived_org_toggle=False
|
| 56 |
):
|
| 57 |
"""
|
| 58 |
Query DuckDB to get model-level rows with all-time (cumulative) total_downloads at a specific date.
|
|
|
|
| 76 |
date_str = str(date)
|
| 77 |
|
| 78 |
# Build query using shared function for all-time
|
| 79 |
+
query = build_leaderboard_query(group_col, top_n, date_str=date_str, view=view, derived_org_toggle=derived_org_toggle)
|
| 80 |
|
| 81 |
# execute using the fresh local connection
|
| 82 |
result_df = local_con.execute(query).fetchdf()
|
|
|
|
| 120 |
# Use all-time query if is_alltime flag is True
|
| 121 |
if is_alltime:
|
| 122 |
df_filtered = get_filtered_top_n_alltime_from_duckdb(
|
| 123 |
+
slider_value, group_col, top_n, view=view, derived_org_toggle=derived_author_toggle
|
| 124 |
)
|
| 125 |
else:
|
| 126 |
df_filtered = get_filtered_top_n_from_duckdb(
|
| 127 |
+
slider_value, group_col, top_n, view=view, derived_org_toggle=derived_author_toggle
|
| 128 |
)
|
| 129 |
|
| 130 |
# If the SQL query returned no rows, ask user to broaden date range
|
|
|
|
| 557 |
|
| 558 |
return display_for_render, download_top
|
| 559 |
|
|
|
|
|
|
|
|
|
|
| 560 |
def get_top_n_from_duckdb(
|
| 561 |
+
group_col, top_n=10, time_filter=None, view="all_downloads"
|
| 562 |
):
|
| 563 |
"""
|
| 564 |
Query DuckDB directly to get model-level rows with per-model total_downloads (delta or full)
|
layout_components.py
CHANGED
|
@@ -4,7 +4,6 @@ from dash_iconify import DashIconify
|
|
| 4 |
|
| 5 |
from config import BUTTON_STYLE, DARK_BACKGROUND, PRIMARY_COLOR
|
| 6 |
|
| 7 |
-
|
| 8 |
def build_header(last_updated: str) -> html.Div:
|
| 9 |
"""Top header with live badge and partner logos."""
|
| 10 |
return html.Div(
|
|
|
|
| 4 |
|
| 5 |
from config import BUTTON_STYLE, DARK_BACKGROUND, PRIMARY_COLOR
|
| 6 |
|
|
|
|
| 7 |
def build_header(last_updated: str) -> html.Div:
|
| 8 |
"""Top header with live badge and partner logos."""
|
| 9 |
return html.Div(
|