emsesc committed on
Commit
aaa721d
·
1 Parent(s): 927a4de

fixed author attribution issue

Browse files
Files changed (3) hide show
  1. data_utils.py +18 -81
  2. graphs/leaderboard.py +7 -10
  3. layout_components.py +0 -1
data_utils.py CHANGED
@@ -1,7 +1,7 @@
1
  import duckdb
2
  import pandas as pd
3
 
4
- from config import DATASET_ID, HF_PARQUET_URL_1, HF_PARQUET_URL_2
5
 
6
 
7
  def create_fresh_duckdb_with_views(parquet_url_1: str = HF_PARQUET_URL_1, parquet_url_2: str = HF_PARQUET_URL_2):
@@ -62,6 +62,7 @@ def build_leaderboard_query(
62
  end_str=None,
63
  date_str=None,
64
  view="all_downloads",
 
65
  ):
66
  """Build the SQL query string for leaderboard data."""
67
  is_alltime = date_str is not None
@@ -84,89 +85,25 @@ def build_leaderboard_query(
84
  - COALESCE(MAX(CASE WHEN time < '{start_str}' THEN downloadsAllTime END), 0)
85
  AS total_downloads"""
86
 
87
- if group_col == "derived_author":
88
- return f"""
89
- WITH base_data AS (
90
- SELECT
91
- {group_expr} AS group_key,
92
- CASE
93
- WHEN org_country_single IN ('HF', 'United States of America') THEN 'United States of America'
94
- WHEN org_country_single IN ('International', 'Online', 'Online?') THEN 'International/Online'
95
- ELSE org_country_single
96
- END AS org_country_single,
97
- author,
98
- derived_author,
99
- merged_country_groups_single,
100
- merged_modality,
101
- model,
102
- time,
103
- downloadsAllTime
104
- FROM {view}
105
- {base_where}
106
- ),
107
-
108
- author_country_lookup AS (
109
- SELECT DISTINCT
110
- derived_author,
111
- FIRST_VALUE(org_country_single) OVER (PARTITION BY derived_author ORDER BY downloadsAllTime DESC) AS derived_author_country
112
- FROM base_data
113
- WHERE derived_author IS NOT NULL
114
- ),
115
-
116
- author_merged_country_lookup AS (
117
- SELECT DISTINCT
118
- derived_author,
119
- FIRST_VALUE(merged_country_groups_single) OVER (PARTITION BY derived_author ORDER BY downloadsAllTime DESC) AS derived_author_merged_country
120
- FROM base_data
121
- WHERE derived_author IS NOT NULL
122
- ),
123
-
124
- model_metrics AS (
125
- SELECT
126
- model,
127
- group_key,
128
- ANY_VALUE(org_country_single) AS org_country_single,
129
- ANY_VALUE(author) AS author,
130
- ANY_VALUE(derived_author) AS derived_author,
131
- ANY_VALUE(merged_country_groups_single) AS merged_country_groups_single,
132
- ANY_VALUE(merged_modality) AS merged_modality,
133
- {downloads_calc}
134
- FROM base_data
135
- GROUP BY model, group_key
136
- ),
137
-
138
- total_downloads_cte AS (
139
- SELECT SUM(total_downloads) AS total_downloads_all FROM model_metrics
140
- )
141
-
142
- SELECT
143
- mm.model,
144
- mm.group_key,
145
- acl.derived_author_country AS org_country_single,
146
- amc.derived_author_merged_country AS merged_country_groups_single,
147
- mm.author,
148
- mm.derived_author,
149
- mm.merged_modality,
150
- mm.total_downloads,
151
- CASE WHEN td.total_downloads_all = 0 THEN 0 ELSE ROUND(mm.total_downloads * 100.0 / td.total_downloads_all, 2) END AS percent_of_total
152
- FROM model_metrics mm
153
- LEFT JOIN author_country_lookup acl ON mm.group_key = acl.derived_author
154
- LEFT JOIN author_merged_country_lookup amc ON mm.group_key = amc.derived_author
155
- CROSS JOIN total_downloads_cte td
156
- WHERE mm.total_downloads > 0
157
- ORDER BY mm.total_downloads DESC
158
- LIMIT {top_n * 10};
159
- """
160
 
161
  return f"""
162
  WITH base_data AS (
163
  SELECT
164
  {group_expr} AS group_key,
165
- CASE
166
- WHEN org_country_single IN ('HF', 'United States of America') THEN 'United States of America'
167
- WHEN org_country_single IN ('International', 'Online', 'Online?') THEN 'International/Online'
168
- ELSE org_country_single
169
- END AS org_country_single,
170
  author,
171
  derived_author,
172
  merged_country_groups_single,
@@ -214,7 +151,7 @@ def build_leaderboard_query(
214
  """
215
 
216
 
217
- def get_top_n_from_duckdb(con, group_col, top_n=10, time_filter=None, view="all_downloads"):
218
  """Query DuckDB directly to get model-level rows with per-model total_downloads."""
219
  if time_filter and len(time_filter) == 2:
220
  start = pd.to_datetime(time_filter[0], unit="s")
@@ -225,7 +162,7 @@ def get_top_n_from_duckdb(con, group_col, top_n=10, time_filter=None, view="all_
225
 
226
  start_str = str(start)
227
  end_str = str(end)
228
- query = build_leaderboard_query(group_col, top_n, start_str, end_str, view=view)
229
 
230
  conn_local = create_fresh_duckdb_with_views()
231
  try:
 
1
  import duckdb
2
  import pandas as pd
3
 
4
+ from config import HF_PARQUET_URL_1, HF_PARQUET_URL_2
5
 
6
 
7
  def create_fresh_duckdb_with_views(parquet_url_1: str = HF_PARQUET_URL_1, parquet_url_2: str = HF_PARQUET_URL_2):
 
62
  end_str=None,
63
  date_str=None,
64
  view="all_downloads",
65
+ derived_org_toggle=False,
66
  ):
67
  """Build the SQL query string for leaderboard data."""
68
  is_alltime = date_str is not None
 
85
  - COALESCE(MAX(CASE WHEN time < '{start_str}' THEN downloadsAllTime END), 0)
86
  AS total_downloads"""
87
 
88
+ # Determine which org_country column to use
89
+ if derived_org_toggle:
90
+ org_country_case = """CASE
91
+ WHEN derived_org_country_single IN ('HF', 'United States of America') THEN 'United States of America'
92
+ WHEN derived_org_country_single IN ('International', 'Online', 'Online?') THEN 'International/Online'
93
+ ELSE derived_org_country_single
94
+ END"""
95
+ else:
96
+ org_country_case = """CASE
97
+ WHEN org_country_single IN ('HF', 'United States of America') THEN 'United States of America'
98
+ WHEN org_country_single IN ('International', 'Online', 'Online?') THEN 'International/Online'
99
+ ELSE org_country_single
100
+ END"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
  return f"""
103
  WITH base_data AS (
104
  SELECT
105
  {group_expr} AS group_key,
106
+ {org_country_case} AS org_country_single,
 
 
 
 
107
  author,
108
  derived_author,
109
  merged_country_groups_single,
 
151
  """
152
 
153
 
154
+ def get_top_n_from_duckdb(group_col, top_n=10, time_filter=None, view="all_downloads", derived_org_toggle=False):
155
  """Query DuckDB directly to get model-level rows with per-model total_downloads."""
156
  if time_filter and len(time_filter) == 2:
157
  start = pd.to_datetime(time_filter[0], unit="s")
 
162
 
163
  start_str = str(start)
164
  end_str = str(end)
165
+ query = build_leaderboard_query(group_col, top_n, start_str, end_str, view=view, derived_org_toggle=derived_org_toggle)
166
 
167
  conn_local = create_fresh_duckdb_with_views()
168
  try:
graphs/leaderboard.py CHANGED
@@ -15,7 +15,7 @@ from helpers import format_large_number
15
  # =============================
16
 
17
  def get_filtered_top_n_from_duckdb(
18
- slider_value, group_col, top_n, view="all_downloads"
19
  ):
20
  """
21
  Query DuckDB to get model-level rows with per-model total_downloads (delta or full)
@@ -42,7 +42,7 @@ def get_filtered_top_n_from_duckdb(
42
  end_str = str(end)
43
 
44
  # Build query using shared function
45
- query = build_leaderboard_query(group_col, top_n, start_str, end_str, view=view)
46
 
47
  # execute using the fresh local connection
48
  result_df = local_con.execute(query).fetchdf()
@@ -52,7 +52,7 @@ def get_filtered_top_n_from_duckdb(
52
 
53
 
54
  def get_filtered_top_n_alltime_from_duckdb(
55
- slider_value, group_col, top_n, view="all_downloads"
56
  ):
57
  """
58
  Query DuckDB to get model-level rows with all-time (cumulative) total_downloads at a specific date.
@@ -76,7 +76,7 @@ def get_filtered_top_n_alltime_from_duckdb(
76
  date_str = str(date)
77
 
78
  # Build query using shared function for all-time
79
- query = build_leaderboard_query(group_col, top_n, date_str=date_str, view=view)
80
 
81
  # execute using the fresh local connection
82
  result_df = local_con.execute(query).fetchdf()
@@ -120,11 +120,11 @@ def leaderboard_callback_logic(
120
  # Use all-time query if is_alltime flag is True
121
  if is_alltime:
122
  df_filtered = get_filtered_top_n_alltime_from_duckdb(
123
- slider_value, group_col, top_n, view=view
124
  )
125
  else:
126
  df_filtered = get_filtered_top_n_from_duckdb(
127
- slider_value, group_col, top_n, view=view
128
  )
129
 
130
  # If the SQL query returned no rows, ask user to broaden date range
@@ -557,11 +557,8 @@ def get_top_n_leaderboard(filtered_df, group_col, top_n=10, derived_author_toggl
557
 
558
  return display_for_render, download_top
559
 
560
-
561
-
562
-
563
  def get_top_n_from_duckdb(
564
- con, group_col, top_n=10, time_filter=None, view="all_downloads"
565
  ):
566
  """
567
  Query DuckDB directly to get model-level rows with per-model total_downloads (delta or full)
 
15
  # =============================
16
 
17
  def get_filtered_top_n_from_duckdb(
18
+ slider_value, group_col, top_n, view="all_downloads", derived_org_toggle=False
19
  ):
20
  """
21
  Query DuckDB to get model-level rows with per-model total_downloads (delta or full)
 
42
  end_str = str(end)
43
 
44
  # Build query using shared function
45
+ query = build_leaderboard_query(group_col, top_n, start_str, end_str, view=view, derived_org_toggle=derived_org_toggle)
46
 
47
  # execute using the fresh local connection
48
  result_df = local_con.execute(query).fetchdf()
 
52
 
53
 
54
  def get_filtered_top_n_alltime_from_duckdb(
55
+ slider_value, group_col, top_n, view="all_downloads", derived_org_toggle=False
56
  ):
57
  """
58
  Query DuckDB to get model-level rows with all-time (cumulative) total_downloads at a specific date.
 
76
  date_str = str(date)
77
 
78
  # Build query using shared function for all-time
79
+ query = build_leaderboard_query(group_col, top_n, date_str=date_str, view=view, derived_org_toggle=derived_org_toggle)
80
 
81
  # execute using the fresh local connection
82
  result_df = local_con.execute(query).fetchdf()
 
120
  # Use all-time query if is_alltime flag is True
121
  if is_alltime:
122
  df_filtered = get_filtered_top_n_alltime_from_duckdb(
123
+ slider_value, group_col, top_n, view=view, derived_org_toggle=derived_author_toggle
124
  )
125
  else:
126
  df_filtered = get_filtered_top_n_from_duckdb(
127
+ slider_value, group_col, top_n, view=view, derived_org_toggle=derived_author_toggle
128
  )
129
 
130
  # If the SQL query returned no rows, ask user to broaden date range
 
557
 
558
  return display_for_render, download_top
559
 
 
 
 
560
  def get_top_n_from_duckdb(
561
+ group_col, top_n=10, time_filter=None, view="all_downloads"
562
  ):
563
  """
564
  Query DuckDB directly to get model-level rows with per-model total_downloads (delta or full)
layout_components.py CHANGED
@@ -4,7 +4,6 @@ from dash_iconify import DashIconify
4
 
5
  from config import BUTTON_STYLE, DARK_BACKGROUND, PRIMARY_COLOR
6
 
7
-
8
  def build_header(last_updated: str) -> html.Div:
9
  """Top header with live badge and partner logos."""
10
  return html.Div(
 
4
 
5
  from config import BUTTON_STYLE, DARK_BACKGROUND, PRIMARY_COLOR
6
 
 
7
  def build_header(last_updated: str) -> html.Div:
8
  """Top header with live badge and partner logos."""
9
  return html.Div(