emsesc committed on
Commit
3cceb68
·
1 Parent(s): 35ffa10

Add derived author and text

Browse files
Files changed (2) hide show
  1. app.py +43 -26
  2. graphs/leaderboard.py +11 -7
app.py CHANGED
@@ -174,6 +174,7 @@ app.layout = dmc.MantineProvider(
174
  },
175
  children=[
176
  dcc.Store(id="selected-view", data="all_downloads"),
 
177
  html.Div(
178
  [
179
  # Header
@@ -184,7 +185,7 @@ app.layout = dmc.MantineProvider(
184
  html.Div(
185
  [
186
  html.Div(
187
- children="Visualizing the Open Model Ecosystem",
188
  style={
189
  "fontSize": 22,
190
  "fontWeight": "700",
@@ -192,7 +193,7 @@ app.layout = dmc.MantineProvider(
192
  },
193
  ),
194
  html.Div(
195
- children="An interactive dashboard to explore trends in open models on Hugging Face",
196
  style={
197
  "fontSize": 13,
198
  "marginTop": 6,
@@ -309,7 +310,7 @@ app.layout = dmc.MantineProvider(
309
  # Intro / description below header (kept but styled to match layout)
310
  # Title
311
  html.Div(
312
- children="Model Leaderboard",
313
  style={
314
  "fontSize": 40,
315
  "fontWeight": "700",
@@ -319,7 +320,7 @@ app.layout = dmc.MantineProvider(
319
  },
320
  ),
321
  html.Div(
322
- children="The model leaderboard assesses concentrations of power across three hierarchies: countries, developers, and models. Explore how downloads are distributed among these groups and identify key players shaping the open model ecosystem on Hugging Face.",
323
  style={
324
  "fontSize": 14,
325
  "marginTop": 18,
@@ -335,7 +336,7 @@ app.layout = dmc.MantineProvider(
335
  html.Div(
336
  [
337
  html.Div(
338
- "Select Window",
339
  style={
340
  "fontWeight": "700",
341
  "marginBottom": 8,
@@ -360,7 +361,7 @@ app.layout = dmc.MantineProvider(
360
  mb=10,
361
  ),
362
  html.Div(
363
- "Choose whether to view all downloads or only filtered downloads for the leaderboard.",
364
  style={
365
  "fontSize": 13,
366
  "color": "#555",
@@ -371,7 +372,7 @@ app.layout = dmc.MantineProvider(
371
  html.Div(
372
  [
373
  html.Div(
374
- "Select Mode",
375
  style={
376
  "fontWeight": "700",
377
  "marginBottom": 8,
@@ -379,13 +380,14 @@ app.layout = dmc.MantineProvider(
379
  },
380
  ),
381
  dmc.Switch(
 
382
  color="#AC482A",
383
  label="Derived Authors",
384
  checked=True,
385
  mb=10,
386
  ),
387
  html.Div(
388
- "Switch between absolute numbers and relative percentages for leaderboard values.",
389
  style={
390
  "fontSize": 13,
391
  "color": "#555",
@@ -418,7 +420,7 @@ app.layout = dmc.MantineProvider(
418
  ),
419
  time_slider,
420
  html.Div(
421
- "Adjust the time range to filter leaderboard results by model release date.",
422
  style={
423
  "fontSize": 13,
424
  "color": "#555",
@@ -453,12 +455,10 @@ app.layout = dmc.MantineProvider(
453
  html.Span("All Downloads", style={"fontWeight": "600", "color": "#AC482A"}),
454
  " and ",
455
  html.Span("Filtered Downloads", style={"fontWeight": "600", "color": "#AC482A"}),
456
- " to compare ecosystem-wide vs. curated model trends. ",
457
- "You can also toggle between ",
458
- html.Span("Absolute", style={"fontWeight": "600", "color": "#AC482A"}),
459
- " and ",
460
- html.Span("Relative", style={"fontWeight": "600", "color": "#AC482A"}),
461
- " to see raw counts or percentages."
462
  ],
463
  style={
464
  "fontSize": 13,
@@ -672,12 +672,12 @@ def _get_filtered_top_n_from_duckdb(slider_value, group_col, top_n, view="all_do
672
  SELECT
673
  {group_col},
674
  CASE
675
- WHEN org_country_single = 'HF' THEN 'United States of America'
676
- WHEN org_country_single = 'International' THEN 'International/Online'
677
- WHEN org_country_single = 'Online' THEN 'International/Online'
678
  ELSE org_country_single
679
  END AS org_country_single,
680
  author,
 
681
  merged_country_groups_single,
682
  merged_modality,
683
  downloads,
@@ -701,6 +701,7 @@ def _get_filtered_top_n_from_duckdb(slider_value, group_col, top_n, view="all_do
701
  -- Pick first non-null metadata values for reference
702
  ANY_VALUE(b.org_country_single) AS org_country_single,
703
  ANY_VALUE(b.author) AS author,
 
704
  ANY_VALUE(b.merged_country_groups_single) AS merged_country_groups_single,
705
  ANY_VALUE(b.merged_modality) AS merged_modality,
706
  ANY_VALUE(b.model) AS model
@@ -727,6 +728,7 @@ def _leaderboard_callback_logic(
727
  default_label="▼ Show Top 50",
728
  chip_color="#F0F9FF",
729
  view="all_downloads",
 
730
  ):
731
  # Normalize label on first load
732
  if current_label is None:
@@ -746,8 +748,8 @@ def _leaderboard_callback_logic(
746
  # Get filtered and aggregated data directly from DuckDB
747
  df_filtered = _get_filtered_top_n_from_duckdb(slider_value, group_col, top_n, view=view)
748
 
749
- # Process the already-filtered data
750
- df, download_df = get_top_n_leaderboard(df_filtered, group_col, top_n)
751
  return render_table_content(
752
  df, download_df, chip_color=chip_color, filename=filename
753
  ), new_label
@@ -756,6 +758,15 @@ def _leaderboard_callback_logic(
756
  # -- end helpers --
757
 
758
 
 
 
 
 
 
 
 
 
 
759
  # Callbacks for interactivity (modularized)
760
  @app.callback(
761
  Output("top_countries-table", "children"),
@@ -763,9 +774,10 @@ def _leaderboard_callback_logic(
763
  Input("top_countries-toggle", "n_clicks"),
764
  Input("time-slider", "value"),
765
  Input("selected-view", "data"),
 
766
  State("top_countries-toggle", "children"),
767
  )
768
- def update_top_countries(n_clicks, slider_value, selected_view, current_label):
769
  return _leaderboard_callback_logic(
770
  n_clicks,
771
  slider_value,
@@ -775,6 +787,7 @@ def update_top_countries(n_clicks, slider_value, selected_view, current_label):
775
  default_label="▼ Show Top 50",
776
  chip_color="#F0F9FF",
777
  view=selected_view,
 
778
  )
779
 
780
 
@@ -784,18 +797,22 @@ def update_top_countries(n_clicks, slider_value, selected_view, current_label):
784
  Input("top_developers-toggle", "n_clicks"),
785
  Input("time-slider", "value"),
786
  Input("selected-view", "data"),
 
787
  State("top_developers-toggle", "children"),
788
  )
789
- def update_top_developers(n_clicks, slider_value, selected_view, current_label):
 
 
790
  return _leaderboard_callback_logic(
791
  n_clicks,
792
  slider_value,
793
  current_label,
794
- group_col="author",
795
  filename="top_developers",
796
  default_label="▼ Show Top 50",
797
  chip_color="#F0F9FF",
798
  view=selected_view,
 
799
  )
800
 
801
 
@@ -805,9 +822,10 @@ def update_top_developers(n_clicks, slider_value, selected_view, current_label):
805
  Input("top_models-toggle", "n_clicks"),
806
  Input("time-slider", "value"),
807
  Input("selected-view", "data"),
 
808
  State("top_models-toggle", "children"),
809
  )
810
- def update_top_models(n_clicks, slider_value, selected_view, current_label):
811
  return _leaderboard_callback_logic(
812
  n_clicks,
813
  slider_value,
@@ -817,6 +835,7 @@ def update_top_models(n_clicks, slider_value, selected_view, current_label):
817
  default_label="▼ Show More",
818
  chip_color="#F0F9FF",
819
  view=selected_view,
 
820
  )
821
 
822
 
@@ -840,5 +859,3 @@ def update_selected_view(seg_value):
840
  # Run the app
841
  if __name__ == "__main__":
842
  app.run(debug=True)
843
- if __name__ == "__main__":
844
- app.run(debug=True)
 
174
  },
175
  children=[
176
  dcc.Store(id="selected-view", data="all_downloads"),
177
+ dcc.Store(id="derived-author-toggle", data=True), # Store for toggle state
178
  html.Div(
179
  [
180
  # Header
 
185
  html.Div(
186
  [
187
  html.Div(
188
+ children="Economies of Open Intelligence",
189
  style={
190
  "fontSize": 22,
191
  "fontWeight": "700",
 
193
  },
194
  ),
195
  html.Div(
196
+ children="Tracing Power & Participation in the Model Ecosystem",
197
  style={
198
  "fontSize": 13,
199
  "marginTop": 6,
 
310
  # Intro / description below header (kept but styled to match layout)
311
  # Title
312
  html.Div(
313
+ children="The Open Model Leaderboard",
314
  style={
315
  "fontSize": 40,
316
  "fontWeight": "700",
 
320
  },
321
  ),
322
  html.Div(
323
+ children="This leaderboard assesses concentrations of power in the open model ecosystem across three hierarchies: countries, developers, and models. Explore how downloads are distributed among these groups and identify key players shaping the open model ecosystem on Hugging Face.",
324
  style={
325
  "fontSize": 14,
326
  "marginTop": 18,
 
336
  html.Div(
337
  [
338
  html.Div(
339
+ "Select Download View",
340
  style={
341
  "fontWeight": "700",
342
  "marginBottom": 8,
 
361
  mb=10,
362
  ),
363
  html.Div(
364
+ "Choose whether to view all downloads or only those within one year of the model's creation date.",
365
  style={
366
  "fontSize": 13,
367
  "color": "#555",
 
372
  html.Div(
373
  [
374
  html.Div(
375
+ "Select Author Type",
376
  style={
377
  "fontWeight": "700",
378
  "marginBottom": 8,
 
380
  },
381
  ),
382
  dmc.Switch(
383
+ id="derived-author-switch", # <-- add id
384
  color="#AC482A",
385
  label="Derived Authors",
386
  checked=True,
387
  mb=10,
388
  ),
389
  html.Div(
390
+ "Toggle between viewing downloads by original authors or derived authors (those who forked or adapted models).",
391
  style={
392
  "fontSize": 13,
393
  "color": "#555",
 
420
  ),
421
  time_slider,
422
  html.Div(
423
+ "Adjust the time range to filter leaderboard results by model download times.",
424
  style={
425
  "fontSize": 13,
426
  "color": "#555",
 
455
  html.Span("All Downloads", style={"fontWeight": "600", "color": "#AC482A"}),
456
  " and ",
457
  html.Span("Filtered Downloads", style={"fontWeight": "600", "color": "#AC482A"}),
458
+ " to compare overall popularity versus early interest after model release. ",
459
+ "You can also toggle ON ",
460
+ html.Span("Derived Authors", style={"fontWeight": "600", "color": "#AC482A"}),
461
+ " to see how derivative works contribute to developer influence.",
 
 
462
  ],
463
  style={
464
  "fontSize": 13,
 
672
  SELECT
673
  {group_col},
674
  CASE
675
+ WHEN org_country_single IN ('HF', 'United States of America') THEN 'United States of America'
676
+ WHEN org_country_single IN ('International', 'Online') THEN 'International/Online'
 
677
  ELSE org_country_single
678
  END AS org_country_single,
679
  author,
680
+ derived_author,
681
  merged_country_groups_single,
682
  merged_modality,
683
  downloads,
 
701
  -- Pick first non-null metadata values for reference
702
  ANY_VALUE(b.org_country_single) AS org_country_single,
703
  ANY_VALUE(b.author) AS author,
704
+ ANY_VALUE(b.derived_author) AS derived_author,
705
  ANY_VALUE(b.merged_country_groups_single) AS merged_country_groups_single,
706
  ANY_VALUE(b.merged_modality) AS merged_modality,
707
  ANY_VALUE(b.model) AS model
 
728
  default_label="▼ Show Top 50",
729
  chip_color="#F0F9FF",
730
  view="all_downloads",
731
+ derived_author_toggle=True,
732
  ):
733
  # Normalize label on first load
734
  if current_label is None:
 
748
  # Get filtered and aggregated data directly from DuckDB
749
  df_filtered = _get_filtered_top_n_from_duckdb(slider_value, group_col, top_n, view=view)
750
 
751
+ # Process the already-filtered data - pass derived_author_toggle
752
+ df, download_df = get_top_n_leaderboard(df_filtered, group_col, top_n, derived_author_toggle=derived_author_toggle)
753
  return render_table_content(
754
  df, download_df, chip_color=chip_color, filename=filename
755
  ), new_label
 
758
  # -- end helpers --
759
 
760
 
761
+ # --- Callback to store derived author toggle state ---
762
+ @app.callback(
763
+ Output("derived-author-toggle", "data"),
764
+ Input("derived-author-switch", "checked"),
765
+ )
766
+ def update_derived_author_toggle(checked):
767
+ return checked
768
+
769
+
770
  # Callbacks for interactivity (modularized)
771
  @app.callback(
772
  Output("top_countries-table", "children"),
 
774
  Input("top_countries-toggle", "n_clicks"),
775
  Input("time-slider", "value"),
776
  Input("selected-view", "data"),
777
+ Input("derived-author-toggle", "data"),
778
  State("top_countries-toggle", "children"),
779
  )
780
+ def update_top_countries(n_clicks, slider_value, selected_view, derived_author_toggle, current_label):
781
  return _leaderboard_callback_logic(
782
  n_clicks,
783
  slider_value,
 
787
  default_label="▼ Show Top 50",
788
  chip_color="#F0F9FF",
789
  view=selected_view,
790
+ derived_author_toggle=derived_author_toggle,
791
  )
792
 
793
 
 
797
  Input("top_developers-toggle", "n_clicks"),
798
  Input("time-slider", "value"),
799
  Input("selected-view", "data"),
800
+ Input("derived-author-toggle", "data"),
801
  State("top_developers-toggle", "children"),
802
  )
803
+ def update_top_developers(n_clicks, slider_value, selected_view, derived_author_toggle, current_label):
804
+ # Use derived_author if toggle is True, else author
805
+ group_col = "derived_author" if derived_author_toggle else "author"
806
  return _leaderboard_callback_logic(
807
  n_clicks,
808
  slider_value,
809
  current_label,
810
+ group_col=group_col,
811
  filename="top_developers",
812
  default_label="▼ Show Top 50",
813
  chip_color="#F0F9FF",
814
  view=selected_view,
815
+ derived_author_toggle=derived_author_toggle,
816
  )
817
 
818
 
 
822
  Input("top_models-toggle", "n_clicks"),
823
  Input("time-slider", "value"),
824
  Input("selected-view", "data"),
825
+ Input("derived-author-toggle", "data"),
826
  State("top_models-toggle", "children"),
827
  )
828
+ def update_top_models(n_clicks, slider_value, selected_view, derived_author_toggle, current_label):
829
  return _leaderboard_callback_logic(
830
  n_clicks,
831
  slider_value,
 
835
  default_label="▼ Show More",
836
  chip_color="#F0F9FF",
837
  view=selected_view,
838
+ derived_author_toggle=derived_author_toggle,
839
  )
840
 
841
 
 
859
  # Run the app
860
  if __name__ == "__main__":
861
  app.run(debug=True)
 
 
graphs/leaderboard.py CHANGED
@@ -36,9 +36,11 @@ country_emoji_fallback = {
36
  meta_cols_map = {
37
  "org_country_single": ["org_country_single"],
38
  "author": ["org_country_single", "author", "merged_country_groups_single"],
 
39
  "model": [
40
  "org_country_single",
41
  "author",
 
42
  "merged_country_groups_single",
43
  "merged_modality",
44
  "total_downloads",
@@ -281,7 +283,7 @@ def render_table_content(
281
  )
282
 
283
  # Function to get top N leaderboard (now accepts pandas DataFrame from DuckDB query)
284
- def get_top_n_leaderboard(filtered_df, group_col, top_n=10):
285
  """
286
  Get top N entries for a leaderboard
287
 
@@ -289,6 +291,7 @@ def get_top_n_leaderboard(filtered_df, group_col, top_n=10):
289
  filtered_df: Pandas DataFrame (already filtered by time from DuckDB query)
290
  group_col: Column to group by
291
  top_n: Number of top entries to return
 
292
 
293
  Returns:
294
  tuple: (display_df, download_df)
@@ -322,7 +325,6 @@ def get_top_n_leaderboard(filtered_df, group_col, top_n=10):
322
  name_data = filtered_df[filtered_df[group_col] == name]
323
  meta_map[name] = {}
324
  download_map[name] = {}
325
-
326
  for col in meta_cols:
327
  if col in name_data.columns:
328
  unique_vals = name_data[col].unique()
@@ -350,8 +352,9 @@ def get_top_n_leaderboard(filtered_df, group_col, top_n=10):
350
  flag_emoji = country_emoji_fallback.get(c, "🌍")
351
  chips.append((flag_emoji, c, "country"))
352
 
353
- # Author
354
- for a in meta.get("author", []):
 
355
  icon = company_icon_map.get(a, "")
356
  if icon == "":
357
  if meta.get("merged_country_groups_single", ["User"])[0] != "User":
@@ -428,12 +431,12 @@ def get_top_n_from_duckdb(con, group_col, top_n=10, time_filter=None, view="all_
428
  SELECT
429
  {group_col},
430
  CASE
431
- WHEN org_country_single = 'HF' THEN 'United States of America'
432
- WHEN org_country_single = 'International' THEN 'International/Online'
433
- WHEN org_country_single = 'Online' THEN 'International/Online'
434
  ELSE org_country_single
435
  END AS org_country_single,
436
  author,
 
437
  merged_country_groups_single,
438
  merged_modality,
439
  downloads,
@@ -457,6 +460,7 @@ def get_top_n_from_duckdb(con, group_col, top_n=10, time_filter=None, view="all_
457
  -- Pick first non-null metadata values for reference
458
  ANY_VALUE(b.org_country_single) AS org_country_single,
459
  ANY_VALUE(b.author) AS author,
 
460
  ANY_VALUE(b.merged_country_groups_single) AS merged_country_groups_single,
461
  ANY_VALUE(b.merged_modality) AS merged_modality,
462
  ANY_VALUE(b.model) AS model
 
36
  meta_cols_map = {
37
  "org_country_single": ["org_country_single"],
38
  "author": ["org_country_single", "author", "merged_country_groups_single"],
39
+ "derived_author": ["org_country_single", "derived_author", "merged_country_groups_single"],
40
  "model": [
41
  "org_country_single",
42
  "author",
43
+ "derived_author",
44
  "merged_country_groups_single",
45
  "merged_modality",
46
  "total_downloads",
 
283
  )
284
 
285
  # Function to get top N leaderboard (now accepts pandas DataFrame from DuckDB query)
286
+ def get_top_n_leaderboard(filtered_df, group_col, top_n=10, derived_author_toggle=True):
287
  """
288
  Get top N entries for a leaderboard
289
 
 
291
  filtered_df: Pandas DataFrame (already filtered by time from DuckDB query)
292
  group_col: Column to group by
293
  top_n: Number of top entries to return
294
+ derived_author_toggle: Whether to use derived_author or author column
295
 
296
  Returns:
297
  tuple: (display_df, download_df)
 
325
  name_data = filtered_df[filtered_df[group_col] == name]
326
  meta_map[name] = {}
327
  download_map[name] = {}
 
328
  for col in meta_cols:
329
  if col in name_data.columns:
330
  unique_vals = name_data[col].unique()
 
352
  flag_emoji = country_emoji_fallback.get(c, "🌍")
353
  chips.append((flag_emoji, c, "country"))
354
 
355
+ # Author - use derived_author_toggle to determine which column
356
+ author_key = "derived_author" if derived_author_toggle else "author"
357
+ for a in meta.get(author_key, []):
358
  icon = company_icon_map.get(a, "")
359
  if icon == "":
360
  if meta.get("merged_country_groups_single", ["User"])[0] != "User":
 
431
  SELECT
432
  {group_col},
433
  CASE
434
+ WHEN org_country_single IN ('HF', 'United States of America') THEN 'United States of America'
435
+ WHEN org_country_single IN ('International', 'Online') THEN 'International/Online'
 
436
  ELSE org_country_single
437
  END AS org_country_single,
438
  author,
439
+ derived_author,
440
  merged_country_groups_single,
441
  merged_modality,
442
  downloads,
 
460
  -- Pick first non-null metadata values for reference
461
  ANY_VALUE(b.org_country_single) AS org_country_single,
462
  ANY_VALUE(b.author) AS author,
463
+ ANY_VALUE(b.derived_author) AS derived_author,
464
  ANY_VALUE(b.merged_country_groups_single) AS merged_country_groups_single,
465
  ANY_VALUE(b.merged_modality) AS merged_modality,
466
  ANY_VALUE(b.model) AS model