emsesc committed on
Commit
6ba1ddc
·
1 Parent(s): 5810b5b

buggy duckdb

Browse files
Files changed (3) hide show
  1. app.py +132 -51
  2. graphs/leaderboard.py +200 -65
  3. requirements.txt +2 -1
app.py CHANGED
@@ -1,7 +1,7 @@
1
  from dash import Dash, html, dcc, Input, Output, State
2
  import pandas as pd
3
  import dash_mantine_components as dmc
4
- from datasets import load_dataset
5
  import time
6
  from graphs.leaderboard import (
7
  create_leaderboard,
@@ -13,34 +13,48 @@ from graphs.leaderboard import (
13
  app = Dash()
14
  server = app.server
15
 
16
- # Load parquet file from Hugging Face
 
 
 
17
  HF_DATASET_ID = "emsesc/open_model_evolution_data"
18
- hf_parquet_url = "https://huggingface.co/datasets/emsesc/open_model_evolution_data/resolve/main/"
19
- data_files = {
20
- "filtered_df": hf_parquet_url + "filtered_df.parquet",
21
- # "weekly_df": hf_parquet_url + "weekly_df.parquet",
22
- }
23
- filtered_df = pd.DataFrame()
24
-
25
- print(f"Attempting to load dataset from Hugging Face Hub: {HF_DATASET_ID}")
26
  try:
27
  overall_start_time = time.time()
28
- dataset = load_dataset("parquet", data_files=data_files)
29
- df = dataset["filtered_df"].to_pandas()
30
- filtered_df = df.copy()
31
 
32
- msg = f"Successfully loaded dataset in {time.time() - overall_start_time:.2f}s."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  print(msg)
34
  except Exception as e:
35
  err_msg = f"Failed to load dataset. Error: {e}"
36
  print(err_msg)
37
-
38
- # List columns for reference
39
- print(filtered_df.columns.tolist())
40
 
41
  # Create a dcc slider for time range selection by year (readable marks)
42
- start_dt = filtered_df["time"].min()
43
- end_dt = filtered_df["time"].max()
44
  start_ts = int(start_dt.timestamp())
45
  end_ts = int(end_dt.timestamp())
46
 
@@ -72,10 +86,6 @@ time_slider = dmc.RangeSlider(
72
  marks=marks,
73
  style={"width": "70%", "margin": "0 auto"},
74
  labelAlwaysOn=False,
75
- # thumbChildren=[
76
- # dmc.Text(id="time-slider-thumb-from-label", size="xs", children="Hello"),
77
- # dmc.Text(id="time-slider-thumb-to-label", size="xs"),
78
- # ]
79
  )
80
 
81
  # App layout
@@ -191,7 +201,7 @@ app.layout = dmc.MantineProvider(
191
  # Intro / description below header (kept but styled to match layout)
192
  # Title
193
  html.Div(
194
- children="Model Leaderboard", # Change this to your desired title
195
  style={
196
  "fontSize": 40,
197
  "fontWeight": "700",
@@ -204,7 +214,7 @@ app.layout = dmc.MantineProvider(
204
  html.Div(
205
  children=[
206
  html.Button(
207
- "Read the paper", # Change this to your desired button text
208
  id="my-button",
209
  style={
210
  "padding": "10px 20px",
@@ -293,7 +303,6 @@ app.layout = dmc.MantineProvider(
293
  "gap": "24px",
294
  "padding": "32px",
295
  "alignItems": "flex-start",
296
- # 'margin': '24px auto 64px', # centered horizontally
297
  "marginLeft": "100px",
298
  "marginRight": "100px",
299
  "backgroundColor": "#FFFBF9",
@@ -305,7 +314,7 @@ app.layout = dmc.MantineProvider(
305
  dcc.Tabs(
306
  id="leaderboard-tabs",
307
  value="Countries",
308
- children=[ # wrap Tabs here
309
  dcc.Tab(
310
  label="Countries",
311
  value="Countries",
@@ -321,11 +330,9 @@ app.layout = dmc.MantineProvider(
321
  "border": "none",
322
  "padding": "10px 18px",
323
  "fontWeight": "700",
324
- "borderBottom": "3px solid #082030", # underline only
325
  },
326
- children=[
327
- create_leaderboard(filtered_df, "countries")
328
- ],
329
  ),
330
  dcc.Tab(
331
  label="Developers",
@@ -344,9 +351,7 @@ app.layout = dmc.MantineProvider(
344
  "fontWeight": "700",
345
  "borderBottom": "3px solid #082030",
346
  },
347
- children=[
348
- create_leaderboard(filtered_df, "developers")
349
- ],
350
  ),
351
  dcc.Tab(
352
  label="Models",
@@ -365,9 +370,7 @@ app.layout = dmc.MantineProvider(
365
  "fontWeight": "700",
366
  "borderBottom": "3px solid #082030",
367
  },
368
- children=[
369
- create_leaderboard(filtered_df, "models")
370
- ],
371
  ),
372
  ],
373
  ),
@@ -379,7 +382,6 @@ app.layout = dmc.MantineProvider(
379
  "marginBottom": "64px",
380
  "marginLeft": "50px",
381
  "marginRight": "50px",
382
- # 'maxWidth': '1250px',
383
  },
384
  ),
385
  ],
@@ -392,16 +394,88 @@ app.layout = dmc.MantineProvider(
392
  ],
393
  )
394
 
 
395
  # Callbacks for interactivity
396
  # -- helper utilities to consolidate duplicated callback logic --
397
- def _apply_time_slider(slider_value):
 
 
 
 
 
 
398
  if slider_value and len(slider_value) == 2:
399
  start = pd.to_datetime(slider_value[0], unit="s")
400
  end = pd.to_datetime(slider_value[1], unit="s")
401
- return filtered_df[(filtered_df["time"] >= start) & (filtered_df["time"] <= end)]
402
- return filtered_df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
403
 
404
- def _leaderboard_callback_logic(n_clicks, slider_value, current_label, group_col, filename, default_label="▼ Show Top 50", chip_color="#F0F9FF"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
405
  # Normalize label on first load
406
  if current_label is None:
407
  current_label = default_label
@@ -417,13 +491,20 @@ def _leaderboard_callback_logic(n_clicks, slider_value, current_label, group_col
417
  else:
418
  top_n, new_label = 10, "▼ Show Top 50"
419
 
420
- # Apply time filter and build table
421
- df_time = _apply_time_slider(slider_value)
422
- df, download_df = get_top_n_leaderboard(df_time, group_col, top_n)
423
- return render_table_content(df, download_df, chip_color=chip_color, filename=filename), new_label
 
 
 
 
 
 
 
 
424
  # -- end helpers --
425
 
426
- # ...existing code...
427
 
428
  # Callbacks for interactivity (modularized)
429
  @app.callback(
@@ -444,6 +525,7 @@ def update_top_countries(n_clicks, slider_value, current_label):
444
  chip_color="#F0F9FF",
445
  )
446
 
 
447
  @app.callback(
448
  Output("top_developers-table", "children"),
449
  Output("top_developers-toggle", "children"),
@@ -462,6 +544,7 @@ def update_top_developers(n_clicks, slider_value, current_label):
462
  chip_color="#F0F9FF",
463
  )
464
 
 
465
  @app.callback(
466
  Output("top_models-table", "children"),
467
  Output("top_models-toggle", "children"),
@@ -480,10 +563,8 @@ def update_top_models(n_clicks, slider_value, current_label):
480
  chip_color="#F0F9FF",
481
  )
482
 
483
- @app.callback(
484
- Output("time-slider", "label"),
485
- Input("time-slider", "value")
486
- )
487
  def update_range_labels(values):
488
  start_label = pd.to_datetime(values[0], unit="s").strftime("%b %Y")
489
  end_label = pd.to_datetime(values[1], unit="s").strftime("%b %Y")
 
1
  from dash import Dash, html, dcc, Input, Output, State
2
  import pandas as pd
3
  import dash_mantine_components as dmc
4
+ import duckdb
5
  import time
6
  from graphs.leaderboard import (
7
  create_leaderboard,
 
13
  app = Dash()
14
  server = app.server
15
 
16
+ # DuckDB connection (global)
17
+ con = duckdb.connect(database=":memory:", read_only=False)
18
+
19
+ # Load parquet file from Hugging Face using DuckDB
20
  HF_DATASET_ID = "emsesc/open_model_evolution_data"
21
+ hf_parquet_url = "https://huggingface.co/datasets/emsesc/open_model_evolution_data/resolve/main/filtered_df.parquet"
22
+
23
+ print(f"Attempting to connect to dataset from Hugging Face Hub: {HF_DATASET_ID}")
 
 
 
 
 
24
  try:
25
  overall_start_time = time.time()
 
 
 
26
 
27
+ # Install and load httpfs extension for remote file access
28
+ con.execute("INSTALL httpfs;")
29
+ con.execute("LOAD httpfs;")
30
+
31
+ # Create a view that references the remote parquet file
32
+ con.execute(f"""
33
+ CREATE OR REPLACE VIEW filtered_df AS
34
+ SELECT * FROM read_parquet('{hf_parquet_url}')
35
+ """)
36
+
37
+ # Get column list and basic info
38
+ columns = con.execute("DESCRIBE filtered_df").fetchdf()
39
+ print("Columns:", columns["column_name"].tolist())
40
+
41
+ # Get time range for slider
42
+ time_range = con.execute(
43
+ "SELECT MIN(time) as min_time, MAX(time) as max_time FROM filtered_df"
44
+ ).fetchdf()
45
+ start_dt = pd.to_datetime(time_range["min_time"].iloc[0])
46
+ end_dt = pd.to_datetime(time_range["max_time"].iloc[0])
47
+
48
+ msg = (
49
+ f"Successfully connected to dataset in {time.time() - overall_start_time:.2f}s."
50
+ )
51
  print(msg)
52
  except Exception as e:
53
  err_msg = f"Failed to load dataset. Error: {e}"
54
  print(err_msg)
55
+ raise
 
 
56
 
57
  # Create a dcc slider for time range selection by year (readable marks)
 
 
58
  start_ts = int(start_dt.timestamp())
59
  end_ts = int(end_dt.timestamp())
60
 
 
86
  marks=marks,
87
  style={"width": "70%", "margin": "0 auto"},
88
  labelAlwaysOn=False,
 
 
 
 
89
  )
90
 
91
  # App layout
 
201
  # Intro / description below header (kept but styled to match layout)
202
  # Title
203
  html.Div(
204
+ children="Model Leaderboard",
205
  style={
206
  "fontSize": 40,
207
  "fontWeight": "700",
 
214
  html.Div(
215
  children=[
216
  html.Button(
217
+ "Read the paper",
218
  id="my-button",
219
  style={
220
  "padding": "10px 20px",
 
303
  "gap": "24px",
304
  "padding": "32px",
305
  "alignItems": "flex-start",
 
306
  "marginLeft": "100px",
307
  "marginRight": "100px",
308
  "backgroundColor": "#FFFBF9",
 
314
  dcc.Tabs(
315
  id="leaderboard-tabs",
316
  value="Countries",
317
+ children=[
318
  dcc.Tab(
319
  label="Countries",
320
  value="Countries",
 
330
  "border": "none",
331
  "padding": "10px 18px",
332
  "fontWeight": "700",
333
+ "borderBottom": "3px solid #082030",
334
  },
335
+ children=[create_leaderboard(con, "countries")],
 
 
336
  ),
337
  dcc.Tab(
338
  label="Developers",
 
351
  "fontWeight": "700",
352
  "borderBottom": "3px solid #082030",
353
  },
354
+ children=[create_leaderboard(con, "developers")],
 
 
355
  ),
356
  dcc.Tab(
357
  label="Models",
 
370
  "fontWeight": "700",
371
  "borderBottom": "3px solid #082030",
372
  },
373
+ children=[create_leaderboard(con, "models")],
 
 
374
  ),
375
  ],
376
  ),
 
382
  "marginBottom": "64px",
383
  "marginLeft": "50px",
384
  "marginRight": "50px",
 
385
  },
386
  ),
387
  ],
 
394
  ],
395
  )
396
 
397
+
398
  # Callbacks for interactivity
399
  # -- helper utilities to consolidate duplicated callback logic --
400
+ def _get_filtered_top_n_from_duckdb(slider_value, group_col, top_n):
401
+ """
402
+ Query DuckDB directly to get top N entries with metadata
403
+ This minimizes data transfer by doing aggregation in DuckDB
404
+ """
405
+ # Build time filter clause
406
+ time_clause = ""
407
  if slider_value and len(slider_value) == 2:
408
  start = pd.to_datetime(slider_value[0], unit="s")
409
  end = pd.to_datetime(slider_value[1], unit="s")
410
+ time_clause = f"WHERE time >= '{start}' AND time <= '{end}'"
411
+
412
+ # Build the aggregation query to get top N with all needed metadata
413
+ # This query groups by the target column and aggregates downloads
414
+ # while collecting all metadata we need for chips
415
+ query = f"""
416
+ WITH base_data AS (
417
+ SELECT
418
+ {group_col},
419
+ CASE
420
+ WHEN org_country_single = 'HF' THEN 'United States of America'
421
+ WHEN org_country_single = 'International' THEN 'International/Online'
422
+ WHEN org_country_single = 'Online' THEN 'International/Online'
423
+ ELSE org_country_single
424
+ END AS org_country_single,
425
+ author,
426
+ merged_country_groups_single,
427
+ merged_modality,
428
+ downloads,
429
+ estimated_parameters,
430
+ model
431
+ FROM filtered_df
432
+ {time_clause}
433
+ ),
434
+
435
+ -- Compute the total downloads for all rows in the time range
436
+ total_downloads_cte AS (
437
+ SELECT SUM(downloads) AS total_downloads_all
438
+ FROM base_data
439
+ ),
440
 
441
+ -- Compute per-group totals and their percentage of all downloads
442
+ top_items AS (
443
+ SELECT
444
+ b.{group_col} AS name,
445
+ SUM(b.downloads) AS total_downloads,
446
+ ROUND(SUM(b.downloads) * 100.0 / t.total_downloads_all, 2) AS percent_of_total,
447
+ -- Pick first non-null metadata values for reference
448
+ ANY_VALUE(b.org_country_single) AS org_country_single,
449
+ ANY_VALUE(b.author) AS author,
450
+ ANY_VALUE(b.merged_country_groups_single) AS merged_country_groups_single,
451
+ ANY_VALUE(b.merged_modality) AS merged_modality,
452
+ ANY_VALUE(b.model) AS model
453
+ FROM base_data b
454
+ CROSS JOIN total_downloads_cte t
455
+ GROUP BY b.{group_col}, t.total_downloads_all
456
+ )
457
+
458
+ SELECT *
459
+ FROM top_items
460
+ ORDER BY total_downloads DESC
461
+ LIMIT {top_n};
462
+ """
463
+
464
+ print("Executing DuckDB query for filtered top N:")
465
+ print(query) # Print the query for debugging
466
+
467
+ return con.execute(query).fetchdf()
468
+
469
+
470
+ def _leaderboard_callback_logic(
471
+ n_clicks,
472
+ slider_value,
473
+ current_label,
474
+ group_col,
475
+ filename,
476
+ default_label="▼ Show Top 50",
477
+ chip_color="#F0F9FF",
478
+ ):
479
  # Normalize label on first load
480
  if current_label is None:
481
  current_label = default_label
 
491
  else:
492
  top_n, new_label = 10, "▼ Show Top 50"
493
 
494
+ # Get filtered and aggregated data directly from DuckDB
495
+ df_filtered = _get_filtered_top_n_from_duckdb(slider_value, group_col, top_n)
496
+ print("CALLBACK LOGIC - Filtered DataFrame:")
497
+ print(df_filtered.head()) # Print first 5 rows for debugging
498
+
499
+ # Process the already-filtered data
500
+ df, download_df = get_top_n_leaderboard(df_filtered, group_col, top_n)
501
+ return render_table_content(
502
+ df, download_df, chip_color=chip_color, filename=filename
503
+ ), new_label
504
+
505
+
506
  # -- end helpers --
507
 
 
508
 
509
  # Callbacks for interactivity (modularized)
510
  @app.callback(
 
525
  chip_color="#F0F9FF",
526
  )
527
 
528
+
529
  @app.callback(
530
  Output("top_developers-table", "children"),
531
  Output("top_developers-toggle", "children"),
 
544
  chip_color="#F0F9FF",
545
  )
546
 
547
+
548
  @app.callback(
549
  Output("top_models-table", "children"),
550
  Output("top_models-toggle", "children"),
 
563
  chip_color="#F0F9FF",
564
  )
565
 
566
+
567
+ @app.callback(Output("time-slider", "label"), Input("time-slider", "value"))
 
 
568
  def update_range_labels(values):
569
  start_label = pd.to_datetime(values[0], unit="s").strftime("%b %Y")
570
  end_label = pd.to_datetime(values[1], unit="s").strftime("%b %Y")
graphs/leaderboard.py CHANGED
@@ -47,6 +47,33 @@ country_icon_map = {
47
  "Switzerland": "🇨🇭",
48
  "User": "👤",
49
  "International/Online": "🌐",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  }
51
 
52
  company_icon_map = {
@@ -65,7 +92,7 @@ meta_cols_map = {
65
  "author",
66
  "merged_country_groups_single",
67
  "merged_modality",
68
- "downloads",
69
  ],
70
  }
71
 
@@ -370,34 +397,49 @@ def render_table(
370
  )
371
 
372
 
373
- # Function to get top N leaderboard
374
  def get_top_n_leaderboard(filtered_df, group_col, top_n=10):
 
 
 
 
 
 
 
 
 
 
 
 
 
375
  top = (
376
- filtered_df.groupby(group_col)["downloads"]
377
  .sum()
378
- .nlargest(top_n)
379
  .reset_index()
380
- .rename(columns={group_col: "Name", "downloads": "Total Value"})
381
  )
382
- total_value = top["Total Value"].sum()
383
- top["% of total"] = top["Total Value"] / total_value * 100 if total_value else 0
384
 
385
  # Create a downloadable version of the leaderboard
386
  download_top = top.copy()
387
  download_top["Total Value"] = download_top["Total Value"].astype(int)
388
  download_top["% of total"] = download_top["% of total"].round(2)
389
 
390
- top["Name"].replace("User", "user")
 
391
 
392
  # All relevant metadata columns
393
  meta_cols = meta_cols_map.get(group_col, [])
 
394
  # Collect all metadata per top n for each category (country, author, model)
395
  meta_map = {}
396
  download_map = {}
 
397
  for name in top["Name"]:
398
  name_data = filtered_df[filtered_df[group_col] == name]
399
  meta_map[name] = {}
400
  download_map[name] = {}
 
401
  for col in meta_cols:
402
  if col in name_data.columns:
403
  unique_vals = name_data[col].unique()
@@ -408,13 +450,15 @@ def get_top_n_leaderboard(filtered_df, group_col, top_n=10):
408
  def build_metadata(nm):
409
  meta = meta_map.get(nm, {})
410
  chips = []
 
411
  # Countries
412
  for c in meta.get("org_country_single", []):
413
  if c == "United States of America":
414
  c = "USA"
415
  if c == "user":
416
  c = "User"
417
- chips.append((country_icon_map.get(c, ""), c))
 
418
  # Author
419
  for a in meta.get("author", []):
420
  icon = company_icon_map.get(a, "")
@@ -424,21 +468,22 @@ def get_top_n_leaderboard(filtered_df, group_col, top_n=10):
424
  else:
425
  icon = "👤"
426
  chips.append((icon, a))
 
427
  # Downloads
428
- # Sum downloads if multiple entries
429
  total_downloads = sum(
430
- d for d in meta.get("downloads", []) if pd.notna(d)
431
- ) # Check if d is not NaN
432
  if total_downloads:
433
  chips.append(("⬇️", f"{int(total_downloads):,}"))
434
 
435
  # Modality
436
  for m in meta.get("merged_modality", []):
437
- chips.append(("", m))
 
438
 
439
  # Estimated Parameters
440
  for p in meta.get("estimated_parameters", []):
441
- if pd.notna(p): # Check if p is not NaN
442
  if p >= 1e9:
443
  p_str = f"{p / 1e9:.1f}B"
444
  elif p >= 1e6:
@@ -446,28 +491,32 @@ def get_top_n_leaderboard(filtered_df, group_col, top_n=10):
446
  elif p >= 1e3:
447
  p_str = f"{p / 1e3:.1f}K"
448
  else:
449
- p_str = str(p)
450
  chips.append(("⚙️", p_str))
 
451
  return chips
452
 
453
- # Function to create downloadable dataframe
454
  def build_download_metadata(nm):
455
  meta = download_map.get(nm, {})
456
  download_info = {}
 
457
  for col in meta_cols:
458
- # don't add empty columns
459
  if col not in meta or not meta[col]:
460
  continue
 
461
  vals = meta.get(col, [])
462
  if vals:
463
- # Join list into a single string for CSV
464
- download_info[col] = ", ".join(str(v) for v in vals)
465
  else:
466
  download_info[col] = ""
 
467
  return download_info
468
 
469
  # Apply metadata builder to top dataframe
470
  top["Metadata"] = top["Name"].astype(object).apply(build_metadata)
 
 
471
  download_info_list = [build_download_metadata(nm) for nm in download_top["Name"]]
472
  download_info_df = pd.DataFrame(download_info_list)
473
  download_top = pd.concat([download_top, download_info_df], axis=1)
@@ -475,52 +524,138 @@ def get_top_n_leaderboard(filtered_df, group_col, top_n=10):
475
  return top[["Name", "Metadata", "% of total"]], download_top
476
 
477
 
478
- def create_leaderboard(filtered_df, board_type, top_n=10):
479
- if filtered_df.empty:
480
- return html.Div("No data in selected range")
481
-
482
- # Merge HF and USA
483
- filtered_df["org_country_single"] = filtered_df["org_country_single"].replace(
484
- {"HF": "United States of America"}
485
- )
486
- # Merge International and Online
487
- filtered_df["org_country_single"] = filtered_df["org_country_single"].replace(
488
- {"International": "International/Online", "Online": "International/Online"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
489
  )
490
 
491
- # Build leaderboards
492
- top_countries, download_top_countries = get_top_n_leaderboard(
493
- filtered_df, "org_country_single", top_n
494
- )
495
- top_developers, download_top_developers = get_top_n_leaderboard(
496
- filtered_df, "author", top_n
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
497
  )
498
- top_models, download_top_models = get_top_n_leaderboard(filtered_df, "model", top_n)
499
-
500
- if board_type == "countries":
501
- return render_table(
502
- top_countries,
503
- download_top_countries,
504
- "Top Countries",
505
- chip_color="#F0F9FF",
506
- bar_color="#082030",
507
- filename="top_countries",
508
- )
509
- elif board_type == "developers":
510
- return render_table(
511
- top_developers,
512
- download_top_developers,
513
- "Top Developers",
514
- chip_color="#F0F9FF",
515
- bar_color="#082030",
516
- filename="top_developers",
517
- )
518
- else:
519
- return render_table(
520
- top_models,
521
- download_top_models,
522
- "Top Models",
523
- chip_color="#F0F9FF",
524
- bar_color="#082030",
525
- filename="top_models",
526
- )
 
47
  "Switzerland": "🇨🇭",
48
  "User": "👤",
49
  "International/Online": "🌐",
50
+ "Spain": "🇪🇸",
51
+ "Sweden": "🇸🇪",
52
+ "Norway": "🇳🇴",
53
+ "Denmark": "🇩🇰",
54
+ "Austria": "🇦🇹",
55
+ "Belgium": "🇧🇪",
56
+ "Poland": "🇵🇱",
57
+ "Turkey": "🇹🇷",
58
+ "Mexico": "🇲🇽",
59
+ "Argentina": "🇦🇷",
60
+ "Thailand": "🇹🇭",
61
+ "Indonesia": "🇮🇩",
62
+ "Malaysia": "🇲🇾",
63
+ "Philippines": "🇵🇭",
64
+ "Egypt": "🇪🇬",
65
+ "South Africa": "🇿🇦",
66
+ "New Zealand": "🇳🇿",
67
+ "Ireland": "🇮🇪",
68
+ "Portugal": "🇵🇹",
69
+ "Greece": "🇬🇷",
70
+ "Czech Republic": "🇨🇿",
71
+ "Romania": "🇷🇴",
72
+ "Ukraine": "🇺🇦",
73
+ "United Arab Emirates": "🇦🇪",
74
+ "Saudi Arabia": "🇸🇦",
75
+ "Pakistan": "🇵🇰",
76
+ "Bangladesh": "🇧🇩",
77
  }
78
 
79
  company_icon_map = {
 
92
  "author",
93
  "merged_country_groups_single",
94
  "merged_modality",
95
+ "total_downloads",
96
  ],
97
  }
98
 
 
397
  )
398
 
399
 
400
+ # Function to get top N leaderboard (now accepts pandas DataFrame from DuckDB query)
401
  def get_top_n_leaderboard(filtered_df, group_col, top_n=10):
402
+ """
403
+ Get top N entries for a leaderboard
404
+
405
+ Args:
406
+ filtered_df: Pandas DataFrame (already filtered by time from DuckDB query)
407
+ group_col: Column to group by
408
+ top_n: Number of top entries to return
409
+
410
+ Returns:
411
+ tuple: (display_df, download_df)
412
+ """
413
+
414
+ # Group by and get top N
415
  top = (
416
+ filtered_df.groupby(group_col)[["total_downloads", "percent_of_total"]]
417
  .sum()
418
+ .nlargest(top_n, columns="total_downloads")
419
  .reset_index()
420
+ .rename(columns={group_col: "Name", "total_downloads": "Total Value", "percent_of_total": "% of total"})
421
  )
 
 
422
 
423
  # Create a downloadable version of the leaderboard
424
  download_top = top.copy()
425
  download_top["Total Value"] = download_top["Total Value"].astype(int)
426
  download_top["% of total"] = download_top["% of total"].round(2)
427
 
428
+ # Replace "User" in names
429
+ top["Name"] = top["Name"].replace("User", "user")
430
 
431
  # All relevant metadata columns
432
  meta_cols = meta_cols_map.get(group_col, [])
433
+
434
  # Collect all metadata per top n for each category (country, author, model)
435
  meta_map = {}
436
  download_map = {}
437
+
438
  for name in top["Name"]:
439
  name_data = filtered_df[filtered_df[group_col] == name]
440
  meta_map[name] = {}
441
  download_map[name] = {}
442
+
443
  for col in meta_cols:
444
  if col in name_data.columns:
445
  unique_vals = name_data[col].unique()
 
450
  def build_metadata(nm):
451
  meta = meta_map.get(nm, {})
452
  chips = []
453
+
454
  # Countries
455
  for c in meta.get("org_country_single", []):
456
  if c == "United States of America":
457
  c = "USA"
458
  if c == "user":
459
  c = "User"
460
+ chips.append((country_icon_map.get(c, "🌍"), c))
461
+
462
  # Author
463
  for a in meta.get("author", []):
464
  icon = company_icon_map.get(a, "")
 
468
  else:
469
  icon = "👤"
470
  chips.append((icon, a))
471
+
472
  # Downloads
 
473
  total_downloads = sum(
474
+ d for d in meta.get("total_downloads", []) if pd.notna(d)
475
+ )
476
  if total_downloads:
477
  chips.append(("⬇️", f"{int(total_downloads):,}"))
478
 
479
  # Modality
480
  for m in meta.get("merged_modality", []):
481
+ if pd.notna(m):
482
+ chips.append(("", m))
483
 
484
  # Estimated Parameters
485
  for p in meta.get("estimated_parameters", []):
486
+ if pd.notna(p):
487
  if p >= 1e9:
488
  p_str = f"{p / 1e9:.1f}B"
489
  elif p >= 1e6:
 
491
  elif p >= 1e3:
492
  p_str = f"{p / 1e3:.1f}K"
493
  else:
494
+ p_str = str(int(p))
495
  chips.append(("⚙️", p_str))
496
+
497
  return chips
498
 
499
+ # Function to create downloadable dataframe metadata
500
  def build_download_metadata(nm):
501
  meta = download_map.get(nm, {})
502
  download_info = {}
503
+
504
  for col in meta_cols:
 
505
  if col not in meta or not meta[col]:
506
  continue
507
+
508
  vals = meta.get(col, [])
509
  if vals:
510
+ download_info[col] = ", ".join(str(v) for v in vals if pd.notna(v))
 
511
  else:
512
  download_info[col] = ""
513
+
514
  return download_info
515
 
516
  # Apply metadata builder to top dataframe
517
  top["Metadata"] = top["Name"].astype(object).apply(build_metadata)
518
+
519
+ # Build download dataframe with metadata
520
  download_info_list = [build_download_metadata(nm) for nm in download_top["Name"]]
521
  download_info_df = pd.DataFrame(download_info_list)
522
  download_top = pd.concat([download_top, download_info_df], axis=1)
 
524
  return top[["Name", "Metadata", "% of total"]], download_top
525
 
526
 
527
+ def get_top_n_from_duckdb(con, group_col, top_n=10, time_filter=None):
528
+ """
529
+ Query DuckDB directly to get top N entries with minimal data transfer
530
+
531
+ Args:
532
+ con: DuckDB connection object
533
+ group_col: Column to group by
534
+ top_n: Number of top entries
535
+ time_filter: Optional tuple of (start_timestamp, end_timestamp)
536
+
537
+ Returns:
538
+ Pandas DataFrame with only the rows needed for top N
539
+ """
540
+ # Build time filter clause
541
+ time_clause = ""
542
+ if time_filter:
543
+ start = pd.to_datetime(time_filter[0], unit="s")
544
+ end = pd.to_datetime(time_filter[1], unit="s")
545
+ time_clause = f"WHERE time >= '{start}' AND time <= '{end}'"
546
+
547
+ # Optimized query: first find top N, then get only those rows
548
+ query = f"""
549
+ WITH base_data AS (
550
+ SELECT
551
+ {group_col},
552
+ CASE
553
+ WHEN org_country_single = 'HF' THEN 'United States of America'
554
+ WHEN org_country_single = 'International' THEN 'International/Online'
555
+ WHEN org_country_single = 'Online' THEN 'International/Online'
556
+ ELSE org_country_single
557
+ END AS org_country_single,
558
+ author,
559
+ merged_country_groups_single,
560
+ merged_modality,
561
+ downloads,
562
+ estimated_parameters,
563
+ model
564
+ FROM filtered_df
565
+ {time_clause}
566
+ ),
567
+
568
+ -- Compute the total downloads for all rows in the time range
569
+ total_downloads_cte AS (
570
+ SELECT SUM(downloads) AS total_downloads_all
571
+ FROM base_data
572
+ ),
573
+
574
+ -- Compute per-group totals and their percentage of all downloads
575
+ top_items AS (
576
+ SELECT
577
+ b.{group_col} AS name,
578
+ SUM(b.downloads) AS total_downloads,
579
+ ROUND(SUM(b.downloads) * 100.0 / t.total_downloads_all, 2) AS percent_of_total,
580
+ -- Pick first non-null metadata values for reference
581
+ ANY_VALUE(b.org_country_single) AS org_country_single,
582
+ ANY_VALUE(b.author) AS author,
583
+ ANY_VALUE(b.merged_country_groups_single) AS merged_country_groups_single,
584
+ ANY_VALUE(b.merged_modality) AS merged_modality,
585
+ ANY_VALUE(b.model) AS model
586
+ FROM base_data b
587
+ CROSS JOIN total_downloads_cte t
588
+ GROUP BY b.{group_col}, t.total_downloads_all
589
  )
590
 
591
+ SELECT *
592
+ FROM top_items
593
+ ORDER BY total_downloads DESC
594
+ LIMIT {top_n};
595
+ """
596
+
597
+ print("Executing DuckDB query:")
598
+ print(query) # Print the query for debugging
599
+
600
+ try:
601
+ return con.execute(query).fetchdf()
602
+ except Exception as e:
603
+ print(f"Error querying DuckDB: {e}")
604
+ return pd.DataFrame()
605
+
606
+
607
+ def create_leaderboard(con, board_type, top_n=10):
608
+ """
609
+ Create leaderboard using DuckDB connection with optimized queries
610
+
611
+ Args:
612
+ con: DuckDB connection object
613
+ board_type: Type of leaderboard ('countries', 'developers', 'models')
614
+ top_n: Number of top entries to display
615
+
616
+ Returns:
617
+ Dash HTML component with the leaderboard table
618
+ """
619
+ # Map board type to column name
620
+ column_map = {
621
+ "countries": "org_country_single",
622
+ "developers": "author",
623
+ "models": "model"
624
+ }
625
+
626
+ title_map = {
627
+ "countries": "Top Countries",
628
+ "developers": "Top Developers",
629
+ "models": "Top Models"
630
+ }
631
+
632
+ filename_map = {
633
+ "countries": "top_countries",
634
+ "developers": "top_developers",
635
+ "models": "top_models"
636
+ }
637
+
638
+ group_col = column_map.get(board_type)
639
+ if not group_col:
640
+ return html.Div(f"Unknown board type: {board_type}")
641
+
642
+ # Get only the top N rows from DuckDB
643
+ filtered_df = get_top_n_from_duckdb(con, group_col, top_n)
644
+
645
+ if filtered_df.empty:
646
+ return html.Div("No data available")
647
+
648
+ # Process the already-filtered data
649
+ top_data, download_data = get_top_n_leaderboard(filtered_df, group_col, top_n)
650
+
651
+ print(f"Creating leaderboard for {board_type} with top {top_n} entries.")
652
+ print(top_data[0:5]) # Print first 5 rows for debugging
653
+
654
+ return render_table(
655
+ top_data,
656
+ download_data,
657
+ title_map[board_type],
658
+ chip_color="#F0F9FF",
659
+ bar_color="#082030",
660
+ filename=filename_map[board_type],
661
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -6,4 +6,5 @@ dash-mantine-components
6
  dash-bootstrap-components
7
  pyarrow
8
  dash-iconify
9
- datasets
 
 
6
  dash-bootstrap-components
7
  pyarrow
8
  dash-iconify
9
+ datasets
10
+ duckdb