emsesc committed on
Commit
6ba1ddc
·
1 Parent(s): 5810b5b

buggy duckdb

Browse files
Files changed (3) hide show
  1. app.py +132 -51
  2. graphs/leaderboard.py +200 -65
  3. requirements.txt +2 -1
app.py CHANGED
@@ -1,7 +1,7 @@
1
  from dash import Dash, html, dcc, Input, Output, State
2
  import pandas as pd
3
  import dash_mantine_components as dmc
4
- from datasets import load_dataset
5
  import time
6
  from graphs.leaderboard import (
7
  create_leaderboard,
@@ -13,34 +13,48 @@ from graphs.leaderboard import (
13
  app = Dash()
14
  server = app.server
15
 
16
- # Load parquet file from Hugging Face
 
 
 
17
  HF_DATASET_ID = "emsesc/open_model_evolution_data"
18
- hf_parquet_url = "https://huggingface.co/datasets/emsesc/open_model_evolution_data/resolve/main/"
19
- data_files = {
20
- "filtered_df": hf_parquet_url + "filtered_df.parquet",
21
- # "weekly_df": hf_parquet_url + "weekly_df.parquet",
22
- }
23
- filtered_df = pd.DataFrame()
24
-
25
- print(f"Attempting to load dataset from Hugging Face Hub: {HF_DATASET_ID}")
26
  try:
27
  overall_start_time = time.time()
28
- dataset = load_dataset("parquet", data_files=data_files)
29
- df = dataset["filtered_df"].to_pandas()
30
- filtered_df = df.copy()
31
 
32
- msg = f"Successfully loaded dataset in {time.time() - overall_start_time:.2f}s."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  print(msg)
34
  except Exception as e:
35
  err_msg = f"Failed to load dataset. Error: {e}"
36
  print(err_msg)
37
-
38
- # List columns for reference
39
- print(filtered_df.columns.tolist())
40
 
41
  # Create a dcc slider for time range selection by year (readable marks)
42
- start_dt = filtered_df["time"].min()
43
- end_dt = filtered_df["time"].max()
44
  start_ts = int(start_dt.timestamp())
45
  end_ts = int(end_dt.timestamp())
46
 
@@ -72,10 +86,6 @@ time_slider = dmc.RangeSlider(
72
  marks=marks,
73
  style={"width": "70%", "margin": "0 auto"},
74
  labelAlwaysOn=False,
75
- # thumbChildren=[
76
- # dmc.Text(id="time-slider-thumb-from-label", size="xs", children="Hello"),
77
- # dmc.Text(id="time-slider-thumb-to-label", size="xs"),
78
- # ]
79
  )
80
 
81
  # App layout
@@ -191,7 +201,7 @@ app.layout = dmc.MantineProvider(
191
  # Intro / description below header (kept but styled to match layout)
192
  # Title
193
  html.Div(
194
- children="Model Leaderboard", # Change this to your desired title
195
  style={
196
  "fontSize": 40,
197
  "fontWeight": "700",
@@ -204,7 +214,7 @@ app.layout = dmc.MantineProvider(
204
  html.Div(
205
  children=[
206
  html.Button(
207
- "Read the paper", # Change this to your desired button text
208
  id="my-button",
209
  style={
210
  "padding": "10px 20px",
@@ -293,7 +303,6 @@ app.layout = dmc.MantineProvider(
293
  "gap": "24px",
294
  "padding": "32px",
295
  "alignItems": "flex-start",
296
- # 'margin': '24px auto 64px', # centered horizontally
297
  "marginLeft": "100px",
298
  "marginRight": "100px",
299
  "backgroundColor": "#FFFBF9",
@@ -305,7 +314,7 @@ app.layout = dmc.MantineProvider(
305
  dcc.Tabs(
306
  id="leaderboard-tabs",
307
  value="Countries",
308
- children=[ # wrap Tabs here
309
  dcc.Tab(
310
  label="Countries",
311
  value="Countries",
@@ -321,11 +330,9 @@ app.layout = dmc.MantineProvider(
321
  "border": "none",
322
  "padding": "10px 18px",
323
  "fontWeight": "700",
324
- "borderBottom": "3px solid #082030", # underline only
325
  },
326
- children=[
327
- create_leaderboard(filtered_df, "countries")
328
- ],
329
  ),
330
  dcc.Tab(
331
  label="Developers",
@@ -344,9 +351,7 @@ app.layout = dmc.MantineProvider(
344
  "fontWeight": "700",
345
  "borderBottom": "3px solid #082030",
346
  },
347
- children=[
348
- create_leaderboard(filtered_df, "developers")
349
- ],
350
  ),
351
  dcc.Tab(
352
  label="Models",
@@ -365,9 +370,7 @@ app.layout = dmc.MantineProvider(
365
  "fontWeight": "700",
366
  "borderBottom": "3px solid #082030",
367
  },
368
- children=[
369
- create_leaderboard(filtered_df, "models")
370
- ],
371
  ),
372
  ],
373
  ),
@@ -379,7 +382,6 @@ app.layout = dmc.MantineProvider(
379
  "marginBottom": "64px",
380
  "marginLeft": "50px",
381
  "marginRight": "50px",
382
- # 'maxWidth': '1250px',
383
  },
384
  ),
385
  ],
@@ -392,16 +394,88 @@ app.layout = dmc.MantineProvider(
392
  ],
393
  )
394
 
 
395
  # Callbacks for interactivity
396
  # -- helper utilities to consolidate duplicated callback logic --
397
- def _apply_time_slider(slider_value):
 
 
 
 
 
 
398
  if slider_value and len(slider_value) == 2:
399
  start = pd.to_datetime(slider_value[0], unit="s")
400
  end = pd.to_datetime(slider_value[1], unit="s")
401
- return filtered_df[(filtered_df["time"] >= start) & (filtered_df["time"] <= end)]
402
- return filtered_df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
403
 
404
- def _leaderboard_callback_logic(n_clicks, slider_value, current_label, group_col, filename, default_label="▼ Show Top 50", chip_color="#F0F9FF"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
405
  # Normalize label on first load
406
  if current_label is None:
407
  current_label = default_label
@@ -417,13 +491,20 @@ def _leaderboard_callback_logic(n_clicks, slider_value, current_label, group_col
417
  else:
418
  top_n, new_label = 10, "▼ Show Top 50"
419
 
420
- # Apply time filter and build table
421
- df_time = _apply_time_slider(slider_value)
422
- df, download_df = get_top_n_leaderboard(df_time, group_col, top_n)
423
- return render_table_content(df, download_df, chip_color=chip_color, filename=filename), new_label
 
 
 
 
 
 
 
 
424
  # -- end helpers --
425
 
426
- # ...existing code...
427
 
428
  # Callbacks for interactivity (modularized)
429
  @app.callback(
@@ -444,6 +525,7 @@ def update_top_countries(n_clicks, slider_value, current_label):
444
  chip_color="#F0F9FF",
445
  )
446
 
 
447
  @app.callback(
448
  Output("top_developers-table", "children"),
449
  Output("top_developers-toggle", "children"),
@@ -462,6 +544,7 @@ def update_top_developers(n_clicks, slider_value, current_label):
462
  chip_color="#F0F9FF",
463
  )
464
 
 
465
  @app.callback(
466
  Output("top_models-table", "children"),
467
  Output("top_models-toggle", "children"),
@@ -480,10 +563,8 @@ def update_top_models(n_clicks, slider_value, current_label):
480
  chip_color="#F0F9FF",
481
  )
482
 
483
- @app.callback(
484
- Output("time-slider", "label"),
485
- Input("time-slider", "value")
486
- )
487
  def update_range_labels(values):
488
  start_label = pd.to_datetime(values[0], unit="s").strftime("%b %Y")
489
  end_label = pd.to_datetime(values[1], unit="s").strftime("%b %Y")
 
1
  from dash import Dash, html, dcc, Input, Output, State
2
  import pandas as pd
3
  import dash_mantine_components as dmc
4
+ import duckdb
5
  import time
6
  from graphs.leaderboard import (
7
  create_leaderboard,
 
13
  app = Dash()
14
  server = app.server
15
 
16
+ # DuckDB connection (global)
17
+ con = duckdb.connect(database=":memory:", read_only=False)
18
+
19
+ # Load parquet file from Hugging Face using DuckDB
20
  HF_DATASET_ID = "emsesc/open_model_evolution_data"
21
+ hf_parquet_url = "https://huggingface.co/datasets/emsesc/open_model_evolution_data/resolve/main/filtered_df.parquet"
22
+
23
+ print(f"Attempting to connect to dataset from Hugging Face Hub: {HF_DATASET_ID}")
 
 
 
 
 
24
  try:
25
  overall_start_time = time.time()
 
 
 
26
 
27
+ # Install and load httpfs extension for remote file access
28
+ con.execute("INSTALL httpfs;")
29
+ con.execute("LOAD httpfs;")
30
+
31
+ # Create a view that references the remote parquet file
32
+ con.execute(f"""
33
+ CREATE OR REPLACE VIEW filtered_df AS
34
+ SELECT * FROM read_parquet('{hf_parquet_url}')
35
+ """)
36
+
37
+ # Get column list and basic info
38
+ columns = con.execute("DESCRIBE filtered_df").fetchdf()
39
+ print("Columns:", columns["column_name"].tolist())
40
+
41
+ # Get time range for slider
42
+ time_range = con.execute(
43
+ "SELECT MIN(time) as min_time, MAX(time) as max_time FROM filtered_df"
44
+ ).fetchdf()
45
+ start_dt = pd.to_datetime(time_range["min_time"].iloc[0])
46
+ end_dt = pd.to_datetime(time_range["max_time"].iloc[0])
47
+
48
+ msg = (
49
+ f"Successfully connected to dataset in {time.time() - overall_start_time:.2f}s."
50
+ )
51
  print(msg)
52
  except Exception as e:
53
  err_msg = f"Failed to load dataset. Error: {e}"
54
  print(err_msg)
55
+ raise
 
 
56
 
57
  # Create a dcc slider for time range selection by year (readable marks)
 
 
58
  start_ts = int(start_dt.timestamp())
59
  end_ts = int(end_dt.timestamp())
60
 
 
86
  marks=marks,
87
  style={"width": "70%", "margin": "0 auto"},
88
  labelAlwaysOn=False,
 
 
 
 
89
  )
90
 
91
  # App layout
 
201
  # Intro / description below header (kept but styled to match layout)
202
  # Title
203
  html.Div(
204
+ children="Model Leaderboard",
205
  style={
206
  "fontSize": 40,
207
  "fontWeight": "700",
 
214
  html.Div(
215
  children=[
216
  html.Button(
217
+ "Read the paper",
218
  id="my-button",
219
  style={
220
  "padding": "10px 20px",
 
303
  "gap": "24px",
304
  "padding": "32px",
305
  "alignItems": "flex-start",
 
306
  "marginLeft": "100px",
307
  "marginRight": "100px",
308
  "backgroundColor": "#FFFBF9",
 
314
  dcc.Tabs(
315
  id="leaderboard-tabs",
316
  value="Countries",
317
+ children=[
318
  dcc.Tab(
319
  label="Countries",
320
  value="Countries",
 
330
  "border": "none",
331
  "padding": "10px 18px",
332
  "fontWeight": "700",
333
+ "borderBottom": "3px solid #082030",
334
  },
335
+ children=[create_leaderboard(con, "countries")],
 
 
336
  ),
337
  dcc.Tab(
338
  label="Developers",
 
351
  "fontWeight": "700",
352
  "borderBottom": "3px solid #082030",
353
  },
354
+ children=[create_leaderboard(con, "developers")],
 
 
355
  ),
356
  dcc.Tab(
357
  label="Models",
 
370
  "fontWeight": "700",
371
  "borderBottom": "3px solid #082030",
372
  },
373
+ children=[create_leaderboard(con, "models")],
 
 
374
  ),
375
  ],
376
  ),
 
382
  "marginBottom": "64px",
383
  "marginLeft": "50px",
384
  "marginRight": "50px",
 
385
  },
386
  ),
387
  ],
 
394
  ],
395
  )
396
 
397
+
398
  # Callbacks for interactivity
399
  # -- helper utilities to consolidate duplicated callback logic --
400
+ def _get_filtered_top_n_from_duckdb(slider_value, group_col, top_n):
401
+ """
402
+ Query DuckDB directly to get top N entries with metadata
403
+ This minimizes data transfer by doing aggregation in DuckDB
404
+ """
405
+ # Build time filter clause
406
+ time_clause = ""
407
  if slider_value and len(slider_value) == 2:
408
  start = pd.to_datetime(slider_value[0], unit="s")
409
  end = pd.to_datetime(slider_value[1], unit="s")
410
+ time_clause = f"WHERE time >= '{start}' AND time <= '{end}'"
411
+
412
+ # Build the aggregation query to get top N with all needed metadata
413
+ # This query groups by the target column and aggregates downloads
414
+ # while collecting all metadata we need for chips
415
+ query = f"""
416
+ WITH base_data AS (
417
+ SELECT
418
+ {group_col},
419
+ CASE
420
+ WHEN org_country_single = 'HF' THEN 'United States of America'
421
+ WHEN org_country_single = 'International' THEN 'International/Online'
422
+ WHEN org_country_single = 'Online' THEN 'International/Online'
423
+ ELSE org_country_single
424
+ END AS org_country_single,
425
+ author,
426
+ merged_country_groups_single,
427
+ merged_modality,
428
+ downloads,
429
+ estimated_parameters,
430
+ model
431
+ FROM filtered_df
432
+ {time_clause}
433
+ ),
434
+
435
+ -- Compute the total downloads for all rows in the time range
436
+ total_downloads_cte AS (
437
+ SELECT SUM(downloads) AS total_downloads_all
438
+ FROM base_data
439
+ ),
440
 
441
+ -- Compute per-group totals and their percentage of all downloads
442
+ top_items AS (
443
+ SELECT
444
+ b.{group_col} AS name,
445
+ SUM(b.downloads) AS total_downloads,
446
+ ROUND(SUM(b.downloads) * 100.0 / t.total_downloads_all, 2) AS percent_of_total,
447
+ -- Pick first non-null metadata values for reference
448
+ ANY_VALUE(b.org_country_single) AS org_country_single,
449
+ ANY_VALUE(b.author) AS author,
450
+ ANY_VALUE(b.merged_country_groups_single) AS merged_country_groups_single,
451
+ ANY_VALUE(b.merged_modality) AS merged_modality,
452
+ ANY_VALUE(b.model) AS model
453
+ FROM base_data b
454
+ CROSS JOIN total_downloads_cte t
455
+ GROUP BY b.{group_col}, t.total_downloads_all
456
+ )
457
+
458
+ SELECT *
459
+ FROM top_items
460
+ ORDER BY total_downloads DESC
461
+ LIMIT {top_n};
462
+ """
463
+
464
+ print("Executing DuckDB query for filtered top N:")
465
+ print(query) # Print the query for debugging
466
+
467
+ return con.execute(query).fetchdf()
468
+
469
+
470
+ def _leaderboard_callback_logic(
471
+ n_clicks,
472
+ slider_value,
473
+ current_label,
474
+ group_col,
475
+ filename,
476
+ default_label="▼ Show Top 50",
477
+ chip_color="#F0F9FF",
478
+ ):
479
  # Normalize label on first load
480
  if current_label is None:
481
  current_label = default_label
 
491
  else:
492
  top_n, new_label = 10, "▼ Show Top 50"
493
 
494
+ # Get filtered and aggregated data directly from DuckDB
495
+ df_filtered = _get_filtered_top_n_from_duckdb(slider_value, group_col, top_n)
496
+ print("CALLBACK LOGIC - Filtered DataFrame:")
497
+ print(df_filtered.head()) # Print first 5 rows for debugging
498
+
499
+ # Process the already-filtered data
500
+ df, download_df = get_top_n_leaderboard(df_filtered, group_col, top_n)
501
+ return render_table_content(
502
+ df, download_df, chip_color=chip_color, filename=filename
503
+ ), new_label
504
+
505
+
506
  # -- end helpers --
507
 
 
508
 
509
  # Callbacks for interactivity (modularized)
510
  @app.callback(
 
525
  chip_color="#F0F9FF",
526
  )
527
 
528
+
529
  @app.callback(
530
  Output("top_developers-table", "children"),
531
  Output("top_developers-toggle", "children"),
 
544
  chip_color="#F0F9FF",
545
  )
546
 
547
+
548
  @app.callback(
549
  Output("top_models-table", "children"),
550
  Output("top_models-toggle", "children"),
 
563
  chip_color="#F0F9FF",
564
  )
565
 
566
+
567
+ @app.callback(Output("time-slider", "label"), Input("time-slider", "value"))
 
 
568
  def update_range_labels(values):
569
  start_label = pd.to_datetime(values[0], unit="s").strftime("%b %Y")
570
  end_label = pd.to_datetime(values[1], unit="s").strftime("%b %Y")
graphs/leaderboard.py CHANGED
@@ -47,6 +47,33 @@ country_icon_map = {
47
  "Switzerland": "🇨🇭",
48
  "User": "👤",
49
  "International/Online": "🌐",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  }
51
 
52
  company_icon_map = {
@@ -65,7 +92,7 @@ meta_cols_map = {
65
  "author",
66
  "merged_country_groups_single",
67
  "merged_modality",
68
- "downloads",
69
  ],
70
  }
71
 
@@ -370,34 +397,49 @@ def render_table(
370
  )
371
 
372
 
373
- # Function to get top N leaderboard
374
  def get_top_n_leaderboard(filtered_df, group_col, top_n=10):
 
 
 
 
 
 
 
 
 
 
 
 
 
375
  top = (
376
- filtered_df.groupby(group_col)["downloads"]
377
  .sum()
378
- .nlargest(top_n)
379
  .reset_index()
380
- .rename(columns={group_col: "Name", "downloads": "Total Value"})
381
  )
382
- total_value = top["Total Value"].sum()
383
- top["% of total"] = top["Total Value"] / total_value * 100 if total_value else 0
384
 
385
  # Create a downloadable version of the leaderboard
386
  download_top = top.copy()
387
  download_top["Total Value"] = download_top["Total Value"].astype(int)
388
  download_top["% of total"] = download_top["% of total"].round(2)
389
 
390
- top["Name"].replace("User", "user")
 
391
 
392
  # All relevant metadata columns
393
  meta_cols = meta_cols_map.get(group_col, [])
 
394
  # Collect all metadata per top n for each category (country, author, model)
395
  meta_map = {}
396
  download_map = {}
 
397
  for name in top["Name"]:
398
  name_data = filtered_df[filtered_df[group_col] == name]
399
  meta_map[name] = {}
400
  download_map[name] = {}
 
401
  for col in meta_cols:
402
  if col in name_data.columns:
403
  unique_vals = name_data[col].unique()
@@ -408,13 +450,15 @@ def get_top_n_leaderboard(filtered_df, group_col, top_n=10):
408
  def build_metadata(nm):
409
  meta = meta_map.get(nm, {})
410
  chips = []
 
411
  # Countries
412
  for c in meta.get("org_country_single", []):
413
  if c == "United States of America":
414
  c = "USA"
415
  if c == "user":
416
  c = "User"
417
- chips.append((country_icon_map.get(c, ""), c))
 
418
  # Author
419
  for a in meta.get("author", []):
420
  icon = company_icon_map.get(a, "")
@@ -424,21 +468,22 @@ def get_top_n_leaderboard(filtered_df, group_col, top_n=10):
424
  else:
425
  icon = "👤"
426
  chips.append((icon, a))
 
427
  # Downloads
428
- # Sum downloads if multiple entries
429
  total_downloads = sum(
430
- d for d in meta.get("downloads", []) if pd.notna(d)
431
- ) # Check if d is not NaN
432
  if total_downloads:
433
  chips.append(("⬇️", f"{int(total_downloads):,}"))
434
 
435
  # Modality
436
  for m in meta.get("merged_modality", []):
437
- chips.append(("", m))
 
438
 
439
  # Estimated Parameters
440
  for p in meta.get("estimated_parameters", []):
441
- if pd.notna(p): # Check if p is not NaN
442
  if p >= 1e9:
443
  p_str = f"{p / 1e9:.1f}B"
444
  elif p >= 1e6:
@@ -446,28 +491,32 @@ def get_top_n_leaderboard(filtered_df, group_col, top_n=10):
446
  elif p >= 1e3:
447
  p_str = f"{p / 1e3:.1f}K"
448
  else:
449
- p_str = str(p)
450
  chips.append(("⚙️", p_str))
 
451
  return chips
452
 
453
- # Function to create downloadable dataframe
454
  def build_download_metadata(nm):
455
  meta = download_map.get(nm, {})
456
  download_info = {}
 
457
  for col in meta_cols:
458
- # don't add empty columns
459
  if col not in meta or not meta[col]:
460
  continue
 
461
  vals = meta.get(col, [])
462
  if vals:
463
- # Join list into a single string for CSV
464
- download_info[col] = ", ".join(str(v) for v in vals)
465
  else:
466
  download_info[col] = ""
 
467
  return download_info
468
 
469
  # Apply metadata builder to top dataframe
470
  top["Metadata"] = top["Name"].astype(object).apply(build_metadata)
 
 
471
  download_info_list = [build_download_metadata(nm) for nm in download_top["Name"]]
472
  download_info_df = pd.DataFrame(download_info_list)
473
  download_top = pd.concat([download_top, download_info_df], axis=1)
@@ -475,52 +524,138 @@ def get_top_n_leaderboard(filtered_df, group_col, top_n=10):
475
  return top[["Name", "Metadata", "% of total"]], download_top
476
 
477
 
478
- def create_leaderboard(filtered_df, board_type, top_n=10):
479
- if filtered_df.empty:
480
- return html.Div("No data in selected range")
481
-
482
- # Merge HF and USA
483
- filtered_df["org_country_single"] = filtered_df["org_country_single"].replace(
484
- {"HF": "United States of America"}
485
- )
486
- # Merge International and Online
487
- filtered_df["org_country_single"] = filtered_df["org_country_single"].replace(
488
- {"International": "International/Online", "Online": "International/Online"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
489
  )
490
 
491
- # Build leaderboards
492
- top_countries, download_top_countries = get_top_n_leaderboard(
493
- filtered_df, "org_country_single", top_n
494
- )
495
- top_developers, download_top_developers = get_top_n_leaderboard(
496
- filtered_df, "author", top_n
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
497
  )
498
- top_models, download_top_models = get_top_n_leaderboard(filtered_df, "model", top_n)
499
-
500
- if board_type == "countries":
501
- return render_table(
502
- top_countries,
503
- download_top_countries,
504
- "Top Countries",
505
- chip_color="#F0F9FF",
506
- bar_color="#082030",
507
- filename="top_countries",
508
- )
509
- elif board_type == "developers":
510
- return render_table(
511
- top_developers,
512
- download_top_developers,
513
- "Top Developers",
514
- chip_color="#F0F9FF",
515
- bar_color="#082030",
516
- filename="top_developers",
517
- )
518
- else:
519
- return render_table(
520
- top_models,
521
- download_top_models,
522
- "Top Models",
523
- chip_color="#F0F9FF",
524
- bar_color="#082030",
525
- filename="top_models",
526
- )
 
47
  "Switzerland": "🇨🇭",
48
  "User": "👤",
49
  "International/Online": "🌐",
50
+ "Spain": "🇪🇸",
51
+ "Sweden": "🇸🇪",
52
+ "Norway": "🇳🇴",
53
+ "Denmark": "🇩🇰",
54
+ "Austria": "🇦🇹",
55
+ "Belgium": "🇧🇪",
56
+ "Poland": "🇵🇱",
57
+ "Turkey": "🇹🇷",
58
+ "Mexico": "🇲🇽",
59
+ "Argentina": "🇦🇷",
60
+ "Thailand": "🇹🇭",
61
+ "Indonesia": "🇮🇩",
62
+ "Malaysia": "🇲🇾",
63
+ "Philippines": "🇵🇭",
64
+ "Egypt": "🇪🇬",
65
+ "South Africa": "🇿🇦",
66
+ "New Zealand": "🇳🇿",
67
+ "Ireland": "🇮🇪",
68
+ "Portugal": "🇵🇹",
69
+ "Greece": "🇬🇷",
70
+ "Czech Republic": "🇨🇿",
71
+ "Romania": "🇷🇴",
72
+ "Ukraine": "🇺🇦",
73
+ "United Arab Emirates": "🇦🇪",
74
+ "Saudi Arabia": "🇸🇦",
75
+ "Pakistan": "🇵🇰",
76
+ "Bangladesh": "🇧🇩",
77
  }
78
 
79
  company_icon_map = {
 
92
  "author",
93
  "merged_country_groups_single",
94
  "merged_modality",
95
+ "total_downloads",
96
  ],
97
  }
98
 
 
397
  )
398
 
399
 
400
+ # Function to get top N leaderboard (now accepts pandas DataFrame from DuckDB query)
401
  def get_top_n_leaderboard(filtered_df, group_col, top_n=10):
402
+ """
403
+ Get top N entries for a leaderboard
404
+
405
+ Args:
406
+ filtered_df: Pandas DataFrame (already filtered by time from DuckDB query)
407
+ group_col: Column to group by
408
+ top_n: Number of top entries to return
409
+
410
+ Returns:
411
+ tuple: (display_df, download_df)
412
+ """
413
+
414
+ # Group by and get top N
415
  top = (
416
+ filtered_df.groupby(group_col)[["total_downloads", "percent_of_total"]]
417
  .sum()
418
+ .nlargest(top_n, columns="total_downloads")
419
  .reset_index()
420
+ .rename(columns={group_col: "Name", "total_downloads": "Total Value", "percent_of_total": "% of total"})
421
  )
 
 
422
 
423
  # Create a downloadable version of the leaderboard
424
  download_top = top.copy()
425
  download_top["Total Value"] = download_top["Total Value"].astype(int)
426
  download_top["% of total"] = download_top["% of total"].round(2)
427
 
428
+ # Replace "User" in names
429
+ top["Name"] = top["Name"].replace("User", "user")
430
 
431
  # All relevant metadata columns
432
  meta_cols = meta_cols_map.get(group_col, [])
433
+
434
  # Collect all metadata per top n for each category (country, author, model)
435
  meta_map = {}
436
  download_map = {}
437
+
438
  for name in top["Name"]:
439
  name_data = filtered_df[filtered_df[group_col] == name]
440
  meta_map[name] = {}
441
  download_map[name] = {}
442
+
443
  for col in meta_cols:
444
  if col in name_data.columns:
445
  unique_vals = name_data[col].unique()
 
450
  def build_metadata(nm):
451
  meta = meta_map.get(nm, {})
452
  chips = []
453
+
454
  # Countries
455
  for c in meta.get("org_country_single", []):
456
  if c == "United States of America":
457
  c = "USA"
458
  if c == "user":
459
  c = "User"
460
+ chips.append((country_icon_map.get(c, "🌍"), c))
461
+
462
  # Author
463
  for a in meta.get("author", []):
464
  icon = company_icon_map.get(a, "")
 
468
  else:
469
  icon = "👤"
470
  chips.append((icon, a))
471
+
472
  # Downloads
 
473
  total_downloads = sum(
474
+ d for d in meta.get("total_downloads", []) if pd.notna(d)
475
+ )
476
  if total_downloads:
477
  chips.append(("⬇️", f"{int(total_downloads):,}"))
478
 
479
  # Modality
480
  for m in meta.get("merged_modality", []):
481
+ if pd.notna(m):
482
+ chips.append(("", m))
483
 
484
  # Estimated Parameters
485
  for p in meta.get("estimated_parameters", []):
486
+ if pd.notna(p):
487
  if p >= 1e9:
488
  p_str = f"{p / 1e9:.1f}B"
489
  elif p >= 1e6:
 
491
  elif p >= 1e3:
492
  p_str = f"{p / 1e3:.1f}K"
493
  else:
494
+ p_str = str(int(p))
495
  chips.append(("⚙️", p_str))
496
+
497
  return chips
498
 
499
+ # Function to create downloadable dataframe metadata
500
  def build_download_metadata(nm):
501
  meta = download_map.get(nm, {})
502
  download_info = {}
503
+
504
  for col in meta_cols:
 
505
  if col not in meta or not meta[col]:
506
  continue
507
+
508
  vals = meta.get(col, [])
509
  if vals:
510
+ download_info[col] = ", ".join(str(v) for v in vals if pd.notna(v))
 
511
  else:
512
  download_info[col] = ""
513
+
514
  return download_info
515
 
516
  # Apply metadata builder to top dataframe
517
  top["Metadata"] = top["Name"].astype(object).apply(build_metadata)
518
+
519
+ # Build download dataframe with metadata
520
  download_info_list = [build_download_metadata(nm) for nm in download_top["Name"]]
521
  download_info_df = pd.DataFrame(download_info_list)
522
  download_top = pd.concat([download_top, download_info_df], axis=1)
 
524
  return top[["Name", "Metadata", "% of total"]], download_top
525
 
526
 
527
+ def get_top_n_from_duckdb(con, group_col, top_n=10, time_filter=None):
528
+ """
529
+ Query DuckDB directly to get top N entries with minimal data transfer
530
+
531
+ Args:
532
+ con: DuckDB connection object
533
+ group_col: Column to group by
534
+ top_n: Number of top entries
535
+ time_filter: Optional tuple of (start_timestamp, end_timestamp)
536
+
537
+ Returns:
538
+ Pandas DataFrame with only the rows needed for top N
539
+ """
540
+ # Build time filter clause
541
+ time_clause = ""
542
+ if time_filter:
543
+ start = pd.to_datetime(time_filter[0], unit="s")
544
+ end = pd.to_datetime(time_filter[1], unit="s")
545
+ time_clause = f"WHERE time >= '{start}' AND time <= '{end}'"
546
+
547
+ # Optimized query: first find top N, then get only those rows
548
+ query = f"""
549
+ WITH base_data AS (
550
+ SELECT
551
+ {group_col},
552
+ CASE
553
+ WHEN org_country_single = 'HF' THEN 'United States of America'
554
+ WHEN org_country_single = 'International' THEN 'International/Online'
555
+ WHEN org_country_single = 'Online' THEN 'International/Online'
556
+ ELSE org_country_single
557
+ END AS org_country_single,
558
+ author,
559
+ merged_country_groups_single,
560
+ merged_modality,
561
+ downloads,
562
+ estimated_parameters,
563
+ model
564
+ FROM filtered_df
565
+ {time_clause}
566
+ ),
567
+
568
+ -- Compute the total downloads for all rows in the time range
569
+ total_downloads_cte AS (
570
+ SELECT SUM(downloads) AS total_downloads_all
571
+ FROM base_data
572
+ ),
573
+
574
+ -- Compute per-group totals and their percentage of all downloads
575
+ top_items AS (
576
+ SELECT
577
+ b.{group_col} AS name,
578
+ SUM(b.downloads) AS total_downloads,
579
+ ROUND(SUM(b.downloads) * 100.0 / t.total_downloads_all, 2) AS percent_of_total,
580
+ -- Pick first non-null metadata values for reference
581
+ ANY_VALUE(b.org_country_single) AS org_country_single,
582
+ ANY_VALUE(b.author) AS author,
583
+ ANY_VALUE(b.merged_country_groups_single) AS merged_country_groups_single,
584
+ ANY_VALUE(b.merged_modality) AS merged_modality,
585
+ ANY_VALUE(b.model) AS model
586
+ FROM base_data b
587
+ CROSS JOIN total_downloads_cte t
588
+ GROUP BY b.{group_col}, t.total_downloads_all
589
  )
590
 
591
+ SELECT *
592
+ FROM top_items
593
+ ORDER BY total_downloads DESC
594
+ LIMIT {top_n};
595
+ """
596
+
597
+ print("Executing DuckDB query:")
598
+ print(query) # Print the query for debugging
599
+
600
+ try:
601
+ return con.execute(query).fetchdf()
602
+ except Exception as e:
603
+ print(f"Error querying DuckDB: {e}")
604
+ return pd.DataFrame()
605
+
606
+
607
+ def create_leaderboard(con, board_type, top_n=10):
608
+ """
609
+ Create leaderboard using DuckDB connection with optimized queries
610
+
611
+ Args:
612
+ con: DuckDB connection object
613
+ board_type: Type of leaderboard ('countries', 'developers', 'models')
614
+ top_n: Number of top entries to display
615
+
616
+ Returns:
617
+ Dash HTML component with the leaderboard table
618
+ """
619
+ # Map board type to column name
620
+ column_map = {
621
+ "countries": "org_country_single",
622
+ "developers": "author",
623
+ "models": "model"
624
+ }
625
+
626
+ title_map = {
627
+ "countries": "Top Countries",
628
+ "developers": "Top Developers",
629
+ "models": "Top Models"
630
+ }
631
+
632
+ filename_map = {
633
+ "countries": "top_countries",
634
+ "developers": "top_developers",
635
+ "models": "top_models"
636
+ }
637
+
638
+ group_col = column_map.get(board_type)
639
+ if not group_col:
640
+ return html.Div(f"Unknown board type: {board_type}")
641
+
642
+ # Get only the top N rows from DuckDB
643
+ filtered_df = get_top_n_from_duckdb(con, group_col, top_n)
644
+
645
+ if filtered_df.empty:
646
+ return html.Div("No data available")
647
+
648
+ # Process the already-filtered data
649
+ top_data, download_data = get_top_n_leaderboard(filtered_df, group_col, top_n)
650
+
651
+ print(f"Creating leaderboard for {board_type} with top {top_n} entries.")
652
+ print(top_data[0:5]) # Print first 5 rows for debugging
653
+
654
+ return render_table(
655
+ top_data,
656
+ download_data,
657
+ title_map[board_type],
658
+ chip_color="#F0F9FF",
659
+ bar_color="#082030",
660
+ filename=filename_map[board_type],
661
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -6,4 +6,5 @@ dash-mantine-components
6
  dash-bootstrap-components
7
  pyarrow
8
  dash-iconify
9
- datasets
 
 
6
  dash-bootstrap-components
7
  pyarrow
8
  dash-iconify
9
+ datasets
10
+ duckdb