import pandas as pd from dash import html, dcc from dash_iconify import DashIconify import dash_mantine_components as dmc import base64 import countryflag button_style = { "display": "inline-block", "marginBottom": "10px", "marginRight": "15px", "marginTop": "30px", "padding": "6px 16px", "backgroundColor": "#082030", "color": "white", "borderRadius": "6px", "textDecoration": "none", "fontWeight": "bold", "fontSize": "14px", } company_icon_map = { "google": "../assets/icons/google.png", "distilbert": "../assets/icons/hugging-face.png", "sentence-transformers": "../assets/icons/hugging-face.png", "facebook": "../assets/icons/meta.png", "openai": "../assets/icons/openai.png", } country_emoji_fallback = { "User": "👤", "Organization": "🏢", "Model": "📦", } meta_cols_map = { "org_country_single": ["org_country_single"], "author": ["org_country_single", "author", "merged_country_groups_single"], "derived_author": ["org_country_single", "derived_author", "merged_country_groups_single"], "model": [ "org_country_single", "author", "derived_author", "merged_country_groups_single", "merged_modality", "total_downloads", ], } # Chip renderer def chip(text, bg_color="#F0F0F0"): return html.Span( text, style={ "backgroundColor": bg_color, "padding": "4px 10px", "borderRadius": "12px", "margin": "2px", "display": "inline-flex", "alignItems": "center", "fontSize": "14px", }, ) # Progress bar for % of total def progress_bar(percent, bar_color="#AC482A"): return html.Div( style={ "position": "relative", "backgroundColor": "#E0E0E0", "borderRadius": "8px", "height": "20px", "width": "100%", "overflow": "hidden", }, children=[ html.Div( style={ "backgroundColor": bar_color, "width": f"{percent}%", "height": "100%", "borderRadius": "8px", "transition": "width 0.5s", } ), html.Div( f"{percent:.1f}%", style={ "position": "absolute", "top": 0, "left": "50%", "transform": "translateX(-50%)", "color": "black", "fontWeight": "bold", "fontSize": "12px", "lineHeight": "20px", "textAlign": "center", }, ), ], ) # Helper to convert DataFrame to CSV and encode for download def df_to_download_link(df, filename): csv_string = df.to_csv(index=False) b64 = base64.b64encode(csv_string.encode()).decode() return html.Div( html.A( children=dmc.ActionIcon( DashIconify(icon="mdi:download", width=24), size="lg", color="#082030", ), id=f"download-{filename}", download=f"{filename}.csv", href=f"data:text/csv;base64,{b64}", target="_blank", title="Download CSV", style={ "padding": "6px 12px", "display": "inline-flex", "alignItems": "center", "justifyContent": "center", }, ), style={"textAlign": "right"}, ) # Helper to get popover content for each metadata type def get_metadata_popover_content(icon, name, meta_type): popover_texts = { "country": f"Country: {name}", "author": f"Author/Organization: {name}", "downloads": f"Total downloads: {name}", "modality": f"Modality: {name}", "parameters": f"Estimated parameters: {name}", } return popover_texts.get(meta_type, name) # Chip renderer with hovercard def chip_with_hovercard(text, bg_color="#F0F0F0", meta_type=None, icon=None): hovercard_content = get_metadata_popover_content(icon, text, meta_type) return dmc.HoverCard( width=220, shadow="md", position="top", children=[ dmc.HoverCardTarget( html.Span( text, style={ "backgroundColor": bg_color, "padding": "4px 10px", "borderRadius": "12px", "margin": "2px", "display": "inline-flex", "alignItems": "center", "fontSize": "14px", "cursor": "pointer", }, ) ), dmc.HoverCardDropdown( dmc.Text(hovercard_content, size="sm") ), ], ) # Render multiple chips in one row, each with popover def render_chips(metadata_list, chip_color): chips = [] for icon, name, meta_type in metadata_list: if isinstance(icon, str) and icon.endswith((".png", ".jpg", ".jpeg", ".svg")): chips.append( dmc.HoverCard( width=220, shadow="md", position="top", children=[ dmc.HoverCardTarget( html.Span( [ html.Img( src=icon, style={"height": "18px", "marginRight": "6px"} ), name, ], style={ "backgroundColor": chip_color, "padding": "4px 10px", "borderRadius": "12px", "margin": "2px", "display": "inline-flex", "alignItems": "left", "fontSize": "14px", "cursor": "pointer", }, ) ), dmc.HoverCardDropdown( dmc.Text(get_metadata_popover_content(icon, name, meta_type), size="sm") ), ], ) ) else: chips.append(chip_with_hovercard(f"{icon} {name}", chip_color, meta_type, icon)) return html.Div( chips, style={"display": "flex", "flexWrap": "wrap", "justifyContent": "left"} ) def render_table_content( df, download_df, chip_color, bar_color="#AC482A", filename="data" ): return html.Div( [ # Add download button above the table df_to_download_link(download_df, filename), html.Table( [ html.Thead( html.Tr( [ html.Th( "Rank", style={ "backgroundColor": "#F0F0F0", "textAlign": "left", }, ), html.Th( "Name", style={ "backgroundColor": "#F0F0F0", "textAlign": "left", }, ), html.Th( "Metadata", style={ "backgroundColor": "#F0F0F0", "textAlign": "left", "marginRight": "10px", }, ), html.Th( "% of Total", style={ "backgroundColor": "#F0F0F0", "textAlign": "left", }, ), ] ) ), html.Tbody( [ html.Tr( [ html.Td(idx + 1, style={"textAlign": "center"}), html.Td(row["Name"], style={"textAlign": "left"}), html.Td(render_chips(row["Metadata"], chip_color)), html.Td( progress_bar(row["% of total"], bar_color), style={"textAlign": "center"}, ), ] ) for idx, row in df.iterrows() ] ), ], style={"borderCollapse": "collapse", "width": "100%"}, ), ] ) # Function to get top N leaderboard (now accepts pandas DataFrame from DuckDB query) def get_top_n_leaderboard(filtered_df, group_col, top_n=10, derived_author_toggle=True): """ Get top N entries for a leaderboard Args: filtered_df: Pandas DataFrame (already filtered by time from DuckDB query) group_col: Column to group by top_n: Number of top entries to return derived_author_toggle: Whether to use derived_author or author column Returns: tuple: (display_df, download_df) """ # Group by and get top N top = ( filtered_df.groupby(group_col)[["total_downloads", "percent_of_total"]] .sum() .nlargest(top_n, columns="total_downloads") .reset_index() .rename(columns={group_col: "Name", "total_downloads": "Total Value", "percent_of_total": "% of total"}) ) # Create a downloadable version of the leaderboard download_top = top.copy() download_top["Total Value"] = download_top["Total Value"].astype(int) download_top["% of total"] = download_top["% of total"].round(2) # Replace "User" in names top["Name"] = top["Name"].replace("User", "user") # All relevant metadata columns meta_cols = meta_cols_map.get(group_col, []) # Collect all metadata per top n for each category (country, author, model) meta_map = {} download_map = {} for name in top["Name"]: name_data = filtered_df[filtered_df[group_col] == name] meta_map[name] = {} download_map[name] = {} for col in meta_cols: if col in name_data.columns: unique_vals = name_data[col].unique() meta_map[name][col] = list(unique_vals) download_map[name][col] = list(unique_vals) # Function to build metadata chips def build_metadata(nm): meta = meta_map.get(nm, {}) chips = [] # Countries for c in meta.get("org_country_single", []): if c == "United States of America": c = "USA" if c == "user": c = "User" # Try countryflag.getflag(), fallback to dictionary if fails try: flag_emoji = countryflag.getflag(c) # If countryflag returns empty or None, fallback if not flag_emoji or flag_emoji == c: flag_emoji = country_emoji_fallback.get(c, "🌍") except Exception: flag_emoji = country_emoji_fallback.get(c, "🌍") chips.append((flag_emoji, c, "country")) # Author - use derived_author_toggle to determine which column author_key = "derived_author" if derived_author_toggle else "author" for a in meta.get(author_key, []): icon = company_icon_map.get(a, "") if icon == "": if meta.get("merged_country_groups_single", ["User"])[0] != "User": icon = "🏢" else: icon = "👤" chips.append((icon, a, "author")) # Downloads total_downloads = sum( d for d in meta.get("total_downloads", []) if pd.notna(d) ) if total_downloads: chips.append(("⬇️", f"{int(total_downloads):,}", "downloads")) # Modality for m in meta.get("merged_modality", []): if pd.notna(m): chips.append(("", m, "modality")) return chips # Function to create downloadable dataframe metadata def build_download_metadata(nm): meta = download_map.get(nm, {}) download_info = {} for col in meta_cols: if col not in meta or not meta[col]: continue vals = meta.get(col, []) if vals: download_info[col] = ", ".join(str(v) for v in vals if pd.notna(v)) else: download_info[col] = "" return download_info # Apply metadata builder to top dataframe top["Metadata"] = top["Name"].astype(object).apply(build_metadata) # Build download dataframe with metadata download_info_list = [build_download_metadata(nm) for nm in download_top["Name"]] download_info_df = pd.DataFrame(download_info_list) download_top = pd.concat([download_top, download_info_df], axis=1) return top[["Name", "Metadata", "% of total"]], download_top def get_top_n_from_duckdb(con, group_col, top_n=10, time_filter=None, view="all_downloads"): """ Query DuckDB directly to get top N entries with minimal data transfer Args: con: DuckDB connection object group_col: Column to group by top_n: Number of top entries time_filter: Optional tuple of (start_timestamp, end_timestamp) Returns: Pandas DataFrame with only the rows needed for top N """ # Build time filter clause time_clause = "" if time_filter: start = pd.to_datetime(time_filter[0], unit="s") end = pd.to_datetime(time_filter[1], unit="s") time_clause = f"WHERE time >= '{start}' AND time <= '{end}'" # Optimized query: first find top N, then get only those rows query = f""" WITH base_data AS ( SELECT {group_col}, CASE WHEN org_country_single IN ('HF', 'United States of America') THEN 'United States of America' WHEN org_country_single IN ('International', 'Online') THEN 'International/Online' ELSE org_country_single END AS org_country_single, author, derived_author, merged_country_groups_single, merged_modality, downloads, model FROM {view} {time_clause} ), -- Compute the total downloads for all rows in the time range total_downloads_cte AS ( SELECT SUM(downloads) AS total_downloads_all FROM base_data ), -- Compute per-group totals and their percentage of all downloads top_items AS ( SELECT b.{group_col} AS name, SUM(b.downloads) AS total_downloads, ROUND(SUM(b.downloads) * 100.0 / t.total_downloads_all, 2) AS percent_of_total, -- Pick first non-null metadata values for reference ANY_VALUE(b.org_country_single) AS org_country_single, ANY_VALUE(b.author) AS author, ANY_VALUE(b.derived_author) AS derived_author, ANY_VALUE(b.merged_country_groups_single) AS merged_country_groups_single, ANY_VALUE(b.merged_modality) AS merged_modality, ANY_VALUE(b.model) AS model FROM base_data b CROSS JOIN total_downloads_cte t GROUP BY b.{group_col}, t.total_downloads_all ) SELECT * FROM top_items ORDER BY total_downloads DESC LIMIT {top_n}; """ try: return con.execute(query).fetchdf() except Exception as e: print(f"Error querying DuckDB: {e}") return pd.DataFrame()