Spaces:

economies-open-ai
/

open-model-evolution

Running

File size: 21,377 Bytes

import pandas as pd
from dash import html, dcc
from dash_iconify import DashIconify
import dash_mantine_components as dmc
import base64
import countryflag

button_style = {
    "display": "inline-block",
    "marginBottom": "10px",
    "marginRight": "15px",
    "marginTop": "30px",
    "padding": "6px 16px",
    "backgroundColor": "#082030",
    "color": "white",
    "borderRadius": "6px",
    "textDecoration": "none",
    "fontWeight": "bold",
    "fontSize": "14px",
}

company_icon_map = {
    "google": "../assets/icons/google.png",
    "distilbert": "../assets/images/hf.svg",
    "sentence-transformers": "../assets/images/hf.svg",
    "facebook": "../assets/icons/meta.png",
    "openai": "../assets/icons/openai.png",
    "amazon": "../assets/icons/amazon.png",
    "microsoft": "../assets/icons/microsoft.png",
}

country_emoji_fallback = {
    "User": "👤",
    "Organization": "🏢",
    "Model": "📦",
}

meta_cols_map = {
    "org_country_single": ["org_country_single", "total_downloads"],
    "author": [
        "org_country_single",
        "author",
        "total_downloads",
    ],
    "derived_author": [
        "org_country_single",
        "derived_author",
        "total_downloads",
    ],
    "model": [
        "org_country_single",
        "author",
        "derived_author",
        "merged_modality",
        "total_downloads",
    ],
}


# Chip renderer
def chip(text, bg_color="#F0F0F0"):
    return html.Span(
        text,
        style={
            "backgroundColor": bg_color,
            "padding": "4px 10px",
            "borderRadius": "12px",
            "margin": "2px",
            "display": "inline-flex",
            "alignItems": "center",
            "fontSize": "14px",
        },
    )


# Progress bar for % of total
def progress_bar(percent, bar_color="#AC482A"):
    return html.Div(
        style={
            "position": "relative",
            "backgroundColor": "#E0E0E0",
            "borderRadius": "8px",
            "height": "20px",
            "width": "100%",
            "overflow": "hidden",
        },
        children=[
            html.Div(
                style={
                    "backgroundColor": bar_color,
                    "width": f"{percent}%",
                    "height": "100%",
                    "borderRadius": "8px",
                    "transition": "width 0.5s",
                }
            ),
            html.Div(
                f"{percent:.1f}%",
                style={
                    "position": "absolute",
                    "top": 0,
                    "left": "50%",
                    "transform": "translateX(-50%)",
                    "color": "black",
                    "fontWeight": "bold",
                    "fontSize": "12px",
                    "lineHeight": "20px",
                    "textAlign": "center",
                },
            ),
        ],
    )


# Helper to convert DataFrame to CSV and encode for download
def df_to_download_link(df, filename):
    csv_string = df.to_csv(index=False)
    b64 = base64.b64encode(csv_string.encode()).decode()
    return html.Div(
        html.A(
            children=dmc.ActionIcon(
                DashIconify(icon="mdi:download", width=24),
                size="lg",
                color="#082030",
            ),
            id=f"download-{filename}",
            download=f"{filename}.csv",
            href=f"data:text/csv;base64,{b64}",
            target="_blank",
            title="Download CSV",
            style={
                "padding": "6px 12px",
                "display": "inline-flex",
                "alignItems": "center",
                "justifyContent": "center",
            },
        ),
        style={"textAlign": "right"},
    )


# Helper to get popover content for each metadata type
def get_metadata_popover_content(icon, name, meta_type):
    popover_texts = {
        "country": f"Country: {name}",
        "author": f"Author/Organization: {name}",
        "downloads": f"Total downloads: {name}",
        "modality": f"Modality: {name}",
    }
    return popover_texts.get(meta_type, name)


# Chip renderer with hovercard
def chip_with_hovercard(text, bg_color="#F0F0F0", meta_type=None, icon=None):
    hovercard_content = get_metadata_popover_content(icon, text, meta_type)
    return dmc.HoverCard(
        width="auto",
        shadow="md",
        position="top",
        children=[
            dmc.HoverCardTarget(
                html.Span(
                    text,
                    style={
                        "backgroundColor": bg_color,
                        "padding": "4px 10px",
                        "borderRadius": "12px",
                        "margin": "2px",
                        "display": "inline-flex",
                        "alignItems": "center",
                        "fontSize": "14px",
                        "cursor": "pointer",
                        "transition": "background-color 0.15s",
                    },
                    # Add a class for hover effect
                    className="chip-hover-darken"
                )
            ),
            dmc.HoverCardDropdown(dmc.Text(hovercard_content, size="sm")),
        ],
    )


# Render multiple chips in one row, each with popover
def render_chips(metadata_list, chip_color):
    chips = []
    for icon, name, meta_type in metadata_list:
        if isinstance(icon, str) and icon.endswith((".png", ".jpg", ".jpeg", ".svg")):
            chips.append(
                dmc.HoverCard(
                    width=220,
                    shadow="md",
                    position="top",
                    children=[
                        dmc.HoverCardTarget(
                            html.Span(
                                [
                                    html.Img(
                                        src=icon,
                                        style={"height": "18px", "marginRight": "6px"},
                                    ),
                                    name,
                                ],
                                style={
                                    "backgroundColor": chip_color,
                                    "padding": "4px 10px",
                                    "borderRadius": "12px",
                                    "margin": "2px",
                                    "display": "inline-flex",
                                    "alignItems": "left",
                                    "fontSize": "14px",
                                    "cursor": "pointer",
                                },
                            )
                        ),
                        dmc.HoverCardDropdown(
                            dmc.Text(
                                get_metadata_popover_content(icon, name, meta_type),
                                size="sm",
                            )
                        ),
                    ],
                )
            )
        else:
            chips.append(
                chip_with_hovercard(f"{icon} {name}", chip_color, meta_type, icon)
            )
    return html.Div(
        chips, style={"display": "flex", "flexWrap": "wrap", "justifyContent": "left"}
    )


def render_table_content(
    df, download_df, chip_color, bar_color="#AC482A", filename="data"
):
    return html.Div(
        [
            # Add download button above the table
            df_to_download_link(download_df, filename),
            # Wrap the table in a horizontal scroll container so the table can be wide
            html.Div(
                # scroll wrapper
                html.Table(
                    [
                        html.Thead(
                            html.Tr(
                                [
                                    html.Th(
                                        "Rank",
                                        className="rank-col",
                                        style={
                                            "backgroundColor": "#F0F0F0",
                                            "textAlign": "left",
                                        },
                                    ),
                                    html.Th(
                                        "Name",
                                        className="name-col",
                                        style={
                                            "backgroundColor": "#F0F0F0",
                                            "textAlign": "left",
                                        },
                                    ),
                                    html.Th(
                                        "Metadata",
                                        className="metadata-col",
                                        style={
                                            "backgroundColor": "#F0F0F0",
                                            "textAlign": "left",
                                            "marginRight": "10px",
                                        },
                                    ),
                                    html.Th(
                                        "% of Total",
                                        className="percent-col",
                                        style={
                                            "backgroundColor": "#F0F0F0",
                                            "textAlign": "left",
                                        },
                                    ),
                                ]
                            )
                        ),
                        html.Tbody(
                            [
                                html.Tr(
                                    [
                                        html.Td(idx + 1, style={"textAlign": "center"}),
                                        html.Td(row["Name"], className="name-cell", style={"textAlign": "left"}),
                                        html.Td(render_chips(row["Metadata"], chip_color), className="metadata-cell", style={"textAlign": "left", "whiteSpace": "normal", "wordBreak": "break-word"}),
                                        html.Td(
                                            progress_bar(row["% of total"], bar_color),
                                            className="percent-cell",
                                            style={"textAlign": "center", "minWidth": "180px", "padding": "8px"},
                                        ),
                                    ]
                                )
                                for idx, row in df.iterrows()
                            ]
                        ),
                    ],
                    # allow the table to be wider than its container (minWidth prevents squish)
                    style={"borderCollapse": "collapse", "width": "100%", "minWidth": "980px", "tableLayout": "auto"},
                    className="leaderboard-table",
                ),
                className="leaderboard-scroll-wrapper",
                style={"overflowX": "auto", "-webkit-overflow-scrolling": "touch", "width": "100%"},
            ),
        ]
    )


# Function to get top N leaderboard (now accepts pandas DataFrame from DuckDB query)
def get_top_n_leaderboard(filtered_df, group_col, top_n=10, derived_author_toggle=True):
    """
    Get top N entries for a leaderboard

    Args:
        filtered_df: Pandas DataFrame (already filtered by time from DuckDB query)
        group_col: Column to group by
        top_n: Number of top entries to return
        derived_author_toggle: If True, attribute to model uploader (derived_author); if False, attribute to original model creator (author)

    Returns:
        tuple: (display_df, download_df)
    """

    # Group by and get top N
    top = (
        filtered_df.groupby(group_col)[["total_downloads", "percent_of_total"]]
        .sum()
        .nlargest(top_n, columns="total_downloads")
        .reset_index()
        .rename(
            columns={
                group_col: "Name",
                "total_downloads": "Total Value",
                "percent_of_total": "% of total",
            }
        )
    )

    # Create a downloadable version of the leaderboard
    download_top = top.copy()
    download_top["Total Value"] = download_top["Total Value"].astype(int)
    download_top["% of total"] = download_top["% of total"].round(2)

    # All relevant metadata columns
    meta_cols = meta_cols_map.get(group_col, [])

    # Collect all metadata per top n for each category (country, author, model)
    meta_map = {}
    download_map = {}

    for name in top["Name"]:
        name_data = filtered_df[filtered_df[group_col] == name]
        meta_map[name] = {}
        download_map[name] = {}
        for col in meta_cols:
            if col in name_data.columns:
                unique_vals = name_data[col].unique()
                meta_map[name][col] = list(unique_vals)
                download_map[name][col] = list(unique_vals)

    # Function to build metadata chips
    def build_metadata(nm):
        meta = meta_map.get(nm, {})
        chips = []

        # Countries
        for c in meta.get("org_country_single", []):
            if c == "United States of America":
                c = "USA"
            if c == "user":
                c = "User"
            try:
                flag_emoji = countryflag.getflag(c)
                if not flag_emoji or flag_emoji == c:
                    flag_emoji = country_emoji_fallback.get(c, "🌍")
            except Exception:
                flag_emoji = country_emoji_fallback.get(c, "🌍")
            chips.append((flag_emoji, c, "country"))
            # Add downloads chip for country (only once)

        # Author - use derived_author_toggle to determine which column
        author_key = "derived_author" if derived_author_toggle else "author"
        for a in meta.get(author_key, []):
            icon = company_icon_map.get(a, "")
            if icon == "":
                if meta.get("merged_country_groups_single", ["User"])[0] != "User":
                    icon = "🏢"
                else:
                    icon = "👤"
            chips.append((icon, a, "author"))

        # Modality
        for m in meta.get("merged_modality", []):
            if pd.notna(m):
                chips.append(("", m, "modality"))

        # Total downloads
        for d in meta.get("total_downloads", []):
            formatted_downloads = format_large_number(d)
            chips.append(("⬇️", formatted_downloads, "downloads"))

        return chips

    # Function to create downloadable dataframe metadata
    def build_download_metadata(nm):
        meta = download_map.get(nm, {})
        download_info = {}

        for col in meta_cols:
            if col not in meta or not meta[col]:
                continue

            vals = meta.get(col, [])
            if vals:
                download_info[col] = ", ".join(str(v) for v in vals if pd.notna(v))
            else:
                download_info[col] = ""

        return download_info

    # Apply metadata builder to top dataframe
    top["Metadata"] = top["Name"].astype(object).apply(build_metadata)

    # Capitalize "user" back to "User" for display
    top["Name"] = top["Name"].replace("user", "User")

    # Build download dataframe with metadata
    download_info_list = [build_download_metadata(nm) for nm in download_top["Name"]]
    download_info_df = pd.DataFrame(download_info_list)
    download_top = pd.concat([download_top, download_info_df], axis=1)

    return top[["Name", "Metadata", "% of total"]], download_top


def get_top_n_from_duckdb(
    con, group_col, top_n=10, time_filter=None, view="all_downloads"
):
    """
    Query DuckDB directly to get top N entries with minimal data transfer

    Args:
        con: DuckDB connection object
        group_col: Column to group by
        top_n: Number of top entries
        time_filter: Optional tuple of (start_timestamp, end_timestamp)

    Returns:
        Pandas DataFrame with only the rows needed for top N
    """
    # Build time filter clause
    time_clause = ""
    if time_filter:
        start = pd.to_datetime(time_filter[0], unit="s")
        end = pd.to_datetime(time_filter[1], unit="s")
        time_clause = f"WHERE time >= '{start}' AND time <= '{end}'"

    # If grouping by country, group by the transformed country column
    if group_col == "org_country_single":
        group_expr = """CASE 
            WHEN org_country_single IN ('HF', 'United States of America') THEN 'United States of America'
            WHEN org_country_single IN ('International', 'Online', 'Online?') THEN 'International/Online'
            ELSE org_country_single
        END"""
    else:
        group_expr = group_col

    # When grouping by derived_author, lookup the country where derived_author = author
    if group_col == "derived_author":
        query = f"""
        WITH base_data AS (
            SELECT 
                {group_expr} AS group_key,
                CASE 
                    WHEN org_country_single IN ('HF', 'United States of America') THEN 'United States of America'
                    WHEN org_country_single IN ('International', 'Online') THEN 'International/Online'
                    ELSE org_country_single
                END AS org_country_single,
                author,
                derived_author,
                merged_country_groups_single,
                merged_modality,
                downloads,
                model
            FROM {view}
            {time_clause}
        ),
        
        -- Create a lookup table for derived_author -> country
        author_country_lookup AS (
            SELECT DISTINCT
                author,
                FIRST_VALUE(org_country_single) OVER (PARTITION BY author ORDER BY downloads DESC) AS author_country
            FROM base_data
            WHERE author IS NOT NULL
        ),

        total_downloads_cte AS (
            SELECT SUM(downloads) AS total_downloads_all
            FROM base_data
        ),

        top_items AS (
            SELECT 
                b.group_key AS name,
                SUM(b.downloads) AS total_downloads,
                ROUND(SUM(b.downloads) * 100.0 / t.total_downloads_all, 2) AS percent_of_total,
                COALESCE(acl.author_country, ANY_VALUE(b.org_country_single)) AS org_country_single,
                ANY_VALUE(b.author) AS author,
                ANY_VALUE(b.derived_author) AS derived_author,
                ANY_VALUE(b.merged_country_groups_single) AS merged_country_groups_single,
                ANY_VALUE(b.merged_modality) AS merged_modality,
                ANY_VALUE(b.model) AS model
            FROM base_data b
            CROSS JOIN total_downloads_cte t
            LEFT JOIN author_country_lookup acl ON b.group_key = acl.author
            GROUP BY b.group_key, acl.author_country, t.total_downloads_all
        )

        SELECT *
        FROM top_items
        ORDER BY total_downloads DESC
        LIMIT {top_n};
        """
    else:
        query = f"""
        WITH base_data AS (
            SELECT 
                {group_expr} AS group_key,
                CASE 
                    WHEN org_country_single IN ('HF', 'United States of America') THEN 'United States of America'
                    WHEN org_country_single IN ('International', 'Online') THEN 'International/Online'
                    ELSE org_country_single
                END AS org_country_single,
                author,
                derived_author,
                merged_country_groups_single,
                merged_modality,
                downloads,
                model
            FROM {view}
            {time_clause}
        ),

        total_downloads_cte AS (
            SELECT SUM(downloads) AS total_downloads_all
            FROM base_data
        ),

        top_items AS (
            SELECT 
                b.group_key AS name,
                SUM(b.downloads) AS total_downloads,
                ROUND(SUM(b.downloads) * 100.0 / t.total_downloads_all, 2) AS percent_of_total,
                ANY_VALUE(b.org_country_single) AS org_country_single,
                ANY_VALUE(b.author) AS author,
                ANY_VALUE(b.derived_author) AS derived_author,
                ANY_VALUE(b.merged_country_groups_single) AS merged_country_groups_single,
                ANY_VALUE(b.merged_modality) AS merged_modality,
                ANY_VALUE(b.model) AS model
            FROM base_data b
            CROSS JOIN total_downloads_cte t
            GROUP BY b.group_key, t.total_downloads_all
        )

        SELECT *
        FROM top_items
        ORDER BY total_downloads DESC
        LIMIT {top_n};
        """

    try:
        return con.execute(query).fetchdf()
    except Exception as e:
        print(f"Error querying DuckDB: {e}")
        return pd.DataFrame()


def format_large_number(n):
    """Shorten large numbers, e.g. 5,000,000 -> '5 million'."""
    if n >= 1_000_000_000:
        return f"{n / 1_000_000_000:.1f} billion"
    elif n >= 1_000_000:
        return f"{n / 1_000_000:.1f} million"
    elif n >= 1_000:
        return f"{n / 1_000:.1f}k"
    else:
        return str(int(n))