emsesc committed on
Commit
fd38574
·
0 Parent(s):

initial charts

Browse files
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ venv/*
2
+ __pycache__/*
app.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import packages
2
+ from dash import Dash, html, dcc, callback, Input, Output
3
+ import pandas as pd
4
+ import pickle
5
+ import plotly.express as px
6
+ from graphs.model_market_share import create_plotly_stacked_area_chart
7
+ from graphs.model_characteristics import create_plotly_language_concentration_chart
8
+
9
# Initialize the app
app = Dash()


def _load_frame(name):
    """Return the object pickled in ``data_frames/<name>.pkl``.

    The path is relative, so the app has to be started from the directory
    that contains ``data_frames/``.
    """
    # NOTE: unpickling can execute arbitrary code. Only load files that ship
    # with this repository; never point this at untrusted input.
    with open(f'data_frames/{name}.pkl', 'rb') as f:
        return pickle.load(f)


# Pre-computed frames backing the charts
model_topk_df = _load_frame('model_topk_df')
model_gini_df = _load_frame('model_gini_df')
model_hhi_df = _load_frame('model_hhi_df')
language_concentration_df = _load_frame('language_concentration_df')
license_concentration_df = _load_frame('download_license_cumsum_df')

# Release dates drawn as labelled vertical markers on the market-share chart.
# Commented-out entries are kept as candidates that are currently hidden.
TEMP_MODEL_EVENTS = {
    # "Yolo World Mirror": "2024-03-01",
    "Llama 3": "2024-04-17",
    "Stable Cascade": "2024-02-02",
    "Stable Diffusion 3": "2024-05-30",
    # "embed/upscale": "2023-03-24",
    "DeepSeek-R1": "2025-01-20",
    "Gemma-3 12B QAT": "2025-04-15",  # gemma-3-12b-it-qat-4bit
    # "Qwen": "2025-03-05",
    # "Flux RedFlux": "2025-04-12",
    # "DeepSeek-V3": "2025-03-24",
    # "bloom": "2022-05-19",
    "DALLE2-PyTorch": "2022-06-25",
    "Stable Diffusion": "2022-08-10",
    "CLIP ViT": "2021-01-05",
    "YOLOv8": "2023-04-26",
    "Sentence Transformer MiniLM v2": "2021-08-30",
}

# NOTE(review): the chart builders index this palette modulo its length, so
# with five colors the 6th segment onwards reuses earlier colors. The charts
# below have 6 and 8 segments -- consider extending the palette.
PALETTE_0 = [
    "#335C67",
    "#FFF3B0",
    "#E09F3E",
    "#9E2A2B",
    "#540B0E",
]

# Tab 1: model market share with concentration indices overlaid
fig = create_plotly_stacked_area_chart(
    model_topk_df, model_gini_df, model_hhi_df, TEMP_MODEL_EVENTS, PALETTE_0
)

# Tab 2 (dropdown "Language Concentration")
LANG_SEGMENT_ORDER = [
    'Monolingual: EN', 'Monolingual: HR', 'Monolingual: M/LR',
    'Multilingual: HR', 'Multilingual', 'Unknown',
]
fig2 = create_plotly_language_concentration_chart(
    language_concentration_df, 'time', 'metric', 'value', LANG_SEGMENT_ORDER, PALETTE_0
)

# Tab 2 (dropdown "License") -- same stacked-area builder, different columns
LICENSE_SEGMENT_ORDER = [
    "Open Use", "Open Use (Acceptable Use Policy)", "Open Use (Non-Commercial Only)", "Attribution",
    "Acceptable Use Policy", "Non-Commercial Only", "Undocumented", "Undocumented (Acceptable Use Policy)",
]
fig3 = create_plotly_language_concentration_chart(
    license_concentration_df, 'period', 'status', 'percent', LICENSE_SEGMENT_ORDER, PALETTE_0
)

# Make global font family
fig.update_layout(font_family="Inter")
fig2.update_layout(font_family="Inter")
# The shared builder titles its legend "Language Concentration"; relabel it
# here so the license chart is not mislabelled.
fig3.update_layout(font_family="Inter", legend_title_text="License")

# App layout
app.layout = html.Div(
    [
        html.Div(
            children='Visualizing the Open Model Ecosystem',
            style={'fontSize': 28, 'fontWeight': 'bold', 'marginBottom': 10},
        ),
        html.Div(
            children='An interactive dashboard to explore trends in open models on Hugging Face',
            style={'fontSize': 16, 'marginBottom': 20},
        ),
        html.Hr(),
        dcc.Tabs([
            dcc.Tab(label='Model Market Share', children=[
                dcc.Graph(figure=fig, id='stacked-area-chart'),
            ]),
            dcc.Tab(label='Model Characteristics', children=[
                # The figure is supplied by the dropdown callback.
                dcc.Graph(id='language-concentration-chart'),
                html.Div([
                    dcc.Dropdown(
                        ['Language Concentration', 'Architecture', 'License', 'Method'],
                        'Language Concentration',
                        id='dropdown',
                    ),
                ]),
            ]),
        ]),
    ],
    style={'fontFamily': 'Inter'},
)
99
+
100
# On dropdown change, update graph
@app.callback(
    Output('language-concentration-chart', 'figure'),
    [Input('dropdown', 'value')]
)
def update_graph(selected_metric):
    """Return the figure that matches the dropdown selection.

    Args:
        selected_metric: Current dropdown value, or ``None`` when the
            dropdown has been cleared.

    Returns:
        The pre-built figure for 'Language Concentration' or 'License'.
        Any other selection gets an empty placeholder figure carrying an
        explanatory message, so the graph never silently goes blank.
    """
    ready_figures = {
        'Language Concentration': fig2,
        'License': fig3,
    }
    chosen = ready_figures.get(selected_metric)
    if chosen is not None:
        return chosen

    if selected_metric is None:
        message = 'Select a characteristic to display.'
    else:
        message = f'No chart is available for "{selected_metric}" yet.'

    # Plain-dict figure: hidden axes plus one centered annotation.
    return {
        'data': [],
        'layout': {
            'height': 200,
            'font': {'family': 'Inter'},
            'plot_bgcolor': 'white',
            'xaxis': {'visible': False},
            'yaxis': {'visible': False},
            'annotations': [{
                'text': message,
                'xref': 'paper',
                'yref': 'paper',
                'x': 0.5,
                'y': 0.5,
                'showarrow': False,
            }],
        },
    }


# Run the app
if __name__ == '__main__':
    # NOTE: debug=True turns on Dash's dev tools; switch it off for deployments.
    app.run(debug=True)
data_frames/agg_devsize_downloads.pkl ADDED
Binary file (24.1 kB). View file
 
data_frames/agg_natsize_downloads.pkl ADDED
Binary file (2.09 kB). View file
 
data_frames/dev_gini_df.pkl ADDED
Binary file (5.59 kB). View file
 
data_frames/dev_hhi_df.pkl ADDED
Binary file (5.59 kB). View file
 
data_frames/dev_topk_df.pkl ADDED
Binary file (28.3 kB). View file
 
data_frames/download_arch_cumsum_df.pkl ADDED
Binary file (78.6 kB). View file
 
data_frames/download_license_cumsum_df.pkl ADDED
Binary file (64.5 kB). View file
 
data_frames/download_method_cumsum_df.pkl ADDED
Binary file (78.6 kB). View file
 
data_frames/download_openness_cumsum_df.pkl ADDED
Binary file (29.3 kB). View file
 
data_frames/language_concentration_df.pkl ADDED
Binary file (28.4 kB). View file
 
data_frames/model_gini_df.pkl ADDED
Binary file (5.59 kB). View file
 
data_frames/model_hhi_df.pkl ADDED
Binary file (5.59 kB). View file
 
data_frames/model_topk_df.pkl ADDED
Binary file (28.3 kB). View file
 
data_frames/nat_gini_df.pkl ADDED
Binary file (5.59 kB). View file
 
data_frames/nat_hhi_df.pkl ADDED
Binary file (5.59 kB). View file
 
data_frames/nat_topk_df.pkl ADDED
Binary file (28.3 kB). View file
 
graphs/model_characteristics.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import plotly.graph_objects as go
2
+
3
def create_plotly_language_concentration_chart(
    language_concentration_df,
    period_col,
    metric_col,
    value_col,
    LANG_SEGMENT_ORDER,
    PALETTE_0,
    legend_title="Language Concentration",
):
    """Build a stacked-area Plotly figure with one layer per segment.

    Args:
        language_concentration_df: Long-format frame with one row per
            (period, segment) pair.
        period_col: Name of the column used for the x-axis.
        metric_col: Name of the column holding the segment label.
        value_col: Name of the column holding the value to stack.
        LANG_SEGMENT_ORDER: Segment labels in stacking order (bottom first).
            Labels absent from the frame produce an empty trace.
        PALETTE_0: Fill colors; reused cyclically when there are more
            segments than colors.
        legend_title: Title shown above the legend. Defaults to the
            original hard-coded text so existing callers are unaffected.

    Returns:
        plotly.graph_objects.Figure: The configured figure.
    """
    fig = go.Figure()

    for i, segment in enumerate(LANG_SEGMENT_ORDER):
        # Rows for this segment, in chronological order
        is_segment = language_concentration_df[metric_col] == segment
        segment_rows = language_concentration_df[is_segment].sort_values(period_col)

        color = PALETTE_0[i % len(PALETTE_0)]

        fig.add_trace(
            go.Scatter(
                x=segment_rows[period_col],
                y=segment_rows[value_col],
                name=segment,
                mode='lines',
                # The outline is invisible (width 0) but its color is what
                # the hover label shows, so keep it equal to the fill.
                line=dict(width=0, color=color),
                fill='tonexty' if i > 0 else 'tozeroy',
                fillcolor=color,
                stackgroup='one',
                hovertemplate='<b>%{fullData.name}</b><br>' +
                              'Time: %{x}<br>' +
                              'Value: %{y}<extra></extra>'
            )
        )

    # Update layout
    fig.update_layout(
        width=1000,
        height=200,
        font_family="Times New Roman",
        font_size=14,
        showlegend=True,
        legend=dict(
            title=legend_title,
            orientation="v",
            yanchor="top",
            y=1,
            xanchor="left",
            x=1.02
        ),
        margin=dict(l=60, r=150, t=40, b=60),  # Extra right margin for legend
        plot_bgcolor='white',
        hovermode='x unified'
    )

    # Both axes share the same untitled, light-gray grid styling
    grid_style = dict(
        title_text="",
        showgrid=True,
        gridcolor='lightgray',
        gridwidth=1
    )
    fig.update_xaxes(**grid_style)
    fig.update_yaxes(**grid_style)

    return fig
graphs/model_market_share.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import plotly.graph_objects as go
2
+ from plotly.subplots import make_subplots
3
+
4
def _add_overlay_line(fig, x, y, name, color):
    """Add one concentration-index line to the secondary (right) y-axis."""
    fig.add_trace(
        go.Scatter(
            x=x,
            y=y,
            name=name,
            mode='lines',
            line=dict(color=color, width=3),
            hovertemplate='<b>' + name + '</b><br>' +
                          'Time: %{x}<br>' +
                          'Value: %{y:.3f}<extra></extra>'
        ),
        # secondary_y=True already binds the trace to the right-hand axis,
        # so no explicit yaxis override is needed.
        secondary_y=True
    )


def create_plotly_stacked_area_chart(
    model_topk_df,
    model_gini_df,
    model_hhi_df,
    TEMP_MODEL_EVENTS,
    PALETTE_0
):
    """Build the market-share figure: stacked areas, index lines, events.

    Args:
        model_topk_df: Frame with 'time', 'metric' and 'value' columns; one
            row per (time, rank bucket) pair.
        model_gini_df: Frame with 'time' and 'value' columns for the Gini
            coefficient line.
        model_hhi_df: Frame with 'time' and 'value' columns for the HHI
            line; values are multiplied by 10 before plotting.
        TEMP_MODEL_EVENTS: Mapping of event label to x position; each entry
            becomes a dashed vertical line with the label above it.
        PALETTE_0: Area colors; reused cyclically when there are more rank
            buckets than colors.

    Returns:
        plotly.graph_objects.Figure: Figure with the areas on the primary
        y-axis and the two index lines on the secondary y-axis.
    """
    # Single subplot with a secondary y-axis for the index lines
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    # Rank buckets in stacking order (bottom first)
    metric_order = ['Top 1', 'Top 1 - 10', 'Top 10 - 100', 'Top 100 - 1000', 'Top 1000 - 10000', 'Rest']

    for i, bucket in enumerate(metric_order):
        # Rows for this bucket, in chronological order
        bucket_rows = model_topk_df[model_topk_df['metric'] == bucket].sort_values('time')
        color = PALETTE_0[i % len(PALETTE_0)]

        fig.add_trace(
            go.Scatter(
                x=bucket_rows['time'],
                y=bucket_rows['value'],
                name=bucket,
                mode='lines',
                line=dict(width=0, color=color),
                fill='tonexty' if i > 0 else 'tozeroy',
                fillcolor=color,
                stackgroup='one',
                hovertemplate='<b>%{fullData.name}</b><br>' +
                              'Time: %{x}<br>' +
                              'Value: %{y}<extra></extra>'
            ),
            secondary_y=False
        )

    # Overlay lines: Gini coefficient
    gini_data = model_gini_df.sort_values('time')
    _add_overlay_line(
        fig, gini_data['time'], gini_data['value'], 'Gini Coefficient', '#6b46c1'
    )

    # Overlay lines: HHI, scaled by 10 as the trace name indicates
    hhi_data = model_hhi_df.sort_values('time')
    _add_overlay_line(
        fig, hhi_data['time'], hhi_data['value'] * 10, 'HHI (×10)', '#ec4899'
    )

    # One dashed full-height marker plus a label per event
    for event_name, event_date in TEMP_MODEL_EVENTS.items():
        fig.add_shape(
            type="line",
            x0=event_date, x1=event_date,
            y0=0, y1=1,
            yref="paper",  # span the whole plot height regardless of data range
            line=dict(color='#333333', width=2, dash='dash')
        )

        fig.add_annotation(
            x=event_date,
            y=1,
            yref="paper",
            text=event_name,
            showarrow=False,
            yshift=10,  # lift the label just above the plot area
            font=dict(size=12)
        )

    # Update layout
    fig.update_layout(
        width=1000,
        height=200,
        font_family="Inter",
        font_size=14,
        showlegend=False,  # Set to True if you want to show legend
        margin=dict(l=60, r=60, t=40, b=60),
        plot_bgcolor='white',
        hovermode='x unified'
    )

    # Update x-axis
    fig.update_xaxes(
        title_text="",
        showgrid=True,
        gridcolor='lightgray',
        gridwidth=1
    )

    # Update primary y-axis (left)
    fig.update_yaxes(
        title_text="Model Market Share",
        showgrid=True,
        gridcolor='lightgray',
        gridwidth=1,
        secondary_y=False
    )

    # Update secondary y-axis (right); no grid, to avoid two overlapping grids
    fig.update_yaxes(
        title_text="Concentration Indices",
        showgrid=False,
        secondary_y=True
    )

    return fig
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ pandas
2
+ dash
3
+ plotly