Spaces:
Runtime error
Runtime error
| import streamlit as st | |
| import numpy as np | |
| import pandas as pd | |
| import warnings | |
| warnings.filterwarnings('ignore') | |
| import math | |
| from scipy.cluster.hierarchy import dendrogram, linkage | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| import itertools | |
| import plotly.figure_factory as ff | |
| from community import community_louvain | |
| import networkx as nx | |
| from sklearn.metrics.pairwise import cosine_distances | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| from sklearn.feature_extraction.text import CountVectorizer | |
| from sklearn.cluster import AgglomerativeClustering | |
| from wordcloud import WordCloud | |
| import plotly.graph_objects as go | |
def create_dendrogram(X, labels):
    """Build a left-oriented, single-linkage dendrogram figure.

    Args:
        X: Observation matrix with one row per item. Either a sparse matrix
            exposing ``toarray()`` (e.g. vectorizer output) or anything
            ``np.asarray`` can turn into a 2-D array.
        labels: One label per row of ``X``, used for the leaf ticks.

    Returns:
        The Plotly figure produced by ``ff.create_dendrogram``.
    """
    # ff.create_dendrogram expects the raw observations and derives the
    # distances and linkage itself. Handing it a precomputed linkage matrix
    # (n-1 rows) makes it cluster the wrong data and mismatch the n labels.
    observations = X.toarray() if hasattr(X, "toarray") else np.asarray(X)

    def single_linkage(condensed_distances):
        # Receives the condensed distance vector from the default distfun.
        return linkage(condensed_distances, "single")

    fig = ff.create_dendrogram(
        observations,
        orientation='left',
        labels=labels,
        linkagefun=single_linkage,
    )
    return fig
def load_data():
    """Read the scraped model table from its CSV file.

    Returns:
        A pandas DataFrame with the contents of
        ``HuggingFaceLLMsWithParamsAndReadmeLinks.csv`` (path is relative to
        the working directory).
    """
    return pd.read_csv("HuggingFaceLLMsWithParamsAndReadmeLinks.csv")
# Load the model table through the shared loader so the CSV path lives in
# exactly one place.
df = load_data()

# Page header and citation.
st.title("Constellation: An Atlas of 15,000 Large Language Models")
st.write("15,821 to be precise. Scraped from Hugging Face on July 18, 2023.")
st.write("Please cite: Gao, S., & Gao, A. K. (2023, July 19). On the Origin of LLMs: An Evolutionary Tree and Graph for 15,821 Large Language Models. ArXiv.org; ArXiv. https://doi.org/10.48550/arXiv.2307.09793")

# User controls read by the clustering section further down.
# Models with downloads at or below this value are excluded.
threshold = st.number_input("Enter the minimum number of downloads an LLM must have to be considered.", value=10000)
# Requested number of clusters (bounded to 2..50 by the widget).
numClusters = st.number_input("Number of clusters to group into.", value=20, min_value=2, max_value=50)
# Whether to render one word cloud per cluster.
wordClouds = st.checkbox("Show word clouds?")
def create_downloads_vs_likes_scatter(dataframe):
    """Build a downloads-vs-likes scatter plot of the given models.

    Args:
        dataframe: DataFrame with 'downloads', 'likes' and 'model_name'
            columns. It is read only; the caller's frame is not modified.

    Returns:
        A Plotly ``go.Figure`` with one marker per model and a hover label
        showing name, downloads and likes.
    """
    # Coerce into a local Series rather than assigning back to the column:
    # the caller may pass a filtered slice, and writing to it would mutate
    # (or warn about mutating) data the caller still owns.
    likes = pd.to_numeric(dataframe['likes'], errors='coerce')

    # Drop the outlier point at 14M likes. Rows whose likes could not be
    # parsed (NaN) compare unequal and are therefore kept.
    keep = likes != 14000000
    plotted = dataframe[keep]

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=plotted['downloads'],
        y=likes[keep],
        mode='markers',
        marker=dict(color='blue', size=7, opacity=0.7),
        text=plotted['model_name'],
        hovertemplate="Model Name: %{text}<br>Downloads: %{x}<br>Likes: %{y}<extra></extra>",
    ))
    fig.update_layout(
        title='Downloads vs Likes',
        xaxis_title='Downloads',
        yaxis_title='Likes',
    )
    return fig
if st.button("Run Clustering"):
    # Imported here so this section is self-contained; scipy is already a
    # dependency of the file.
    from scipy.spatial.distance import squareform

    # ------------------------------------------------------------------
    # Select the models to analyse
    # ------------------------------------------------------------------
    df_filtered = df[df['downloads'] > threshold]
    # .copy() gives an independent frame, so adding the 'cluster' column
    # below does not write into a slice of `df`.
    df_extra_filtered = df_filtered.drop_duplicates(subset='model_name', keep='first').copy()
    model_names = df_extra_filtered['model_name'].tolist()
    n_models = len(model_names)

    # Clustering and the dendrogram need at least two items; stop with a
    # message instead of failing inside the vectorizer or scipy.
    if n_models < 2:
        st.warning("Fewer than two models meet the download threshold; lower it to run the clustering.")
        st.stop()

    # ------------------------------------------------------------------
    # Dendrogram of model names (character n-gram TF-IDF, cosine distance)
    # ------------------------------------------------------------------
    # This TF-IDF matrix is reused for the similarity graph further down.
    tfidf = TfidfVectorizer(analyzer='char', ngram_range=(2, 8)).fit_transform(model_names)

    def distfun(X):
        # linkage() treats any 2-D input as observation vectors, so the
        # square cosine-distance matrix must be condensed first.
        # checks=False tolerates floating-point asymmetry in the matrix.
        return squareform(cosine_distances(X), checks=False)

    def linkagefun(dist_array):
        return linkage(dist_array, "single")

    fig = ff.create_dendrogram(
        tfidf.toarray(),
        orientation='bottom',
        labels=model_names,
        distfun=distfun,
        linkagefun=linkagefun,
    )
    st.plotly_chart(fig, use_container_width=True)

    # ------------------------------------------------------------------
    # Agglomerative clustering on character n-gram counts
    # ------------------------------------------------------------------
    counts = CountVectorizer(analyzer='char', ngram_range=(3, 6)).fit_transform(model_names)
    # Honour the user's requested cluster count, capped at the number of
    # models because there cannot be more clusters than samples.
    n_clusters = min(int(numClusters), n_models)
    clustering = AgglomerativeClustering(n_clusters=n_clusters).fit(counts.toarray())
    df_extra_filtered['cluster'] = clustering.labels_

    cluster_counts = df_extra_filtered['cluster'].value_counts()
    fig = go.Figure([go.Bar(x=cluster_counts.index, y=cluster_counts.values)])
    fig.update_layout(title='Number of Models per Cluster', xaxis_title='Cluster', yaxis_title='Number of Models')
    st.plotly_chart(fig)

    # ------------------------------------------------------------------
    # Similarity graph: one node per model, edge when cosine similarity > 0.2
    # ------------------------------------------------------------------
    sim_matrix = cosine_similarity(tfidf)

    G = nx.Graph()
    for i, name in enumerate(model_names):
        G.add_node(i, label=name)

    # Strict upper triangle = every unordered pair (i < j) exactly once,
    # visited in the same row-major order as a nested i/j loop.
    rows, cols = np.nonzero(np.triu(sim_matrix > 0.2, k=1))
    for i, j in zip(rows.tolist(), cols.tolist()):
        G.add_edge(i, j, weight=sim_matrix[i, j])

    # Node positions for the plot, then Louvain communities.
    # NOTE(review): earlier code also computed a per-community grid layout
    # but never used it for plotting; that dead computation was removed.
    pos = nx.spring_layout(G)
    partition = community_louvain.best_partition(G)

    # community id -> list of member nodes, built in a single pass.
    members = {}
    for node, community in partition.items():
        members.setdefault(community, []).append(node)
    # Shared colour range so background blobs and nodes use matching colours.
    max_community = max(max(members), 1)

    # Per-node plotting data.
    x, y, labels, ranks, downloads, likes, params, node_communities = [], [], [], [], [], [], [], []
    for node, community in partition.items():
        model_info = df_extra_filtered.iloc[node]
        x.append(pos[node][0])
        y.append(pos[node][1])
        labels.append(model_info['model_name'])
        ranks.append(model_info['rank'])
        downloads.append(model_info['downloads'])
        likes.append(model_info['likes'])
        params.append(model_info['params_millions'] if pd.notnull(model_info['params_millions']) else 'N/A')
        node_communities.append(community)

    fig = go.Figure()

    # Background blob at the centroid of every community with > 1 node,
    # sized in proportion to the community.
    for community in sorted(members):
        nodes_in_community = members[community]
        if len(nodes_in_community) <= 1:
            continue
        centroid_x = np.mean([pos[node][0] for node in nodes_in_community])
        centroid_y = np.mean([pos[node][1] for node in nodes_in_community])
        fig.add_trace(go.Scatter(
            x=[centroid_x], y=[centroid_y],
            mode='markers',
            marker=dict(
                size=len(nodes_in_community) * 5,
                color=community,
                cmin=0,
                cmax=max_community,
                opacity=0.1,
            ),
            hoverinfo='none',
            showlegend=False,
        ))

    # Nodes, coloured by their own community (not by a leftover loop value).
    fig.add_trace(go.Scatter(
        x=x, y=y,
        mode='markers',
        marker=dict(size=3, color=node_communities, cmin=0, cmax=max_community),
        text=labels,
        customdata=np.stack((ranks, downloads, likes, params), axis=-1),
        hovertemplate=(
            "Model Name: %{text}<br>"
            "Rank: %{customdata[0]}<br>"
            "Downloads: %{customdata[1]}<br>"
            "Likes: %{customdata[2]}<br>"
            "Params (millions): %{customdata[3]}"
            "<extra></extra>"
        ),
    ))

    # Edges, with line width normalised by the largest edge weight. The
    # maximum is computed once; the default covers a graph with no edges.
    edge_weights = nx.get_edge_attributes(G, 'weight')
    max_weight = max(edge_weights.values(), default=1.0)
    for edge in G.edges():
        line_width = G.edges[edge]['weight'] / max_weight
        fig.add_trace(go.Scatter(
            x=[pos[edge[0]][0], pos[edge[1]][0]],
            y=[pos[edge[0]][1], pos[edge[1]][1]],
            mode='lines',
            line=dict(width=line_width),
            hoverinfo='none',
        ))

    fig.update_layout(showlegend=False, hovermode='closest')
    st.plotly_chart(fig)

    # ------------------------------------------------------------------
    # Top 20 models by number of connections (node degree)
    # ------------------------------------------------------------------
    degrees = dict(G.degree())
    top_20_models = sorted(degrees.items(), key=lambda item: item[1], reverse=True)[:20]
    st.subheader("Top 20 Models by Number of Connections")
    for node, degree in top_20_models:
        st.write(f"{model_names[node]}: {degree} connections")

    # ------------------------------------------------------------------
    # Representative (highest-degree) model of each community
    # ------------------------------------------------------------------
    community_ids = sorted(members)
    df_reps = pd.DataFrame({
        'Community ID': community_ids,
        'Size': [len(members[comm]) for comm in community_ids],
        'Representative Model': [
            model_names[max(members[comm], key=degrees.get)] for comm in community_ids
        ],
    })
    df_reps.sort_values(by='Size', ascending=False, inplace=True)
    st.subheader("Representative for each community, sorted by community size.")
    st.dataframe(df_reps)

    # ------------------------------------------------------------------
    # Optional word cloud per agglomerative cluster
    # ------------------------------------------------------------------
    if wordClouds:
        for name, group in df_extra_filtered.groupby('cluster'):
            # All model names of the cluster joined into one text.
            text = ' '.join(group['model_name'])
            image = WordCloud().generate(text).to_image()
            st.image(image, use_column_width=True)
            st.write(f'Word Cloud for Cluster {name}')

    # Downloads-vs-likes scatter for the filtered models. Kept inside the
    # button branch because df_extra_filtered only exists here.
    scatter_plot = create_downloads_vs_likes_scatter(df_extra_filtered)
    st.plotly_chart(scatter_plot, use_container_width=True)