Spaces:
Sleeping
Sleeping
| import networkx as nx | |
| from sklearn.cluster import HDBSCAN | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
| from sklearn.manifold import TSNE | |
| import umap | |
| from sklearn.cluster import KMeans | |
| from adjustText import adjust_text | |
| from constants import high_level_families, primary_families_branches | |
def filter_languages_by_families(matrix, languages, families):
    """
    Restricts a square matrix and its language list to the given families.

    Parameters:
    - matrix: 2D NumPy array that is indexed by language position on both axes.
    - languages: list of languages; assumed to be ordered like the rows/columns of matrix.
    - families: collection of family names to keep (tested with `in`).

    Returns:
    - filtered_matrix: the sub-matrix of rows and columns of the kept languages.
    - filtered_languages: list of the kept languages, in their original order.

    Every language is looked up in high_level_families with a plain subscript,
    so a language missing from that mapping makes the lookup fail.
    """
    kept_positions = []
    kept_languages = []
    for position, language in enumerate(languages):
        if high_level_families[language] in families:
            kept_positions.append(position)
            kept_languages.append(language)
    # np.ix_ builds an open mesh so the same positions select rows and columns.
    submatrix = matrix[np.ix_(kept_positions, kept_positions)]
    return submatrix, kept_languages
def get_dynamic_color_map(n_colors):
    """
    Builds a list of n_colors colors sampled from a matplotlib colormap.

    Parameters:
    - n_colors: int, the number of distinct colors required.

    Returns:
    - color_map: list with one color per index, each being whatever the
      colormap returns when called with i / n_colors (RGBA tuples in matplotlib).
    """
    # "tab20" is used up to 20 colors; above that the "hsv" map is sampled instead.
    if n_colors <= 20:
        palette_name = "tab20"
    else:
        palette_name = "hsv"
    palette = plt.get_cmap(palette_name)
    return [palette(index / n_colors) for index in range(n_colors)]
def cluster_languages_by_families(languages):
    """
    Assigns every language a cluster ID derived from its high-level family.

    Parameters:
    - languages: list of languages; each one is looked up in high_level_families.

    Returns:
    - clusters: list with one integer per language, the position of its family in legend.
    - legend: sorted list of the distinct families, so legend[cluster_id] is the family name.
    """
    lang_families = [high_level_families[lang] for lang in languages]
    legend = sorted(set(lang_families))
    # legend holds unique values, so a dict gives the same index as legend.index()
    # without rescanning the list for every language.
    family_to_cluster = {family: idx for idx, family in enumerate(legend)}
    clusters = [family_to_cluster[family] for family in lang_families]
    return clusters, legend
def cluster_languages_by_subfamilies(languages):
    """
    Assigns every language a cluster ID derived from its family and branch.

    The label of a language is "<family> (<branch>)", built from
    high_level_families and primary_families_branches.

    Parameters:
    - languages: list of languages; each one is looked up in both mappings.

    Returns:
    - clusters: list with one integer per language, the position of its label in legend.
    - legend: sorted list of the distinct labels, so legend[cluster_id] is the label text.
    """
    labels = [
        high_level_families[lang] + f" ({primary_families_branches[lang]})"
        for lang in languages
    ]
    legend = sorted(set(labels))
    # legend holds unique values, so a dict gives the same index as legend.index()
    # without rescanning the list for every language.
    label_to_cluster = {label: idx for idx, label in enumerate(legend)}
    clusters = [label_to_cluster[label] for label in labels]
    return clusters, legend
def plot_mst(
    matrix,
    languages,
    clusters,
    legend=None,
    fig_size=(20, 20),
):
    """
    Plots a Minimum Spanning Tree (MST) built from a pairwise distance matrix.

    Parameters:
    - matrix: 2D NumPy array (N x N) of pairwise distances; only the upper triangle (i < j) is read.
    - languages: list of length N containing the label drawn at each node.
    - clusters: list of length N containing the cluster assignment (or ID) for each node.
    - legend: optional object subscriptable by cluster ID (e.g. a list indexed by
      integer IDs, or a dict) giving the legend text; defaults to str(cluster).
    - fig_size: figure size forwarded to plt.subplots.

    Returns:
    - fig: the matplotlib figure containing the plot.
    """
    # Create an empty undirected graph
    G = nx.Graph()
    # Number of nodes
    N = len(languages)
    # Register every node explicitly. Nodes would otherwise only be created as
    # edge endpoints, so an input with no pairs (N == 1) would leave the graph
    # empty and the pos[i] lookup below would raise KeyError.
    G.add_nodes_from(range(N))
    # Add edges to the graph from the distance matrix.
    # Only iterate over the upper triangle of the matrix (i < j)
    for i in range(N):
        for j in range(i + 1, N):
            G.add_edge(i, j, weight=matrix[i, j])
    # Compute the Minimum Spanning Tree using NetworkX's built-in function.
    mst = nx.minimum_spanning_tree(G)
    # Choose a layout for the MST. Here we use Kamada-Kawai layout which considers edge weights.
    pos = nx.kamada_kawai_layout(mst, weight="weight")
    # Map each cluster to a color
    unique_clusters = sorted(set(clusters))
    cmap = get_dynamic_color_map(len(unique_clusters))
    cluster_colors = {cluster: cmap[i] for i, cluster in enumerate(unique_clusters)}
    node_colors = [cluster_colors.get(cluster) for cluster in clusters]
    # Create a figure for plotting.
    fig, ax = plt.subplots(figsize=fig_size)
    # Draw the MST edges.
    nx.draw_networkx_edges(mst, pos, edge_color="gray", ax=ax)
    # Draw the nodes with colors corresponding to their clusters.
    nx.draw_networkx_nodes(
        mst, pos, node_color=node_colors, node_size=100, ax=ax, alpha=0.7
    )
    # Instead of directly drawing labels, we create text objects to adjust them later
    texts = []
    for i, label in enumerate(languages):
        x, y = pos[i]
        texts.append(ax.text(x, y, label, fontsize=10))
    # Adjust text labels to minimize overlap.
    adjust_text(texts, expand_text=(1.05, 1.2))
    # Add a legend for clusters
    if legend is None:
        legend = {cluster: str(cluster) for cluster in unique_clusters}
    legend_handles = [
        plt.Line2D(
            [0],
            [0],
            marker="o",
            color="w",
            markerfacecolor=cluster_colors[cluster],
            markersize=10,
            alpha=0.7,
            label=legend[cluster],
        )
        for cluster in unique_clusters
    ]
    ax.legend(handles=legend_handles, title="Clusters", loc="best")
    # Remove axis for clarity.
    ax.axis("off")
    return fig
def cluster_languages_kmeans(dist_matrix, languages, n_clusters=5):
    """
    Clusters languages by running KMeans on a distance matrix.

    Parameters:
    - dist_matrix: 2D NumPy array (N x N) representing the pairwise distances between
      languages; it is handed to KMeans.fit_predict as-is.
    - languages: list of languages; returned untouched.
    - n_clusters: int, the number of clusters to form.

    Returns:
    - dist_matrix: the input matrix, unchanged.
    - languages: the input languages, unchanged.
    - clusters: the cluster labels produced by KMeans.fit_predict.
    """
    # random_state is pinned to a fixed seed.
    labels = KMeans(n_clusters=n_clusters, random_state=23).fit_predict(dist_matrix)
    # NOTE: dropping clusters that contain a single element is currently disabled,
    # so nothing is filtered here; the inputs are passed back alongside the labels.
    return dist_matrix, languages, labels
def cluster_languages_hdbscan(dist_matrix, languages, min_cluster_size=2):
    """
    Clusters languages with HDBSCAN on a precomputed distance matrix and drops
    every point labelled -1.

    Parameters:
    - dist_matrix: 2D NumPy array (N x N) representing the pairwise distances between languages.
    - languages: list of languages, one per row/column of dist_matrix.
    - min_cluster_size: int, the minimum size of clusters.

    Returns:
    - filtered_matrix: dist_matrix restricted to the rows/columns of the kept points.
    - filtered_languages: NumPy array of the kept languages.
    - filtered_clusters: NumPy array with the cluster ID of each kept language.
    """
    model = HDBSCAN(metric="precomputed", min_cluster_size=min_cluster_size)
    labels = model.fit_predict(dist_matrix)
    # Positions of all points whose label is not -1.
    kept = np.flatnonzero(labels != -1)
    return (
        dist_matrix[np.ix_(kept, kept)],
        np.array(languages)[kept],
        np.array(labels)[kept],
    )
def plot_distances_tsne(
    matrix,
    languages,
    clusters,
    legend=None,
    fig_size=(16, 12),
    perplexity=30.0,
):
    """
    Plots all languages from the distances matrix using t-SNE and colors them by clusters.

    Parameters:
    - matrix: 2D array (N x N) of pairwise distances, used with metric="precomputed".
    - languages: list of length N containing the label drawn at each point.
    - clusters: list of length N containing the cluster assignment (or ID) for each point.
    - legend: optional object subscriptable by cluster ID (e.g. a list indexed by
      integer IDs, or a dict) giving the legend text; defaults to str(cluster).
    - fig_size: figure size forwarded to plt.subplots.
    - perplexity: t-SNE perplexity; lowered to N - 1 when the requested value
      is too large for the number of samples.

    Returns:
    - fig: the matplotlib figure containing the plot.
    """
    # Recent scikit-learn versions raise ValueError when perplexity >= n_samples,
    # which the default of 30 triggers on small (e.g. family-filtered) inputs.
    n_samples = len(matrix)
    if n_samples > 1:
        perplexity = min(perplexity, n_samples - 1)
    tsne = TSNE(
        n_components=2,
        random_state=23,
        metric="precomputed",
        init="random",
        perplexity=perplexity,
    )
    tsne_results = tsne.fit_transform(matrix)
    # Map each cluster to a color
    unique_clusters = sorted(set(clusters))
    cmap = get_dynamic_color_map(len(unique_clusters))
    cluster_colors = {cluster: cmap[i] for i, cluster in enumerate(unique_clusters)}
    fig, ax = plt.subplots(figsize=fig_size)
    ax.scatter(
        tsne_results[:, 0],
        tsne_results[:, 1],
        c=[cluster_colors[cluster] for cluster in clusters],
        alpha=0.7,
    )
    # Instead of directly drawing labels, we create text objects to adjust them later
    texts = []
    for i, label in enumerate(languages):
        x, y = tsne_results[i, 0], tsne_results[i, 1]
        texts.append(ax.text(x, y, label, fontsize=10))
    # Adjust text labels to minimize overlap.
    adjust_text(texts, expand_text=(1.05, 1.2))
    # Add a legend for clusters
    if legend is None:
        legend = {cluster: str(cluster) for cluster in unique_clusters}
    legend_handles = [
        plt.Line2D(
            [0],
            [0],
            marker="o",
            color="w",
            markerfacecolor=cluster_colors[cluster],
            markersize=10,
            label=legend[cluster],
        )
        for cluster in unique_clusters
    ]
    ax.legend(handles=legend_handles, title="Clusters", loc="best")
    # Axes carry no meaning for a t-SNE embedding, so they are hidden.
    ax.axis("off")
    return fig
def plot_distances_umap(
    matrix,
    languages,
    clusters,
    legend=None,
    fig_size=(16, 12),
):
    """
    Draws a 2D UMAP embedding of a precomputed distance matrix, one labelled
    point per language, colored by cluster.

    Parameters:
    - matrix: 2D array (N x N) of pairwise distances, used with metric="precomputed".
    - languages: list of length N containing the label drawn at each point.
    - clusters: list of length N containing the cluster assignment (or ID) for each point.
    - legend: optional object subscriptable by cluster ID giving the legend text;
      defaults to str(cluster).
    - fig_size: figure size forwarded to plt.subplots.

    Returns:
    - fig: the matplotlib figure containing the plot.
    """
    reducer = umap.UMAP(metric="precomputed", random_state=23)
    embedding = reducer.fit_transform(matrix)

    # One color per distinct cluster ID, assigned in sorted order.
    cluster_ids = sorted(set(clusters))
    palette = get_dynamic_color_map(len(cluster_ids))
    color_of = dict(zip(cluster_ids, palette))

    fig, ax = plt.subplots(figsize=fig_size)
    point_colors = [color_of[cluster_id] for cluster_id in clusters]
    ax.scatter(embedding[:, 0], embedding[:, 1], c=point_colors, alpha=0.7)

    # Labels are created as text objects first so adjust_text can move them apart.
    texts = [
        ax.text(embedding[row, 0], embedding[row, 1], name, fontsize=10)
        for row, name in enumerate(languages)
    ]
    adjust_text(texts, expand_text=(1.05, 1.2))

    if legend is None:
        legend = {cluster_id: str(cluster_id) for cluster_id in cluster_ids}
    handles = []
    for cluster_id in cluster_ids:
        marker = plt.Line2D(
            [0],
            [0],
            marker="o",
            color="w",
            markerfacecolor=color_of[cluster_id],
            markersize=10,
            label=legend[cluster_id],
        )
        handles.append(marker)
    ax.legend(handles=handles, title="Clusters", loc="best")

    ax.axis("off")
    return fig