Spaces:
Runtime error
Runtime error
| import streamlit as st | |
| from transformers import pipeline | |
| from sklearn.cluster import KMeans | |
| import numpy as np | |
| # Mock data | |
| mock_words = [ | |
| "apple", "banana", "cherry", "date", # Fruits | |
| "car", "truck", "bus", "bicycle", # Vehicles | |
| "red", "blue", "green", "yellow", # Colors | |
| "cat", "dog", "rabbit", "hamster" # Pets | |
| ] | |
| # Define available models and load them | |
| models = { | |
| 'DistilBERT': 'distilbert-base-uncased', | |
| 'BERT': 'bert-base-uncased', | |
| 'RoBERTa': 'roberta-base' | |
| } | |
| def load_models(): | |
| pipelines = {} | |
| for name, model_name in models.items(): | |
| pipelines[name] = pipeline('feature-extraction', model=model_name) | |
| return pipelines | |
| pipelines = load_models() | |
| def embed_words(words, model_name): | |
| """ | |
| Embed the given words using the specified model and return the averaged embeddings. | |
| """ | |
| embedder = pipelines[model_name] | |
| embeddings = embedder(words) | |
| return np.array([np.mean(embedding[0], axis=0) for embedding in embeddings]) | |
| def iterative_clustering(words, model_name): | |
| remaining_words = words[:] | |
| grouped_words = [] | |
| while len(remaining_words) >= 4: | |
| embeddings = embed_words(remaining_words, model_name) | |
| kmeans = KMeans(n_clusters=min(4, len(remaining_words) // 4), random_state=0).fit(embeddings) | |
| clusters = {i: [] for i in range(kmeans.n_clusters)} | |
| for word, label in zip(remaining_words, kmeans.labels_): | |
| if len(clusters[label]) < 4: | |
| clusters[label].append(word) | |
| # Select the most cohesive cluster | |
| best_cluster, best_idx = select_most_cohesive_cluster(clusters, kmeans, embeddings) | |
| # Store the best cluster and remove those words | |
| grouped_words.append(best_cluster) | |
| remaining_words = [word for word in remaining_words if word not in best_cluster] | |
| return grouped_words | |
| def select_most_cohesive_cluster(clusters, kmeans_model, embeddings): | |
| min_distance = float('inf') | |
| best_cluster = None | |
| best_idx = -1 | |
| for idx, cluster in clusters.items(): | |
| if len(cluster) == 4: | |
| cluster_embeddings = embeddings[[i for i, label in enumerate(kmeans_model.labels_) if label == idx]] | |
| centroid = kmeans_model.cluster_centers_[idx] | |
| distance = np.mean(np.linalg.norm(cluster_embeddings - centroid, axis=1)) | |
| if distance < min_distance: | |
| min_distance = distance | |
| best_cluster = cluster | |
| best_idx = idx | |
| return best_cluster, best_idx | |
| def display_clusters(clusters): | |
| for i, words in enumerate(clusters): | |
| st.markdown(f"### Group {i+1}") | |
| st.write(", ".join(words)) | |
| def main(): | |
| st.title("NYT Connections Solver") | |
| st.write("This app demonstrates solving the NYT Connections game using word embeddings and clustering.") | |
| st.write("Select an embedding model from the dropdown menu and click 'Generate Clusters' to see the grouped words.") | |
| # Dropdown menu for selecting the embedding model | |
| model_name = st.selectbox("Select Embedding Model", list(models.keys())) | |
| if st.button("Generate Clusters"): | |
| with st.spinner("Generating clusters..."): | |
| clusters = iterative_clustering(mock_words, model_name) | |
| display_clusters(clusters) | |
| if __name__ == "__main__": | |
| main() | |