Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import numpy as np | |
| from scipy.spatial.distance import cosine | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| from sklearn.decomposition import PCA | |
# --- Simulate a small pre-trained Word2Vec model ---
def _unit_vector(*components):
    """Return ``components`` as a NumPy vector scaled to unit L2 norm."""
    raw = np.array(components)
    return raw / np.linalg.norm(raw)


# Hand-written 4D stand-ins for real embeddings. Every entry is stored already
# normalized to unit length, so the table never holds un-normalized vectors.
dummy_word_vectors = {
    'cat': _unit_vector(0.9, 0.7, 0.1, 0.2),
    'dog': _unit_vector(0.8, 0.8, 0.3, 0.1),
    'kitten': _unit_vector(0.85, 0.75, 0.15, 0.25),
    'puppy': _unit_vector(0.75, 0.85, 0.25, 0.15),
    'fish': _unit_vector(0.1, 0.2, 0.9, 0.8),
    'bird': _unit_vector(0.2, 0.1, 0.8, 0.9),
    'ocean': _unit_vector(0.05, 0.15, 0.95, 0.85),
    'sky': _unit_vector(0.25, 0.05, 0.85, 0.95),
    'run': _unit_vector(0.6, 0.3, 0.1, 0.1),
    'walk': _unit_vector(0.55, 0.35, 0.15, 0.05),
    'jump': _unit_vector(0.65, 0.25, 0.05, 0.15),
    'king': _unit_vector(0.9, 0.1, 0.1, 0.8),
    'queen': _unit_vector(0.8, 0.2, 0.2, 0.9),
    'man': _unit_vector(0.9, 0.15, 0.05, 0.7),
    'woman': _unit_vector(0.85, 0.1, 0.15, 0.85),
    'prince': _unit_vector(0.88, 0.12, 0.12, 0.82),
    'princess': _unit_vector(0.83, 0.18, 0.18, 0.88),
}
# --- Function to find nearest neighbors and generate plot ---
def _draw_neighbor_projection(ax, pca, search_word, words_to_plot):
    """Draw the vocabulary, the search word and its neighbors on ``ax``.

    Args:
        ax: Matplotlib axes to draw on.
        pca: A PCA object that has already been fitted; only ``transform``
            and ``explained_variance_ratio_`` are used.
        search_word: The (vocabulary) word being explored.
        words_to_plot: ``search_word`` first, followed by its neighbors in
            descending order of similarity.
    """
    # Whole vocabulary as light grey points, to give context for the PCA space.
    all_words = list(dummy_word_vectors)
    all_points = pca.transform(np.array([dummy_word_vectors[w] for w in all_words]))
    for word, (x, y) in zip(all_words, all_points):
        ax.scatter(x, y, color='lightgray', alpha=0.5, s=50)
        ax.text(x + 0.01, y + 0.01, word, fontsize=8, color='darkgray')

    # Search word (red diamond) and its neighbors (blue circles).
    selected_points = pca.transform(
        np.array([dummy_word_vectors[w] for w in words_to_plot])
    )
    for word, (x, y) in zip(words_to_plot, selected_points):
        is_search_word = word == search_word
        color = 'red' if is_search_word else 'blue'
        ax.scatter(x, y, color=color, label=word,
                   marker='D' if is_search_word else 'o',
                   s=150 if is_search_word else 100,
                   edgecolor='black', zorder=5)
        ax.text(x + 0.01, y + 0.01, word, fontsize=10,
                weight='bold' if is_search_word else 'normal',
                color=color, zorder=6)
        # Dashed line from the origin to the point (simulating a vector).
        ax.plot([0, x], [0, y], color=color, linestyle='--', linewidth=1, alpha=0.7)

    # Annotate the angle between the search word and its closest neighbor.
    if len(words_to_plot) > 1:
        search_point, nearest_point = selected_points[0], selected_points[1]
        # The angle comes from the original 4D vectors, not the 2D projection.
        sim_val = 1 - cosine(dummy_word_vectors[search_word],
                             dummy_word_vectors[words_to_plot[1]])
        # Clip to guard against float error pushing the value outside [-1, 1].
        angle_deg = np.degrees(np.arccos(np.clip(sim_val, -1.0, 1.0)))
        midpoint = (search_point + nearest_point) / 2
        ax.annotate(f"{angle_deg:.1f}°", xy=(midpoint[0], midpoint[1]),
                    xytext=(search_point[0] + 0.05, search_point[1] + 0.05),
                    arrowprops=dict(facecolor='black', shrink=0.05, width=0.5, headwidth=5),
                    fontsize=9, color='green', weight='bold')

    ax.set_title(f"2D Projection of '{search_word}' and its Nearest Neighbors")
    ax.set_xlabel(f"PCA Component 1 (explains {pca.explained_variance_ratio_[0]*100:.1f}%)")
    ax.set_ylabel(f"PCA Component 2 (explains {pca.explained_variance_ratio_[1]*100:.1f}%)")
    ax.grid(True, linestyle=':', alpha=0.6)
    ax.axhline(0, color='gray', linewidth=0.5)
    ax.axvline(0, color='gray', linewidth=0.5)
    ax.set_aspect('equal', adjustable='box')


def find_nearest_neighbors_and_plot(search_word_input):
    """Rank the vocabulary by cosine similarity to a word and plot the result.

    Args:
        search_word_input: Word typed by the user. Matching ignores case and
            surrounding whitespace; ``None`` is treated as an empty string.

    Returns:
        A ``(figure, table, message)`` tuple.

        * Word found in ``dummy_word_vectors``: a Matplotlib figure with the
          2D PCA projection, a DataFrame with columns ``"Neighbor Word"`` and
          ``"Similarity Score"`` (every other vocabulary word, sorted by
          descending similarity, rounded to 4 decimals), and a status message.
        * Word not found: ``None``, a one-row DataFrame with a ``"Message"``
          column listing the vocabulary, and a warning string.
    """
    # Tolerate None and stray whitespace so "Cat " still resolves to "cat".
    search_word = (search_word_input or "").strip().lower()

    if search_word not in dummy_word_vectors:
        vocabulary = ', '.join(dummy_word_vectors)
        return (
            None,  # No plot
            pd.DataFrame([{"Message": f"'{search_word}' not found in our dummy vocabulary. Try one of these: {vocabulary}"}]),
            "Warning: Word not found!"
        )

    target_vector = dummy_word_vectors[search_word]

    # Similarity of every other word to the search word (never to itself).
    similarities = [
        {"Word": word, "Cosine Similarity": 1 - cosine(target_vector, vector)}
        for word, vector in dummy_word_vectors.items()
        if word != search_word
    ]
    results_df = pd.DataFrame(
        similarities, columns=["Word", "Cosine Similarity"]
    ).sort_values(
        by="Cosine Similarity", ascending=False
    ).reset_index(drop=True)

    # Search word first, then its top N neighbors.
    top_n = 5
    words_to_plot = [search_word, *results_df["Word"].head(top_n)]

    # Fit PCA on the whole vocabulary so the 2D mapping is the same for every
    # search, keeping relative positions comparable across queries.
    pca = PCA(n_components=2)
    pca.fit(np.array(list(dummy_word_vectors.values())))

    fig, ax = plt.subplots(figsize=(8, 8))
    try:
        _draw_neighbor_projection(ax, pca, search_word, words_to_plot)
        # Use the figure's own method: plt.tight_layout() targets pyplot's
        # global "current figure", which may belong to another request.
        fig.tight_layout()
    finally:
        # Deregister the figure from pyplot so repeated requests do not pile
        # up open figures; the returned Figure object itself stays usable.
        plt.close(fig)

    # Format the DataFrame for better display in Gradio.
    results_df["Cosine Similarity"] = results_df["Cosine Similarity"].round(4)
    results_df.columns = ["Neighbor Word", "Similarity Score"]  # Rename for UI clarity

    message = (
        f"Found nearest neighbors for '{search_word}'! "
        "Red diamond is the search word, blue circles are its closest neighbors. "
        "The angle annotation shows the angle between the search word and its closest neighbor."
    )
    return fig, results_df, message
# --- Gradio Interface ---
# Single free-text input for the word to look up.
_word_box = gr.Textbox(
    label="Enter a word to explore its neighbors:",
    placeholder="e.g., cat, king, fish",
)

# Outputs, in the order returned by find_nearest_neighbors_and_plot:
# the figure, the neighbor table, and the status message.
_projection_plot = gr.Plot(label="Word Vector Visualization (PCA 2D)")
_neighbor_table = gr.DataFrame(
    headers=["Neighbor Word", "Similarity Score"],
    row_count=5,  # Display up to 5 rows by default
    wrap=True,
    interactive=False,
    label="Nearest Neighbors",
)
_status_text = gr.Markdown(label="Status")

# Intro text shown under the title (Markdown/HTML).
_description = "".join((
    "Type a word to see its nearest semantic neighbors in the vector space, along with a 2D visualization! ",
    "The angle between vectors on the plot is a visual representation of **Cosine Similarity** ",
    "(smaller angle = higher similarity). ",
    "<br>_Note: This POC uses dummy 4D word vectors projected to 2D using PCA. ",
    "In a full version, this would connect to a large pre-trained Word2Vec model!_",
))

iface = gr.Interface(
    fn=find_nearest_neighbors_and_plot,
    inputs=_word_box,
    outputs=[_projection_plot, _neighbor_table, _status_text],
    title="🚀 Word Vector Explorer: Visualize & Understand Cosine Similarity!",
    description=_description,
    allow_flagging="never",  # Optional: disables the "Flag" button
    examples=[[word] for word in ("cat", "king", "fish", "run")],
)

# Start the web UI only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()