#!/usr/bin/env python3
import gensim.downloader as api
import numpy as np

# Load the pre-trained Word2Vec model (word2vec-google-news-300).
# NOTE: downloads ~1.6 GB via gensim-data on the first run (requires
# network access) and caches it locally; subsequent runs load from disk.
model = api.load('word2vec-google-news-300')
# Function to get the L2 norm of the embedding for a word or phrase
def get_word_norm(word, kv=None):
    """Return the L2 (Euclidean) norm of the embedding for *word*.

    Multi-word phrases are split on whitespace; the vectors of the
    in-vocabulary sub-words are averaged before the norm is taken.
    Out-of-vocabulary sub-words produce a printed warning and are skipped.

    Parameters
    ----------
    word : str
        A single word or a space-separated phrase.
    kv : mapping, optional
        Any mapping from word to vector supporting ``in`` and ``[]``
        (e.g. gensim ``KeyedVectors`` or a plain dict). Defaults to the
        module-level ``model``; accepting it as a parameter makes the
        function reusable and testable without the big download.

    Returns
    -------
    float
        The L2 norm of the (averaged) embedding.

    Raises
    ------
    ValueError
        If none of the sub-words are found in the vocabulary.
    """
    vocab = model if kv is None else kv
    vectors = []
    for sub in word.split():  # Split on whitespace to handle phrases
        if sub in vocab:
            vectors.append(vocab[sub])
        else:
            print(f"Warning: '{sub}' not in vocabulary.")
    if not vectors:
        raise ValueError(f"No vectors found for '{word}' in model.")
    # Average the sub-word vectors, then take the Euclidean norm.
    embedding = np.mean(vectors, axis=0)
    return float(np.linalg.norm(embedding))
# Function to score category-likeness (higher = more category-like)
def category_score(norm, scale_factor=1.0):
    """Convert an embedding norm into a category-likeness score.

    Lower norms tend to belong to broader, more category-like terms, so
    the score is the reciprocal of the norm, scaled by *scale_factor*.
    """
    inverse_norm = scale_factor / norm
    return inverse_norm
# Words/phrases to evaluate, ordered roughly from general to specific.
words = ['animal', 'cat', 'mammal', 'siamese', 'thing', 'eiffel tower']

# Compute norm and score per word; words with no in-vocabulary
# sub-words raise ValueError and are reported then skipped.
results = {}
for term in words:
    try:
        term_norm = get_word_norm(term)
    except ValueError as err:
        print(err)
        continue
    results[term] = {'norm': term_norm, 'score': category_score(term_norm)}

# Report the results, most category-like (highest score) first.
ranking = sorted(results.items(), key=lambda item: item[1]['score'], reverse=True)
for term, data in ranking:
    print(f"Word: {term}\tNorm: {data['norm']:.4f}\tCategory Score: {data['score']:.4f}")