#!/usr/bin/env python3
"""Free Language Embeddings — load and query V34 word vectors.
Usage:
python fle.py # interactive mode
python fle.py king - man + woman # single query
python fle.py --similar cat # nearest neighbors
Requires: fle_v34.npz (download from GitHub releases)
"""
import numpy as np
import sys
import os
EMBEDDINGS_FILE = os.path.join(os.path.dirname(__file__), "fle_v34.npz")
class FLE:
    """Free Language Embeddings — 100K words, 300d, V34 dynamic masking word2vec.

    Loads word vectors from an .npz archive and supports membership tests,
    raw-vector lookup, nearest-neighbor search, analogies, and free-form
    vector arithmetic over the vocabulary.
    """

    def __init__(self, path=None):
        """Load embeddings from *path* (defaults to EMBEDDINGS_FILE).

        Args:
            path: filesystem path to the .npz archive, or None for the
                module-level default. (Default is resolved at call time,
                not class-definition time.)
        """
        if path is None:
            path = EMBEDDINGS_FILE
        # NpzFile keeps the underlying file handle open; close it once the
        # arrays have been extracted instead of leaking it.
        with np.load(path, allow_pickle=True) as data:
            self.embeddings = data["embeddings"]  # (100000, 300) float32
            self.words = list(data["words"])
        self.word2id = {w: i for i, w in enumerate(self.words)}
        self._normed = None  # lazily-computed unit-norm rows (see `normed`)

    @property
    def normed(self):
        """Row-normalized embeddings, computed lazily and cached."""
        if self._normed is None:
            norms = np.linalg.norm(self.embeddings, axis=1, keepdims=True)
            # Clamp tiny norms so all-zero rows don't divide by zero.
            self._normed = self.embeddings / np.maximum(norms, 1e-8)
        return self._normed

    def __contains__(self, word):
        return word in self.word2id

    def __getitem__(self, word):
        """Raw (un-normalized) vector for *word*; raises KeyError if absent."""
        return self.embeddings[self.word2id[word]]

    def similar(self, word, n=10):
        """Return the n most similar words as (word, cosine) pairs; [] if OOV."""
        if word not in self.word2id:
            return []
        vec = self.normed[self.word2id[word]]
        sims = self.normed @ vec
        sims[self.word2id[word]] = -1  # exclude the query word itself
        top = np.argsort(-sims)[:n]
        return [(self.words[i], float(sims[i])) for i in top]

    def analogy(self, a, b, c, n=5):
        """a is to b as c is to ? (a - b + c); [] if any word is OOV."""
        for w in [a, b, c]:
            if w not in self.word2id:
                return []
        vec = self.normed[self.word2id[a]] - self.normed[self.word2id[b]] + self.normed[self.word2id[c]]
        vec = vec / (np.linalg.norm(vec) + 1e-8)
        sims = self.normed @ vec
        for w in [a, b, c]:
            sims[self.word2id[w]] = -1  # never suggest the input words
        top = np.argsort(-sims)[:n]
        return [(self.words[i], float(sims[i])) for i in top]

    def similarity(self, a, b):
        """Cosine similarity between two words, or None if either is OOV."""
        if a not in self.word2id or b not in self.word2id:
            return None
        return float(self.normed[self.word2id[a]] @ self.normed[self.word2id[b]])

    def query(self, expression):
        """Evaluate a vector arithmetic expression like 'king - man + woman'.

        Returns the top-10 (word, cosine) pairs excluding the words used in
        the expression, a single explanatory tuple for an out-of-vocabulary
        token, or [] when the expression contains no vocabulary words at all
        (an empty string or operators only).
        """
        tokens = expression.strip().split()
        if not tokens:
            return []
        vec = np.zeros(self.embeddings.shape[1], dtype=np.float32)
        sign = 1.0
        used = set()
        for token in tokens:
            if token == '+':
                sign = 1.0
            elif token == '-':
                sign = -1.0
            elif token in self.word2id:
                vec = vec + sign * self.normed[self.word2id[token]]
                used.add(token)
                sign = 1.0  # reset after consuming a word
            else:
                return [(f"'{token}' not in vocabulary", 0.0)]
        if not used:
            # Fix: operators-only input previously ranked against a zero
            # vector and returned 10 arbitrary words with score 0.0.
            return []
        vec = vec / (np.linalg.norm(vec) + 1e-8)
        sims = self.normed @ vec
        for w in used:
            sims[self.word2id[w]] = -1  # exclude the expression's own words
        top = np.argsort(-sims)[:10]
        return [(self.words[i], float(sims[i])) for i in top]
def main():
    """CLI entry point.

    With no args: interactive REPL. With ``--similar WORD``: nearest
    neighbors. Otherwise the args are joined into one vector-arithmetic
    expression. Exits with status 1 if the embeddings file is missing.
    """
    if not os.path.exists(EMBEDDINGS_FILE):
        print(f"Error: {EMBEDDINGS_FILE} not found.")
        print("Download from: https://github.com/ruapotato/Free-Language-Embeddings/releases")
        sys.exit(1)
    fle = FLE()
    print(f"Loaded {len(fle.words):,} words, {fle.embeddings.shape[1]}d")
    # CLI mode
    if len(sys.argv) > 1:
        if sys.argv[1] == "--similar":
            word = sys.argv[2] if len(sys.argv) > 2 else "cat"
            results = fle.similar(word, 15)
            if not results:
                # Fix: an OOV word previously printed nothing at all here,
                # unlike interactive mode which reports the problem.
                print(f" '{word}' not in vocabulary")
            for w, s in results:
                print(f" {w:<20} {s:.4f}")
        else:
            expr = " ".join(sys.argv[1:])
            print(f" {expr}")
            for w, s in fle.query(expr):
                print(f" → {w:<20} {s:.4f}")
        return
    # Interactive mode
    print("\nExamples:")
    print(" king - man + woman")
    print(" similar cat")
    print(" paris - france + germany")
    print()
    while True:
        try:
            line = input("fle> ").strip()
        except (EOFError, KeyboardInterrupt):
            print()  # leave the prompt on its own line before exiting
            break
        if not line:
            continue
        if line.startswith("similar "):
            word = line.split()[1]
            results = fle.similar(word, 15)
            if not results:
                print(f" '{word}' not in vocabulary")
            for w, s in results:
                print(f" {w:<20} {s:.4f}")
        else:
            for w, s in fle.query(line):
                print(f" {w:<20} {s:.4f}")


if __name__ == "__main__":
    main()