hackersgame's picture
Upload fle.py with huggingface_hub
928e26f verified
#!/usr/bin/env python3
"""Free Language Embeddings — load and query V34 word vectors.
Usage:
python fle.py # interactive mode
python fle.py king - man + woman # single query
python fle.py --similar cat # nearest neighbors
Requires: fle_v34.npz (download from GitHub releases)
"""
import numpy as np
import sys
import os
# The vector archive is looked up in the same directory as this script,
# so the tool works regardless of the caller's current working directory
# as long as __file__ carries the script's directory.
EMBEDDINGS_FILE = os.path.join(
    os.path.dirname(__file__),
    "fle_v34.npz",
)
class FLE:
    """Free Language Embeddings — 100K words, 300d, V34 dynamic masking word2vec.

    Attributes:
        embeddings: 2-D array with one row vector per word, read from the
            ``"embeddings"`` entry of the archive.
        words: list of vocabulary entries; position ``i`` matches row ``i``.
        word2id: dict mapping each word to its row index.
    """

    def __init__(self, path=None):
        """Load vectors and vocabulary from an ``.npz`` archive.

        Args:
            path: location of the archive.  ``None`` (the default) selects
                the module-level ``EMBEDDINGS_FILE``, resolved at call time.
        """
        if path is None:
            path = EMBEDDINGS_FILE
        # SECURITY NOTE: allow_pickle=True lets NumPy unpickle object arrays,
        # and unpickling can execute arbitrary code.  Only load archives
        # obtained from a trusted source.
        # The context manager closes the underlying zip file once both
        # arrays have been read into memory.
        with np.load(path, allow_pickle=True) as data:
            self.embeddings = data["embeddings"]  # (100000, 300) float32
            self.words = list(data["words"])
        self.word2id = {w: i for i, w in enumerate(self.words)}
        self._normed = None  # lazily filled by the ``normed`` property

    @property
    def normed(self):
        """Row-wise L2-normalised copy of ``embeddings`` (computed once)."""
        if self._normed is None:
            norms = np.linalg.norm(self.embeddings, axis=1, keepdims=True)
            # Clamp the divisor so an all-zero row does not divide by zero.
            self._normed = self.embeddings / np.maximum(norms, 1e-8)
        return self._normed

    def __contains__(self, word):
        """Return True when ``word`` is in the vocabulary."""
        return word in self.word2id

    def __getitem__(self, word):
        """Return the raw (un-normalised) vector of ``word``.

        Raises:
            KeyError: if ``word`` is not in the vocabulary.
        """
        return self.embeddings[self.word2id[word]]

    def _nearest(self, vec, exclude, n):
        """Rank the vocabulary by dot product with ``vec``.

        Args:
            vec: query vector (callers pass a unit-length vector so the dot
                product with the normalised rows is a cosine similarity).
            exclude: iterable of in-vocabulary words that must not appear
                in the result (typically the query words themselves).
            n: maximum number of results; values <= 0 yield ``[]``.

        Returns:
            List of ``(word, score)`` tuples, best match first.
        """
        if n <= 0:
            return []
        sims = self.normed @ vec
        excluded = {self.word2id[w] for w in exclude}
        # -inf (rather than -1, which is a legitimate cosine value) puts
        # the excluded rows strictly last in the ranking.
        for idx in excluded:
            sims[idx] = -np.inf
        # Never reach into the excluded tail, even when n >= vocabulary size.
        limit = min(n, len(sims) - len(excluded))
        top = np.argsort(-sims)[:limit]
        return [(self.words[i], float(sims[i])) for i in top]

    def similar(self, word, n=10):
        """Find the ``n`` words most similar to ``word``.

        Returns:
            List of ``(word, cosine_similarity)`` tuples, or ``[]`` when
            ``word`` is not in the vocabulary.
        """
        if word not in self.word2id:
            return []
        vec = self.normed[self.word2id[word]]
        return self._nearest(vec, [word], n)

    def analogy(self, a, b, c, n=5):
        """Nearest words to ``a - b + c``: "b is to a as c is to ?".

        Example: ``analogy("king", "man", "woman")`` evaluates
        ``king - man + woman``.

        Returns:
            List of ``(word, cosine_similarity)`` tuples excluding ``a``,
            ``b`` and ``c``; ``[]`` when any of them is out of vocabulary.
        """
        if any(w not in self.word2id for w in (a, b, c)):
            return []
        normed = self.normed
        vec = (
            normed[self.word2id[a]]
            - normed[self.word2id[b]]
            + normed[self.word2id[c]]
        )
        # The epsilon keeps the division defined if the terms cancel out.
        vec = vec / (np.linalg.norm(vec) + 1e-8)
        return self._nearest(vec, [a, b, c], n)

    def similarity(self, a, b):
        """Cosine similarity between two words.

        Returns:
            The similarity as a float, or ``None`` when either word is not
            in the vocabulary.
        """
        if a not in self.word2id or b not in self.word2id:
            return None
        return float(self.normed[self.word2id[a]] @ self.normed[self.word2id[b]])

    def query(self, expression, n=10):
        """Evaluate a vector arithmetic expression like 'king - man + woman'.

        Tokens are whitespace-separated.  ``+`` and ``-`` set the sign of the
        next word; a word with no preceding operator is added.

        Args:
            expression: the expression text.
            n: maximum number of results to return (default 10).

        Returns:
            List of ``(word, cosine_similarity)`` tuples excluding the words
            used in the expression.  ``[]`` when the expression contains no
            words.  When a token is not in the vocabulary, a single-element
            list ``[("'<token>' not in vocabulary", 0.0)]`` is returned.
        """
        tokens = expression.strip().split()
        if not tokens:
            return []
        vec = np.zeros(self.embeddings.shape[1])
        sign = 1.0
        used = set()
        for token in tokens:
            if token == '+':
                sign = 1.0
            elif token == '-':
                sign = -1.0
            elif token in self.word2id:
                vec += sign * self.normed[self.word2id[token]]
                used.add(token)
                sign = 1.0  # an operator applies to one word only
            else:
                return [(f"'{token}' not in vocabulary", 0.0)]
        if not used:
            # Only operators were given: a zero vector has no meaningful
            # neighbours, so report nothing instead of arbitrary words.
            return []
        vec = vec / (np.linalg.norm(vec) + 1e-8)
        return self._nearest(vec, used, n)
def main():
    """Command-line entry point.

    With arguments, runs a single lookup and returns:
      * ``--similar [WORD]`` prints the 15 nearest neighbours of WORD
        (``cat`` when WORD is omitted);
      * anything else is joined with spaces and evaluated by ``FLE.query``.
    Without arguments, starts an interactive prompt that accepts
    ``similar WORD`` or an arithmetic expression until EOF / Ctrl-C.

    Exits with status 1 if the embeddings file is missing.
    """
    if not os.path.exists(EMBEDDINGS_FILE):
        print(f"Error: {EMBEDDINGS_FILE} not found.")
        print("Download from: https://github.com/ruapotato/Free-Language-Embeddings/releases")
        sys.exit(1)

    fle = FLE()
    print(f"Loaded {len(fle.words):,} words, {fle.embeddings.shape[1]}d")

    # CLI mode
    if len(sys.argv) > 1:
        if sys.argv[1] == "--similar":
            word = sys.argv[2] if len(sys.argv) > 2 else "cat"
            results = fle.similar(word, 15)
            # similar() returns [] for unknown words; say so instead of
            # printing nothing, matching the interactive mode below.
            if not results:
                print(f" '{word}' not in vocabulary")
            for w, s in results:
                print(f" {w:<20} {s:.4f}")
        else:
            expr = " ".join(sys.argv[1:])
            print(f" {expr}")
            for w, s in fle.query(expr):
                print(f" → {w:<20} {s:.4f}")
        return

    # Interactive mode
    print("\nExamples:")
    print(" king - man + woman")
    print(" similar cat")
    print(" paris - france + germany")
    print()
    while True:
        try:
            line = input("fle> ").strip()
        except (EOFError, KeyboardInterrupt):
            # Finish the prompt line before leaving the loop.
            print()
            break
        if not line:
            continue
        if line.startswith("similar "):
            # ``line`` is stripped, so a word always follows the prefix.
            word = line.split()[1]
            results = fle.similar(word, 15)
            if not results:
                print(f" '{word}' not in vocabulary")
            for w, s in results:
                print(f" {w:<20} {s:.4f}")
        else:
            for w, s in fle.query(line):
                print(f" {w:<20} {s:.4f}")
# Run the command-line interface only when this file is executed as a
# script; importing it as a module just exposes FLE and main().
if __name__ == "__main__":
    main()