File size: 4,823 Bytes
30e273c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
928e26f
30e273c
 
 
928e26f
30e273c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/bin/env python3
"""Free Language Embeddings — load and query V34 word vectors.

Usage:
    python fle.py                        # interactive mode
    python fle.py king - man + woman     # single query
    python fle.py --similar cat          # nearest neighbors

Requires: fle_v34.npz (download from GitHub releases)
"""

import numpy as np
import sys
import os

EMBEDDINGS_FILE = os.path.join(os.path.dirname(__file__), "fle_v34.npz")


class FLE:
    """Free Language Embeddings — 100K words, 300d, V34 dynamic masking word2vec."""

    def __init__(self, path=EMBEDDINGS_FILE):
        """Load embeddings and vocabulary from the .npz archive at *path*."""
        data = np.load(path, allow_pickle=True)
        self.embeddings = data["embeddings"]  # (100000, 300) float32
        self.words = list(data["words"])
        self.word2id = {w: i for i, w in enumerate(self.words)}
        self._normed = None  # unit-norm rows, computed lazily by `normed`

    @property
    def normed(self):
        """Row-normalized embedding matrix, computed once on first access."""
        if self._normed is None:
            norms = np.linalg.norm(self.embeddings, axis=1, keepdims=True)
            # Clamp near-zero norms so an all-zero row cannot divide by zero.
            self._normed = self.embeddings / np.maximum(norms, 1e-8)
        return self._normed

    def __contains__(self, word):
        return word in self.word2id

    def __getitem__(self, word):
        # dict-like access; raises KeyError for out-of-vocabulary words.
        return self.embeddings[self.word2id[word]]

    def _rank(self, vec, exclude, n):
        """Return the n (word, cosine) pairs most similar to unit vector *vec*,
        skipping the word ids in *exclude* (typically the query words)."""
        sims = self.normed @ vec
        for i in exclude:
            sims[i] = -1  # below any achievable cosine, so never selected
        top = np.argsort(-sims)[:n]
        return [(self.words[i], float(sims[i])) for i in top]

    def similar(self, word, n=10):
        """Find the n most similar words; [] if *word* is out of vocabulary."""
        if word not in self.word2id:
            return []
        idx = self.word2id[word]
        return self._rank(self.normed[idx], {idx}, n)

    def analogy(self, a, b, c, n=5):
        """Return the n words nearest to the vector a - b + c.

        Example: analogy("king", "man", "woman") surfaces "queen"-like
        words (king - man + woman). Returns [] if any of a, b, c is out
        of vocabulary.
        """
        if any(w not in self.word2id for w in (a, b, c)):
            return []
        ia, ib, ic = (self.word2id[w] for w in (a, b, c))
        vec = self.normed[ia] - self.normed[ib] + self.normed[ic]
        vec = vec / (np.linalg.norm(vec) + 1e-8)
        return self._rank(vec, {ia, ib, ic}, n)

    def similarity(self, a, b):
        """Cosine similarity between two words; None if either is unknown."""
        if a not in self.word2id or b not in self.word2id:
            return None
        return float(self.normed[self.word2id[a]] @ self.normed[self.word2id[b]])

    def query(self, expression):
        """Evaluate a vector arithmetic expression like 'king - man + woman'.

        Tokens are whitespace-separated; '+'/'-' set the sign applied to
        the next word. Returns the 10 nearest words, [] for an expression
        containing no words, or a single message tuple for an unknown word.
        """
        tokens = expression.strip().split()
        if not tokens:
            return []

        vec = np.zeros(self.embeddings.shape[1])
        sign = 1.0
        used = set()
        for token in tokens:
            if token == '+':
                sign = 1.0
            elif token == '-':
                sign = -1.0
            elif token in self.word2id:
                vec += sign * self.normed[self.word2id[token]]
                used.add(token)
                sign = 1.0  # a sign applies only to the word it precedes
            else:
                return [(f"'{token}' not in vocabulary", 0.0)]

        if not used:
            # Operators only (e.g. "+ -"): there is no direction to rank by,
            # so don't return 10 arbitrary words at similarity 0.0.
            return []

        vec = vec / (np.linalg.norm(vec) + 1e-8)
        return self._rank(vec, {self.word2id[w] for w in used}, 10)


def main():
    """Entry point: load the embeddings, then run one CLI query or an
    interactive read-eval-print loop.

    Exits with status 1 if the embeddings file is missing.
    """
    if not os.path.exists(EMBEDDINGS_FILE):
        print(f"Error: {EMBEDDINGS_FILE} not found.")
        print("Download from: https://github.com/ruapotato/Free-Language-Embeddings/releases")
        sys.exit(1)

    fle = FLE()
    print(f"Loaded {len(fle.words):,} words, {fle.embeddings.shape[1]}d")

    # CLI mode: arguments on the command line, run one query and exit.
    if len(sys.argv) > 1:
        if sys.argv[1] == "--similar":
            word = sys.argv[2] if len(sys.argv) > 2 else "cat"
            results = fle.similar(word, 15)
            if not results:
                # Consistent with interactive mode: report unknown words
                # instead of silently printing nothing.
                print(f"  '{word}' not in vocabulary")
            for w, s in results:
                print(f"  {w:<20} {s:.4f}")
        else:
            expr = " ".join(sys.argv[1:])
            print(f"  {expr}")
            for w, s in fle.query(expr):
                print(f"  → {w:<20} {s:.4f}")
        return

    # Interactive mode
    print("\nExamples:")
    print("  king - man + woman")
    print("  similar cat")
    print("  paris - france + germany")
    print()

    while True:
        try:
            line = input("fle> ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C ends the session cleanly.
            print()
            break

        if not line:
            continue

        if line.startswith("similar "):
            word = line.split()[1]
            results = fle.similar(word, 15)
            if not results:
                print(f"  '{word}' not in vocabulary")
            for w, s in results:
                print(f"  {w:<20} {s:.4f}")
        else:
            # Anything else is treated as a vector-arithmetic expression.
            for w, s in fle.query(line):
                print(f"  {w:<20} {s:.4f}")


# Run the tool only when executed as a script, not on import.
if __name__ == "__main__":
    main()