theformatisvalid commited on
Commit
d965fd8
·
verified ·
1 Parent(s): c5db1d5

Upload 63 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. core/core_tokenized +0 -0
  2. core/preprocessed_core.jsonl +0 -0
  3. data/analogy.txt +28 -0
  4. data/antonyms.txt +20 -0
  5. data/axis.txt +4 -0
  6. data/nearest_neighbors.txt +6 -0
  7. data/synonyms.txt +11 -0
  8. data/vocab.txt +0 -0
  9. main.py +452 -0
  10. models/w2v_v100_w10_m10_sg0.model +3 -0
  11. models/w2v_v100_w10_m10_sg1.model +3 -0
  12. models/w2v_v100_w10_m5_sg0.model +3 -0
  13. models/w2v_v100_w10_m5_sg1.model +3 -0
  14. models/w2v_v100_w10_m8_sg0.model +3 -0
  15. models/w2v_v100_w10_m8_sg1.model +3 -0
  16. models/w2v_v100_w5_m10_sg0.model +3 -0
  17. models/w2v_v100_w5_m10_sg1.model +3 -0
  18. models/w2v_v100_w5_m5_sg0.model +3 -0
  19. models/w2v_v100_w5_m5_sg1.model +3 -0
  20. models/w2v_v100_w5_m8_sg0.model +3 -0
  21. models/w2v_v100_w5_m8_sg1.model +3 -0
  22. models/w2v_v100_w8_m10_sg0.model +3 -0
  23. models/w2v_v100_w8_m10_sg1.model +3 -0
  24. models/w2v_v100_w8_m5_sg0.model +3 -0
  25. models/w2v_v100_w8_m5_sg1.model +3 -0
  26. models/w2v_v100_w8_m8_sg0.model +3 -0
  27. models/w2v_v100_w8_m8_sg1.model +3 -0
  28. models/w2v_v200_w10_m10_sg0.model +3 -0
  29. models/w2v_v200_w10_m10_sg1.model +3 -0
  30. models/w2v_v200_w10_m5_sg0.model +3 -0
  31. models/w2v_v200_w10_m5_sg1.model +3 -0
  32. models/w2v_v200_w10_m8_sg0.model +3 -0
  33. models/w2v_v200_w10_m8_sg1.model +3 -0
  34. models/w2v_v200_w5_m10_sg0.model +3 -0
  35. models/w2v_v200_w5_m10_sg1.model +3 -0
  36. models/w2v_v200_w5_m5_sg0.model +3 -0
  37. models/w2v_v200_w5_m5_sg1.model +3 -0
  38. models/w2v_v200_w5_m8_sg0.model +3 -0
  39. models/w2v_v200_w5_m8_sg1.model +3 -0
  40. models/w2v_v200_w8_m10_sg0.model +3 -0
  41. models/w2v_v200_w8_m10_sg1.model +3 -0
  42. models/w2v_v200_w8_m5_sg0.model +3 -0
  43. models/w2v_v200_w8_m5_sg1.model +3 -0
  44. models/w2v_v200_w8_m8_sg0.model +3 -0
  45. models/w2v_v200_w8_m8_sg1.model +3 -0
  46. models/w2v_v300_w10_m10_sg0.model +3 -0
  47. models/w2v_v300_w10_m10_sg1.model +3 -0
  48. models/w2v_v300_w10_m5_sg0.model +3 -0
  49. models/w2v_v300_w10_m5_sg1.model +3 -0
  50. models/w2v_v300_w10_m8_sg0.model +3 -0
core/core_tokenized ADDED
The diff for this file is too large to render. See raw diff
 
core/preprocessed_core.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
data/analogy.txt ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ рубль россия сша доллар
2
+ рубль россия европа евро
3
+ москва россия украина киев
4
+ москва россия сша вашингтон
5
+ москва россия япония токио
6
+ москва россия германия берлин
7
+ москва россия фрг берлин
8
+ москва россия сербия белград
9
+ москва россия китай пекин
10
+ москва россия венгрия будапешт
11
+ москва россия белоруссия минск
12
+ москва россия бельгия брюссель
13
+ москва россия таджикистан душанбе
14
+ москва россия франция париж
15
+ москва россия испания мадрид
16
+ москва россия финляндия хельсинки
17
+ отец мужчина женщина мать
18
+ сын мужчина женщина дочь
19
+ мальчик мужчина женщина девочка
20
+ актёр мужчина женщина актриса
21
+ родственник мужчина женщина родственница
22
+ артист мужчина женщина артистка
23
+ пассажир мужчина женщина пассажирка
24
+ пенсионер мужчина женщина пенсионерка
25
+ певец мужчина женщина певица
26
+ писатель мужчина женщина писательница
27
+ российский россия украина украинский
28
+ российский россия сша американский
data/antonyms.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ вопрос ответ
2
+ мужчина женщина
3
+ мальчик девочка
4
+ полный частичный
5
+ зима лето
6
+ чёрный белый
7
+ плюс минус
8
+ молодой пожилой
9
+ дорогой дешёвый
10
+ рождение смерть
11
+ день ночь
12
+ личный общественный
13
+ близкий дальний
14
+ выше ниже
15
+ свадьба развод
16
+ родиться умереть
17
+ экспорт импорт
18
+ запад восток
19
+ полный пустой
20
+ верхний нижний
data/axis.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ мужчина женщина
2
+ молодой пожилой
3
+ россия украина
4
+ запад восток
data/nearest_neighbors.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ мужчина
2
+ женщина
3
+ россия
4
+ человек
5
+ конфликт
6
+ контакт
data/synonyms.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ страна государство
2
+ найти обнаружить
3
+ сегмент часть фрагмент доля
4
+ разный различный
5
+ спорт физкультура
6
+ союз коалиция
7
+ просто легко
8
+ врач доктор
9
+ питаться есть
10
+ алкоголь спиртное
11
+ письмо сообщение
data/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
main.py ADDED
@@ -0,0 +1,452 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fasttext
2
+ import streamlit as st
3
+ import numpy as np
4
+ import pandas as pd
5
+ from gensim.models import Word2Vec
6
+ from sklearn.metrics.pairwise import cosine_similarity
7
+ import plotly.express as px
8
+ import plotly.graph_objects as go
9
+ from collections import Counter
10
+ import os
11
+ import glob
12
+
13
+
14
class UnifiedVectorModel:
    """Uniform wrapper over two embedding backends.

    Exposes a small, gensim-KeyedVectors-like surface (``in``, ``[]``,
    ``most_similar``, ``similar_by_vector``, ``vectors``, ``index_to_key``)
    over either:
      * a gensim ``Word2Vec`` model  (model_type="w2v"), or
      * a fasttext-wheel model       (model_type="ft").
    """

    def __init__(self, backend_model, model_type="w2v"):
        self.model = backend_model
        self.model_type = model_type.lower()

        if self.model_type == "w2v":
            self.wv = backend_model.wv
            self.key_to_index = self.wv.key_to_index
            self.vector_size = self.wv.vector_size
            self._words = set(self.wv.key_to_index.keys())
        elif self.model_type == "ft":
            # fasttext-wheel has no key_to_index; build one from get_words().
            self.key_to_index = {word: i for i, word in enumerate(backend_model.get_words())}
            self.vector_size = backend_model.get_dimension()
            self._words = set(self.key_to_index.keys())
        else:
            raise ValueError("model_type must be 'w2v' or 'ft'")

    def __contains__(self, word):
        return word in self._words

    def __getitem__(self, word):
        # w2v raises KeyError for OOV words; fasttext synthesizes a vector
        # from character n-grams and therefore never raises.
        if self.model_type == "w2v":
            return self.wv[word]
        elif self.model_type == "ft":
            return self.model.get_word_vector(word)

    @staticmethod
    def _cosine_sims(query, matrix):
        """Cosine similarity of *query* against every row of *matrix*.

        Pure-numpy replacement for sklearn's cosine_similarity (identical
        math); rows with zero norm, or a zero query, yield similarity 0
        instead of dividing by zero.
        """
        denom = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
        dots = matrix @ query
        safe = np.where(denom > 0, denom, 1.0)
        return np.where(denom > 0, dots / safe, 0.0)

    def most_similar(self, positive=None, negative=None, topn=10):
        """Top-*topn* (word, cosine similarity) pairs for the query vector
        sum(positive) - sum(negative).

        Matches the original contract: any internal failure is logged and an
        empty list is returned.  BUG FIX: a bare string passed as positive or
        negative is now treated as a single word on the ft path (the old code
        iterated the string character by character; callers do pass strings,
        e.g. ``model.most_similar(word1, topn=5)``).
        """
        if not positive:
            positive = []
        elif isinstance(positive, str):
            positive = [positive]
        if not negative:
            negative = []
        elif isinstance(negative, str):
            negative = [negative]

        try:
            if self.model_type == "w2v":
                # gensim implements the analogy arithmetic itself.
                return self.wv.most_similar(positive=positive, negative=negative, topn=topn)

            # ft path: accumulate the query vector from known words only.
            query = np.zeros(self.vector_size)
            for w in positive:
                if w in self:
                    query += self[w]
            for w in negative:
                if w in self:
                    query -= self[w]
            if np.allclose(query, 0):
                return []

            words = self.index_to_key      # cached, row-aligned with .vectors
            sims = self._cosine_sims(query, self.vectors)
            excluded = set(positive) | set(negative)
            result = []
            for idx in np.argsort(sims)[::-1]:
                candidate = words[idx]
                if candidate in excluded:  # never return a query word
                    continue
                result.append((candidate, float(sims[idx])))
                if len(result) >= topn:
                    break
            return result

        except Exception as e:
            print(f"Error in most_similar: {e}")
            return []

    def similar_by_vector(self, vector, topn=10):
        """Nearest *topn* vocabulary words to an arbitrary *vector*."""
        words = self.index_to_key
        sims = self._cosine_sims(np.asarray(vector), self.vectors)
        best = np.argsort(sims)[::-1][:topn]
        return [(words[i], float(sims[i])) for i in best]

    def get_words(self):
        """Vocabulary as a list (set-iteration order, as before)."""
        return list(self._words)

    def _ensure_cache(self):
        # Build the word list and the stacked vector matrix together, so
        # index_to_key and vectors are guaranteed index-aligned (the old code
        # built them in two places and relied on stable set iteration).
        if not hasattr(self, '_cached_vectors'):
            self._cached_words = list(self._words)
            self._cached_vectors = np.array([self[w] for w in self._cached_words])

    @property
    def vectors(self):
        """Matrix of all word vectors, rows aligned with index_to_key."""
        self._ensure_cache()
        return self._cached_vectors

    @property
    def index_to_key(self):
        """Word list aligned row-for-row with .vectors."""
        self._ensure_cache()
        return self._cached_words
115
+
116
+
117
@st.cache_resource
def load_model(model_path):
    """Load an embedding model from *model_path*, wrapped in UnifiedVectorModel.

    Supported extensions:
      .model -> gensim Word2Vec (full model)
      .bin   -> fasttext-wheel binary

    Returns None (after showing a Streamlit error) on any failure, so the
    caller can st.stop() gracefully instead of crashing the app.

    NOTE(review): the sidebar picker also globs *.vec files, but this loader
    rejects them -- selecting one surfaces the ValueError below; confirm
    whether .vec support was intended.
    """
    try:
        if model_path.endswith(".model"):
            raw_model = Word2Vec.load(model_path)
            current_model = UnifiedVectorModel(raw_model, model_type="w2v")
        elif model_path.endswith(".bin"):
            raw_model = fasttext.load_model(model_path)
            current_model = UnifiedVectorModel(raw_model, model_type="ft")
        else:
            # was: f"wrong path format" -- an f-string with no placeholder
            # and no hint about which file failed.
            raise ValueError(f"unsupported model format: {model_path}")
        return current_model
    except Exception as e:
        st.error(f"error loading model {model_path}: {e}")
        return None
133
+
134
+
135
MODELS_DIR = "models"

if not os.path.exists(MODELS_DIR):
    st.error(f"Folder `{MODELS_DIR}` not found.")
    st.stop()

# Collect candidate model files, keeping the extension groups in this order.
model_files = [
    path
    for pattern in ("*.bin", "*.model", "*.vec")
    for path in glob.glob(os.path.join(MODELS_DIR, pattern))
    if os.path.isfile(path)
]
model_names = [os.path.basename(path) for path in model_files]

if not model_names:
    st.error(f"No models in folder `{MODELS_DIR}` (.bin, .model, .vec).")
    st.info("Supported formats: Word2Vec (binary/text), FastText.")
    st.stop()

selected_model_name = st.sidebar.selectbox(
    "Choose pretrained model",
    model_names
)

selected_model_path = os.path.join(MODELS_DIR, selected_model_name)

st.sidebar.info(f"loading: `{selected_model_name}`")

model = load_model(selected_model_path)

# load_model returns None on failure (it has already shown the error).
if model is None:
    st.stop()
else:
    st.sidebar.success(f"Model '{selected_model_name}' loaded")
    st.sidebar.write(f"Voc size: {len(model.key_to_index):,}")
    st.sidebar.write(f"Vector size: {model.vector_size}")
169
+
170
def analogy_accuracy(model, file_name):
    """Evaluate word analogies of the form "a b c d" (a - b + c ~ d).

    Each well-formed 4-word line is one query; it counts as correct when the
    target word appears among the model's top-10 candidates.  Lines with a
    different word count, out-of-vocabulary queries, or empty model output
    are skipped and do not affect the denominator.

    Returns:
        (accuracy, results): accuracy is the fraction of scored queries whose
        target landed in the top 10; results is a list of per-query dicts
        with keys query/target/predicted/rank/in_top10.
    """
    right = 0
    count = 0
    results = []
    with open(file_name, encoding='utf-8') as file:
        for line in file:
            words = line.strip().split()
            if len(words) != 4:
                continue  # malformed line
            try:
                most_similar = model.most_similar(positive=[words[0], words[2]], negative=[words[1]], topn=10)
            except KeyError:
                continue  # a query word is out of vocabulary
            if not most_similar:
                # FIX: UnifiedVectorModel.most_similar returns [] on failure;
                # the old code then crashed with IndexError on predicted[0],
                # which the KeyError handler did not catch.
                continue
            predicted = [candidate for candidate, _ in most_similar]
            correct = words[3]
            rank = predicted.index(correct) + 1 if correct in predicted else None
            if rank is not None:
                right += 1
            count += 1
            results.append({
                "query": f"{words[0]} - {words[1]} + {words[2]}",
                "target": correct,
                "predicted": predicted[0],
                "rank": rank,
                "in_top10": rank is not None,
            })
    accuracy = right / count if count > 0 else 0
    return accuracy, results
200
+
201
+
202
def avg_similarity(model, file_name):
    """Average pairwise cosine similarity over word groups.

    Each line of *file_name* is a whitespace-separated word group (e.g. one
    synonym set).  A line is skipped entirely when any of its words raises
    KeyError in the model, or when it holds fewer than two words.

    FIX: a blank line used to reach cosine_similarity([]) and raise
    ValueError; it is now skipped.  The sklearn call is replaced by the
    numerically identical numpy computation, so this helper has no sklearn
    dependency.

    Returns the mean cosine similarity over all within-line pairs, or 0 when
    no pair could be scored.
    """
    pair_sims = []
    with open(file_name, encoding='utf-8') as file:
        for line in file:
            words = line.strip().split()
            if len(words) < 2:
                continue  # blank or single-word line: no pairs
            try:
                vectors = np.array([model[word] for word in words])
            except KeyError:
                continue  # whole line skipped if any word is unknown
            # Row-normalize; the Gram matrix then holds cosine similarities.
            norms = np.linalg.norm(vectors, axis=1, keepdims=True)
            norms[norms == 0] = 1.0  # zero vectors -> similarity 0, as sklearn does
            unit = vectors / norms
            sims = unit @ unit.T
            for i in range(len(words) - 1):
                for j in range(i + 1, len(words)):
                    pair_sims.append(sims[i][j])
    return sum(pair_sims) / len(pair_sims) if pair_sims else 0
216
+
217
+
218
def projection(word_vec, axis):
    """Return the scalar projection of *word_vec* onto the *axis* direction."""
    return np.dot(word_vec, axis / np.linalg.norm(axis))
221
+
222
+
223
def get_projection_row(model, axis):
    """Every vocabulary word scored by its projection onto *axis*, ascending."""
    scored = [(word, projection(model[word], axis)) for word in model.key_to_index]
    scored.sort(key=lambda pair: pair[1])
    return scored
228
+
229
+
230
st.title("Vector embeddings")

# Tab labels in display order; each tab is populated further below.
TAB_LABELS = [
    "Vector ariphmetics",
    "Semantic consistency",
    "Semantic axis",
    "Distribution analysis",
    "Report",
]
tab1, tab2, tab3, tab4, tab5 = st.tabs(TAB_LABELS)
239
+
240
with tab1:
    st.header("Vector ariphmetics")
    expr = st.text_input("Insert expression", value="рубль - россия + сша")

    if st.button("Compute"):
        # Tokenize the expression into words and +/- operators.
        words = expr.replace('+', ' + ').replace('-', ' - ').split()
        positive, negative = [], []
        current = 'pos'

        for w in words:
            if w == '+':
                current = 'pos'
            elif w == '-':
                current = 'neg'
            else:
                (positive if current == 'pos' else negative).append(w)

        missing = [w for w in positive + negative if w not in model]
        if missing:
            st.warning(f"Words not found in voc: {', '.join(missing)}")
            st.stop()

        try:
            similar = model.most_similar(
                positive=positive,
                negative=negative,
                topn=10
            )

            st.write("### Result:")
            result_words = [f"{w} ({s:.3f})" for w, s in similar]
            st.write("Nearest words: " + ", ".join(result_words))

            st.write("### In-between steps")

            cum_vec = np.zeros(model.vector_size)

            steps_data = []

            # BUG FIX: the original accumulated model[w], where w was the
            # leftover token from the parsing loop above -- every step added
            # the same (wrong) vector.  Use the word of the current step.
            for i in range(len(positive)):
                cum_vec += model[positive[i]]
                nearest = model.most_similar(positive=positive[:i + 1], topn=1)
                steps_data.append({
                    "step": f"+ {positive[i]}",
                    "nearest word": nearest[0][0],
                    "similarity": nearest[0][1]
                })

            for i in range(len(negative)):
                cum_vec -= model[negative[i]]
                nearest = model.most_similar(positive=positive, negative=negative[:i + 1], topn=1)
                steps_data.append({
                    "step": f"- {negative[i]}",
                    "nearest word": nearest[0][0],
                    "similarity": nearest[0][1]
                })

            df_steps = pd.DataFrame(steps_data)
            st.dataframe(df_steps[["step", "nearest word", "similarity"]])

            result_word = similar[0][0]
            # Crude 2-D view: just the first two raw vector components.
            fig = px.scatter(
                x=[cum_vec[0]], y=[cum_vec[1]],
                text=[result_word],
                title="Result (first 2 components)"
            )
            fig.update_traces(textposition='top center', marker=dict(size=12, color='red'))
            st.plotly_chart(fig)

        except Exception as e:
            st.error(f"Error computing: {e}")
311
+
312
with tab2:
    st.header("Similarity calculator")
    col1, col2 = st.columns(2)
    with col1:
        word1 = st.text_input("word 1", value="мужчина")
    with col2:
        word2 = st.text_input("word 2", value="женщина")

    if st.button("Compute similarity"):
        try:
            v1, v2 = model[word1], model[word2]
            sim = cosine_similarity([v1], [v2])[0][0]
            st.metric("Cosine similarity", f"{sim:.4f}")

            st.write("### Nearest neighbors graph")
            # Query each word's neighbors once and reuse the lists; the
            # original called most_similar twice per word for the same data.
            neighbors1 = model.most_similar(word1, topn=5)
            neighbors2 = model.most_similar(word2, topn=5)
            neighbors = neighbors1 + neighbors2
            nodes = list(set([word1, word2] + [n[0] for n in neighbors]))
            edges = [(word1, n[0]) for n in neighbors1] + \
                    [(word2, n[0]) for n in neighbors2]

            G = go.Figure()
            # Random 2-D layout in [-1, 1]; positions carry no semantics.
            pos = np.random.rand(len(nodes), 2) * 2 - 1
            node_x = pos[:, 0]
            node_y = pos[:, 1]

            for edge in edges:
                x0, y0 = pos[nodes.index(edge[0])]
                x1, y1 = pos[nodes.index(edge[1])]
                G.add_trace(go.Scatter(x=[x0, x1], y=[y0, y1], mode='lines', line=dict(width=1, color='gray'), showlegend=False))

            G.add_trace(go.Scatter(x=node_x, y=node_y, mode='text+markers',
                                   marker=dict(size=10, color='lightblue'),
                                   text=nodes, textposition="top center"))
            G.update_layout(title="Semantic links graph", showlegend=False)
            st.plotly_chart(G)

        except KeyError as e:
            st.error(f"Word not found: {e}")
350
+
351
with tab3:
    st.header("Semantic axis projection")
    col1, col2 = st.columns(2)
    with col1:
        pos_axis = st.text_input("positive", value="мужчина")
    with col2:
        neg_axis = st.text_input("negative", value="женщина")

    if st.button("Build axis"):
        try:
            # Axis points from the negative pole toward the positive one.
            axis = model[pos_axis] - model[neg_axis]

            ranked = get_projection_row(model, axis)  # ascending by projection
            top_pos = ranked[-10:][::-1]              # strongest positive first
            top_neg = ranked[:10]

            st.write(f"Axis: **{pos_axis} – {neg_axis}**")
            st.write("### Top 10 positive:")
            st.write(", ".join(f"{word} ({score:.3f})" for word, score in top_pos))

            st.write("### Top 10 negative:")
            st.write(", ".join(f"{word} ({score:.3f})" for word, score in top_neg))

            df_proj = pd.DataFrame(top_pos + top_neg, columns=["word", "projection"])
            fig = px.bar(df_proj, x="projection", y="word", orientation='h', title=f"Projection on axis: {pos_axis}–{neg_axis}")
            st.plotly_chart(fig)

        except KeyError as e:
            st.error(f"Error: {e}")
382
+
383
with tab4:
    st.header("Distance distribution analysis")
    all_vectors = model.vectors
    # FIX: np.random.choice(n, 1000, replace=False) raises ValueError when
    # the vocabulary holds fewer than 1000 words; cap the sample size.
    sample_size = min(1000, all_vectors.shape[0])
    sample = all_vectors[np.random.choice(all_vectors.shape[0], sample_size, replace=False)]

    dists = cosine_similarity(sample)
    np.fill_diagonal(dists, 0)
    flat_dists = dists.flatten()
    # NOTE(review): this filter removes the zeroed diagonal, but it also
    # drops genuinely negative/zero similarities, biasing the histogram and
    # the stats upward -- confirm this is intended.
    flat_dists = flat_dists[flat_dists > 0]

    fig = px.histogram(flat_dists, nbins=50, title="Cosine similarity distribution between random words")
    st.plotly_chart(fig)

    st.metric("Mean similarity", f"{np.mean(flat_dists):.3f}")
    st.metric("Std deviation", f"{np.std(flat_dists):.3f}")
398
+
399
with tab5:
    st.header("Report")

    st.subheader("1. Analogy rate")
    analogies_file = "data/analogy.txt"
    if os.path.exists(analogies_file):
        acc, results = analogy_accuracy(model, analogies_file)
        st.metric("Analogy accuracy (in top 10)", f"{acc:.2%}")
        st.dataframe(pd.DataFrame(results))
    else:
        st.warning("File `analogy.txt` not found.")

    st.subheader("2. Average synonyms similarity")
    sim_file = "data/synonyms.txt"
    if os.path.exists(sim_file):
        avg_sim = avg_similarity(model, sim_file)
        st.metric("Average similarity", f"{avg_sim:.4f}")
    else:
        # FIX: the warning used to name `similarity_words.txt`, a file this
        # block never checks.
        st.warning("File `synonyms.txt` not found.")

    st.subheader("3. Average antonyms similarity")
    sim_file = "data/antonyms.txt"
    if os.path.exists(sim_file):
        avg_sim = avg_similarity(model, sim_file)
        st.metric("Average similarity", f"{avg_sim:.4f}")
    else:
        # FIX: same wrong filename as above.
        st.warning("File `antonyms.txt` not found.")

    st.subheader("4. Heatmap for nearest words")
    query_words = st.text_input("Enter words", value="мужчина женщина мальчик девочка").split()
    if st.button("Build heatmap"):
        try:
            vectors = [model[w] for w in query_words]
            sims = cosine_similarity(vectors)
            fig = px.imshow(sims, x=query_words, y=query_words, color_continuous_scale="Blues", title="Similarity heatmap")
            st.plotly_chart(fig)
        except KeyError as e:
            st.error(f"Error: {e}")

    st.subheader("5. 2D projection")
    sample_words = st.text_input("Input words", value="мужчина женщина мальчик девочка")
    word_list = sample_words.split()
    if st.button("Show clusters"):
        try:
            from sklearn.manifold import TSNE
            vectors = np.array([model[w] for w in word_list])
            # t-SNE requires perplexity < n_samples; with too few words this
            # raises ValueError, now reported instead of crashing the app.
            tsne = TSNE(n_components=2, perplexity=len(vectors) - 1, random_state=42)
            embedded = tsne.fit_transform(vectors)

            fig = px.scatter(x=embedded[:, 0], y=embedded[:, 1], text=word_list, title="words projection")
            fig.update_traces(textposition='top center')
            st.plotly_chart(fig)
        except KeyError as e:
            st.error(f"Word not found: {e}")
        except ValueError as e:
            st.error(f"Error: {e}")
models/w2v_v100_w10_m10_sg0.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c35ee5be232fce591fcf28be6fde61bd4a90d75fc7262d7303adaf0b32806b0
3
+ size 3176561
models/w2v_v100_w10_m10_sg1.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95b361b695fcb9e78d2209b1ebd5ca2e8b3b024479e7085329d2f0b6444fc2fe
3
+ size 3176559
models/w2v_v100_w10_m5_sg0.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cf858300a2a007c6a56718fb112b4320009e75d967884b2d74a9c9567dcc0c7
3
+ size 5158115
models/w2v_v100_w10_m5_sg1.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1fe1d49f365e7fd8acc11f2988e9cb8ba04061badf74337c9b13618735908d1
3
+ size 5158113
models/w2v_v100_w10_m8_sg0.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c5bd023602c41ed6c6d512953246e23f7913848db8cf8495587c780681f7b8c
3
+ size 3790868
models/w2v_v100_w10_m8_sg1.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e4475e87311f026e925ab1ad9f816021f807570727f3b30256249032e4d85e7
3
+ size 3790868
models/w2v_v100_w5_m10_sg0.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3cd4410cd1063c041beed75d5e9a02c039aefe95a38157d2adc268014167584
3
+ size 3176557
models/w2v_v100_w5_m10_sg1.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6898fee7f31bb81a2c877cc3be3725eb4428ffe1650f876dc48737930aa5da1b
3
+ size 3176557
models/w2v_v100_w5_m5_sg0.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c550467186f9723753553e16288ee05e7e22b6fa55b5ede806d76c20d1badff6
3
+ size 5158113
models/w2v_v100_w5_m5_sg1.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:439758d8fe5b0efd096f70c6bf34c8f8f756d2ff3fd5be33c827b9fafe5087c6
3
+ size 5158113
models/w2v_v100_w5_m8_sg0.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a908b5b41ae2bc3cbe1b0688f9ed06422dc8e0cb3779c88a3129e0377364037
3
+ size 3790866
models/w2v_v100_w5_m8_sg1.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e698df12b684fb4bdc4ab2476ee29c07ddd3b51ca28b8b8a174e38d198f5fe5
3
+ size 3790868
models/w2v_v100_w8_m10_sg0.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a888d8a90f39928cee7539cc682a6c37d9139322d1e8d0a4f804cebba68fefd
3
+ size 3176559
models/w2v_v100_w8_m10_sg1.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8fb4add03702502739b77b70d172eceba480375874898829e90e56789286c2ee
3
+ size 3176557
models/w2v_v100_w8_m5_sg0.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9543f75d5113261d16ae3b13ed1e9904613be04dee0cdfb6c7b8f65841d422d3
3
+ size 5158111
models/w2v_v100_w8_m5_sg1.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:baaac6ffa83930c5f8a39ded46a61524d0afe07f37139a61e64a83e9f9a6cfca
3
+ size 5158111
models/w2v_v100_w8_m8_sg0.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64064be1788ab5e1b97c78dd0fb5159bb9ca6e2e886ab64f7e46d1c81392a7bc
3
+ size 3790866
models/w2v_v100_w8_m8_sg1.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6f366a3495c390add6716f65eb0c29c8779d2407f6e004abb3396a1254811c6
3
+ size 3790866
models/w2v_v200_w10_m10_sg0.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b83369f0f0ac975d2433d16c068e51eb1c21281eeadc2ccc99b95b73d7bb4bd8
3
+ size 6201360
models/w2v_v200_w10_m10_sg1.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a345190f56c1b26a52c9a123e6836f8be99dd5cfbc4f55779c5f9db9e38e5af
3
+ size 6201361
models/w2v_v200_w10_m5_sg0.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d79ad8709c9f181ad4ae2176c9e7e7b1d6ad0fd4ec1883c5de59c4bae0fda27c
3
+ size 10071716
models/w2v_v200_w10_m5_sg1.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e1c09904ca66331a49fabfe78ef2936b579e02da4e04679e1e0d29c59e74bc2
3
+ size 10071713
models/w2v_v200_w10_m8_sg0.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b0f536e8576e94fd3c815ce8b6f969768eea4b6dce1df1df71a6ca0af1e2c04
3
+ size 7401271
models/w2v_v200_w10_m8_sg1.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bddae5328e51ee1d6a37a4b1be0a8792a4a0b86f992ee1b0c8cc56c513eee136
3
+ size 7401268
models/w2v_v200_w5_m10_sg0.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa99e8e7ea983a3044ebe686f74dd875ba948cde92d251915784c11a8faec586
3
+ size 6201359
models/w2v_v200_w5_m10_sg1.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e55bc410d00ad641b52122d2d9cb436bb10af7129c7bfa36892c043bdced086
3
+ size 6201359
models/w2v_v200_w5_m5_sg0.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46ceb9f48cf980b11561c64c6e98f82be85afdbd9b3ad5d03bcee8a1d876c149
3
+ size 10071712
models/w2v_v200_w5_m5_sg1.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:717cd37fe973d2a76f615d5a093cd42be93d081ef2cea770aae4b53a1ff44d0d
3
+ size 10071711
models/w2v_v200_w5_m8_sg0.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d4a1eb433cbad61cd37743ec5846ca06938c1ad9f9ef8b27d6ec1740dd8d401
3
+ size 7401268
models/w2v_v200_w5_m8_sg1.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71174e2974a2f12b92f1a14a13b046eb26ee9a28cd0136ec5211ed151ccf7ff5
3
+ size 7401268
models/w2v_v200_w8_m10_sg0.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e48303d6345ec48580eb29913656335842f6db4cb0e524566a997ea10a105c09
3
+ size 6201357
models/w2v_v200_w8_m10_sg1.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ec3208cc8936180db231957865d412230531c76cc0891b8acb5ac51e4208c71
3
+ size 6201357
models/w2v_v200_w8_m5_sg0.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35189d2c165137323d2fcb7b58604903f5e1236a6be432d5a7378fb1c2773cf1
3
+ size 10071714
models/w2v_v200_w8_m5_sg1.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06f920ef0f10fb0b3995c1e35bb15027faae7d936ba6a54a114e5ac00a524a26
3
+ size 10071713
models/w2v_v200_w8_m8_sg0.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21c865d402a86cdbb35ee3f803428c77c3cb97845c2f7307ae941a59c31bce45
3
+ size 7401269
models/w2v_v200_w8_m8_sg1.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:247bf90c6c2b9c5305a7f9453af64ad655b5a26856f3634c7a023c2c2992759c
3
+ size 7401266
models/w2v_v300_w10_m10_sg0.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69df3844797167fcdeaf8753f0f274873e17b3f78f7041a506909c4ac02d7e39
3
+ size 9226167
models/w2v_v300_w10_m10_sg1.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb1492cdd7b3b6242b6d06307437182e30e0757173783c8934985b89a347f8a6
3
+ size 9226166
models/w2v_v300_w10_m5_sg0.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a72fada5eac351a9c5787a63111d145de5043f6ad686f0307575289730921544
3
+ size 14985321
models/w2v_v300_w10_m5_sg1.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe98dac1a0f0669e48f6c8943d1380e4d8f841c304929a95825c88f8ed41facc
3
+ size 14985318
models/w2v_v300_w10_m8_sg0.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:643b9eef53b5e85f73a70b155e85a0137ccb248bb20708945e699b244e803b31
3
+ size 11011674