theformatisvalid's picture
Upload 63 files
d965fd8 verified
raw
history blame
16.2 kB
import fasttext
import streamlit as st
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import plotly.express as px
import plotly.graph_objects as go
from collections import Counter
import os
import glob
class UnifiedVectorModel:
    """Uniform wrapper around gensim Word2Vec and fasttext-wheel models.

    Exposes one small interface — membership test, vector lookup,
    ``most_similar``, ``similar_by_vector`` — regardless of which backend
    is used, so the rest of the app never branches on the model type.
    """

    def __init__(self, backend_model, model_type="w2v"):
        """
        Args:
            backend_model: a trained gensim ``Word2Vec`` model ("w2v") or a
                model loaded with ``fasttext.load_model`` ("ft").
            model_type: "w2v" or "ft" (case-insensitive).

        Raises:
            ValueError: if ``model_type`` is not one of the supported values.
        """
        self.model = backend_model
        self.model_type = model_type.lower()
        if self.model_type == "w2v":
            self.wv = backend_model.wv
            self.key_to_index = self.wv.key_to_index
            self.vector_size = self.wv.vector_size
            self._words = set(self.wv.key_to_index.keys())
        elif self.model_type == "ft":
            # fasttext-wheel API: the vocabulary comes from get_words()
            self.key_to_index = {word: i for i, word in enumerate(backend_model.get_words())}
            self.vector_size = backend_model.get_dimension()
            self._words = set(self.key_to_index.keys())
        else:
            raise ValueError("model_type must be 'w2v' or 'ft'")

    def __contains__(self, word):
        return word in self._words

    def __getitem__(self, word):
        if self.model_type == "w2v":
            return self.wv[word]
        # "ft": fasttext can synthesize a vector even for OOV words
        return self.model.get_word_vector(word)

    @staticmethod
    def _as_word_list(words):
        # Accept a bare string where a list of words is expected; without
        # this, iterating a string would treat each CHARACTER as a word
        # on the fasttext path.
        if words is None:
            return []
        if isinstance(words, str):
            return [words]
        return list(words)

    def _cosine_to_all(self, vec):
        # Cosine similarity between `vec` and every vocabulary vector,
        # computed with numpy only (no per-call sklearn import needed).
        mat = self.vectors
        denom = np.linalg.norm(mat, axis=1) * np.linalg.norm(vec)
        denom[denom == 0] = 1.0  # zero vectors get similarity 0, not NaN
        return (mat @ vec) / denom

    def most_similar(self, positive=None, negative=None, topn=10):
        """Return up to ``topn`` (word, cosine_similarity) pairs for the
        query vector sum(positive) - sum(negative).

        Query words themselves are excluded from the result. Unknown words
        are silently skipped on the ft path; on any internal error the
        method prints the error and returns an empty list.
        """
        positive = self._as_word_list(positive)
        negative = self._as_word_list(negative)
        try:
            if self.model_type == "w2v":
                return self.wv.most_similar(positive=positive, negative=negative, topn=topn)
            # "ft": build the query vector manually, skipping unknown words.
            vec = np.zeros(self.vector_size)
            for w in positive:
                if w in self:
                    vec += self[w]
            for w in negative:
                if w in self:
                    vec -= self[w]
            if np.allclose(vec, 0):
                return []
            sims = self._cosine_to_all(vec)
            # Over-fetch so the query words can be filtered out below.
            best = np.argsort(sims)[::-1][:topn + len(positive) + len(negative)]
            exclude = set(positive) | set(negative)
            result = []
            words = self.index_to_key
            for i in best:
                if words[i] not in exclude:
                    result.append((words[i], float(sims[i])))
                    if len(result) >= topn:
                        break
            return result
        except Exception as e:
            # Best-effort API used by the UI: report and return empty.
            print(f"Error in most_similar: {e}")
            return []

    def similar_by_vector(self, vector, topn=10):
        """Return the ``topn`` (word, similarity) pairs nearest to ``vector``."""
        sims = self._cosine_to_all(np.asarray(vector, dtype=float))
        best = np.argsort(sims)[::-1][:topn]
        words = self.index_to_key
        return [(words[i], float(sims[i])) for i in best]

    def get_words(self):
        """Fresh list of all vocabulary words (order unspecified)."""
        return list(self._words)

    @property
    def vectors(self):
        # Full (vocab_size, vector_size) matrix; rows follow index_to_key.
        # Computed lazily and cached — building it is expensive.
        if not hasattr(self, '_cached_vectors'):
            self._cached_vectors = np.array([self[w] for w in self.index_to_key])
        return self._cached_vectors

    @property
    def index_to_key(self):
        # Stable word order matching the rows of `vectors`; cached once.
        if not hasattr(self, '_index_to_key'):
            self._index_to_key = list(self._words)
        return self._index_to_key
@st.cache_resource
def load_model(model_path):
    """Load a vector model from disk and wrap it in UnifiedVectorModel.

    Supported formats: ``.model`` (gensim Word2Vec), ``.bin`` (fasttext
    binary). Cached by Streamlit so a model is loaded at most once.
    Returns None (after showing a Streamlit error) on failure.
    """
    try:
        if model_path.endswith(".model"):
            raw_model = Word2Vec.load(model_path)
            return UnifiedVectorModel(raw_model, model_type="w2v")
        if model_path.endswith(".bin"):
            raw_model = fasttext.load_model(model_path)
            return UnifiedVectorModel(raw_model, model_type="ft")
        # Include the offending path in the message (the old f-string had
        # no placeholder at all).
        raise ValueError(f"unsupported model file format: {model_path}")
    except Exception as e:
        st.error(f"error loading model {model_path}: {e}")
        return None
# --- Model discovery and sidebar selection ---------------------------------
MODELS_DIR = "models"
if not os.path.exists(MODELS_DIR):
    st.error(f"Folder `{MODELS_DIR}` not found.")
    st.stop()

# Only list extensions load_model can actually open; `.vec` used to be
# globbed here but load_model raises on it, so offering it was a trap.
model_files = []
for ext in ["*.bin", "*.model"]:
    model_files.extend(glob.glob(os.path.join(MODELS_DIR, ext)))
model_files = [f for f in model_files if os.path.isfile(f)]
model_names = [os.path.basename(f) for f in model_files]

if not model_names:
    st.error(f"No models in folder `{MODELS_DIR}` (.bin, .model).")
    st.info("Supported formats: Word2Vec (.model), FastText (.bin).")
    st.stop()

selected_model_name = st.sidebar.selectbox(
    "Choose pretrained model",
    model_names
)
selected_model_path = os.path.join(MODELS_DIR, selected_model_name)
st.sidebar.info(f"loading: `{selected_model_name}`")

model = load_model(selected_model_path)
if model is None:
    st.stop()
st.sidebar.success(f"Model '{selected_model_name}' loaded")
st.sidebar.write(f"Voc size: {len(model.key_to_index):,}")
st.sidebar.write(f"Vector size: {model.vector_size}")
def analogy_accuracy(model, file_name):
    """Evaluate the model on a 4-word analogy file.

    Each line holds "a b c d", tested as: a - b + c ≈ d (positive = [a, c],
    negative = [b]).

    Args:
        model: object with a gensim-style ``most_similar`` method.
        file_name: path to a UTF-8 text file; malformed lines are skipped.

    Returns:
        (accuracy, results): accuracy is the share of scored lines whose
        target appears in the top-10 predictions; results is a per-line
        list of dicts (query, target, predicted, rank, in_top10).
    """
    right = 0
    count = 0
    results = []
    with open(file_name, encoding='utf-8') as file:
        for line in file:
            words = line.strip().split()
            if len(words) != 4:
                continue  # skip blank/malformed lines
            try:
                most_similar = model.most_similar(
                    positive=[words[0], words[2]], negative=[words[1]], topn=10
                )
            except KeyError:
                continue  # a query word is missing from the vocabulary
            predicted = [x[0] for x in most_similar]
            if not predicted:
                # most_similar may return [] (e.g. the fasttext backend
                # swallows errors) — don't index into an empty list.
                continue
            correct = words[3]
            rank = predicted.index(correct) + 1 if correct in predicted else None
            if rank is not None:
                right += 1
            count += 1
            results.append({
                "query": f"{words[0]} - {words[1]} + {words[2]}",
                "target": correct,
                "predicted": predicted[0],
                "rank": rank,
                "in_top10": rank is not None
            })
    accuracy = right / count if count > 0 else 0
    return accuracy, results
def avg_similarity(model, file_name):
    """Average pairwise cosine similarity over word groups in a file.

    Each line of ``file_name`` is a whitespace-separated group of words;
    every unordered pair within a group contributes one similarity. Lines
    containing any word missing from the vocabulary are skipped entirely,
    as are blank and single-word lines (they have no pairs — the previous
    version crashed on blank lines).

    Returns:
        float: mean similarity over all pairs, or 0 if none were scored.
    """
    res = []
    with open(file_name, encoding='utf-8') as file:
        for line in file:
            words = line.strip().split()
            if len(words) < 2:
                continue  # no pairs on this line
            try:
                vectors = np.array([model[word] for word in words], dtype=float)
            except KeyError:
                continue
            # Pairwise cosine similarity via normalized dot products
            # (numpy only — equivalent to sklearn's cosine_similarity).
            norms = np.linalg.norm(vectors, axis=1)
            norms[norms == 0] = 1.0
            unit = vectors / norms[:, None]
            sims = unit @ unit.T
            for i in range(len(words) - 1):
                for j in range(i + 1, len(words)):
                    res.append(sims[i][j])
    return sum(res) / len(res) if res else 0
def projection(word_vec, axis):
    """Scalar projection of ``word_vec`` onto the normalized ``axis``."""
    unit_axis = axis / np.linalg.norm(axis)
    return np.dot(word_vec, unit_axis)
def get_projection_row(model, axis):
    """Pair every vocabulary word with its scalar projection on ``axis``.

    Returns a list of (word, projection) tuples sorted ascending by the
    projection value.
    """
    unit_axis = axis / np.linalg.norm(axis)
    pairs = [(word, np.dot(model[word], unit_axis)) for word in model.key_to_index]
    pairs.sort(key=lambda item: item[1])
    return pairs
# Page title and the five analysis tabs; each tab body follows below.
st.title("Vector embeddings")
tab1, tab2, tab3, tab4, tab5 = st.tabs([
"Vector ariphmetics",
"Semantic consistency",
"Semantic axis",
"Distribution analysis",
"Report"
])
with tab1:
    st.header("Vector ariphmetics")
    expr = st.text_input("Insert expression", value="рубль - россия + сша")
    if st.button("Compute"):
        # Parse "a - b + c"-style expressions into positive/negative lists.
        words = expr.replace('+', ' + ').replace('-', ' - ').split()
        positive, negative = [], []
        current = 'pos'
        for w in words:
            if w == '+':
                current = 'pos'
            elif w == '-':
                current = 'neg'
            else:
                (positive if current == 'pos' else negative).append(w)
        missing = [w for w in positive + negative if w not in model]
        if missing:
            st.warning(f"Words not found in voc: {', '.join(missing)}")
            st.stop()
        try:
            similar = model.most_similar(
                positive=positive,
                negative=negative,
                topn=10
            )
            st.write("### Result:")
            result_words = [f"{w} ({s:.3f})" for w, s in similar]
            st.write("Nearest words: " + ", ".join(result_words))

            st.write("### In-between steps")
            cum_vec = np.zeros(model.vector_size)
            steps_data = []
            for i in range(len(positive)):
                # BUG FIX: accumulate the i-th positive word's vector —
                # the old code used the stale loop variable `w` left over
                # from the parsing loop above.
                cum_vec += model[positive[i]]
                nearest = model.most_similar(positive=positive[:i + 1], topn=1)
                steps_data.append({
                    "step": f"+ {positive[i]}",
                    "nearest word": nearest[0][0],
                    "similarity": nearest[0][1]
                })
            for i in range(len(negative)):
                cum_vec -= model[negative[i]]  # same fix for the negative part
                nearest = model.most_similar(positive=positive, negative=negative[:i + 1], topn=1)
                steps_data.append({
                    "step": f"- {negative[i]}",
                    "nearest word": nearest[0][0],
                    "similarity": nearest[0][1]
                })
            df_steps = pd.DataFrame(steps_data)
            st.dataframe(df_steps[["step", "nearest word", "similarity"]])

            result_word = similar[0][0]
            # NOTE: plots only the first two raw vector components, not a
            # proper 2D projection of the embedding space.
            fig = px.scatter(
                x=[cum_vec[0]], y=[cum_vec[1]],
                text=[result_word],
                title="Result (first 2 components)"
            )
            fig.update_traces(textposition='top center', marker=dict(size=12, color='red'))
            st.plotly_chart(fig)
        except Exception as e:
            st.error(f"Error computing: {e}")
with tab2:
    st.header("Similarity calculator")
    col1, col2 = st.columns(2)
    with col1:
        word1 = st.text_input("word 1", value="мужчина")
    with col2:
        word2 = st.text_input("word 2", value="женщина")
    if st.button("Compute similarity"):
        try:
            v1, v2 = model[word1], model[word2]
            sim = cosine_similarity([v1], [v2])[0][0]
            st.metric("Cosine similarity", f"{sim:.4f}")

            st.write("### Nearest neighbors graph")
            # BUG FIX: wrap each word in a list — the fasttext backend
            # iterates `positive`, so a bare string was treated as a
            # sequence of characters. Also query each word once instead
            # of twice (the old code repeated the most_similar calls).
            neighbors1 = model.most_similar(positive=[word1], topn=5)
            neighbors2 = model.most_similar(positive=[word2], topn=5)
            nodes = list(set([word1, word2] + [n[0] for n in neighbors1 + neighbors2]))
            edges = ([(word1, n[0]) for n in neighbors1] +
                     [(word2, n[0]) for n in neighbors2])

            G = go.Figure()
            # Random 2D layout in [-1, 1] for the graph nodes.
            pos = np.random.rand(len(nodes), 2) * 2 - 1
            node_x = pos[:, 0]
            node_y = pos[:, 1]
            for edge in edges:
                x0, y0 = pos[nodes.index(edge[0])]
                x1, y1 = pos[nodes.index(edge[1])]
                G.add_trace(go.Scatter(x=[x0, x1], y=[y0, y1], mode='lines',
                                       line=dict(width=1, color='gray'), showlegend=False))
            G.add_trace(go.Scatter(x=node_x, y=node_y, mode='text+markers',
                                   marker=dict(size=10, color='lightblue'),
                                   text=nodes, textposition="top center"))
            G.update_layout(title="Semantic links graph", showlegend=False)
            st.plotly_chart(G)
        except KeyError as e:
            st.error(f"Word not found: {e}")
with tab3:
    st.header("Semantic axis projection")
    col1, col2 = st.columns(2)
    with col1:
        pos_axis = st.text_input("positive", value="мужчина")
    with col2:
        neg_axis = st.text_input("negative", value="женщина")
    if st.button("Build axis"):
        try:
            # The semantic axis is the difference between the two poles.
            axis = model[pos_axis] - model[neg_axis]
            projections = get_projection_row(model, axis)
            # projections come back sorted ascending, so the head is the
            # most negative end and the tail the most positive.
            top_neg = projections[:10]
            top_pos = list(reversed(projections[-10:]))
            st.write(f"Axis: **{pos_axis}{neg_axis}**")
            st.write("### Top 10 positive:")
            st.write(", ".join(f"{w} ({p:.3f})" for w, p in top_pos))
            st.write("### Top 10 negative:")
            st.write(", ".join(f"{w} ({p:.3f})" for w, p in top_neg))
            df_proj = pd.DataFrame(top_pos + top_neg, columns=["word", "projection"])
            fig = px.bar(df_proj, x="projection", y="word", orientation='h',
                         title=f"Projection on axis: {pos_axis}{neg_axis}")
            st.plotly_chart(fig)
        except KeyError as e:
            st.error(f"Error: {e}")
with tab4:
    st.header("Distance distribution analysis")
    all_vectors = model.vectors
    # BUG FIX: never request more samples than the vocabulary holds —
    # np.random.choice(..., replace=False) raises when the sample size
    # exceeds the population.
    sample_size = min(1000, all_vectors.shape[0])
    sample = all_vectors[np.random.choice(all_vectors.shape[0], sample_size, replace=False)]
    dists = cosine_similarity(sample)
    np.fill_diagonal(dists, 0)
    flat_dists = dists.flatten()
    # Drop the zeroed diagonal; note this also drops non-positive
    # similarities, matching the original behavior.
    flat_dists = flat_dists[flat_dists > 0]
    fig = px.histogram(flat_dists, nbins=50, title="Cosine similarity distribution between random words")
    st.plotly_chart(fig)
    st.metric("Mean similarity", f"{np.mean(flat_dists):.3f}")
    st.metric("Std deviation", f"{np.std(flat_dists):.3f}")
with tab5:
    st.header("Report")

    st.subheader("1. Analogy rate")
    analogies_file = "data/analogy.txt"
    if os.path.exists(analogies_file):
        acc, results = analogy_accuracy(model, analogies_file)
        st.metric("Analogy accuracy (in top 10)", f"{acc:.2%}")
        st.dataframe(pd.DataFrame(results))
    else:
        st.warning("File `analogy.txt` not found.")

    st.subheader("2. Average synonyms similarity")
    sim_file = "data/synonyms.txt"
    if os.path.exists(sim_file):
        avg_sim = avg_similarity(model, sim_file)
        st.metric("Average similarity", f"{avg_sim:.4f}")
    else:
        # BUG FIX: the warning used to name a non-existent file
        # ("similarity_words.txt") instead of the one actually checked.
        st.warning("File `synonyms.txt` not found.")

    st.subheader("3. Average antonyms similarity")
    sim_file = "data/antonyms.txt"
    if os.path.exists(sim_file):
        avg_sim = avg_similarity(model, sim_file)
        st.metric("Average similarity", f"{avg_sim:.4f}")
    else:
        st.warning("File `antonyms.txt` not found.")  # fixed file name

    st.subheader("4. Heatmap for nearest words")
    query_words = st.text_input("Enter words", value="мужчина женщина мальчик девочка").split()
    if st.button("Build heatmap"):
        try:
            vectors = [model[w] for w in query_words]
            sims = cosine_similarity(vectors)
            fig = px.imshow(sims, x=query_words, y=query_words,
                            color_continuous_scale="Blues", title="Similarity heatmap")
            st.plotly_chart(fig)
        except KeyError as e:
            st.error(f"Error: {e}")

    st.subheader("5. 2D projection")
    sample_words = st.text_input("Input words", value="мужчина женщина мальчик девочка")
    word_list = sample_words.split()
    if st.button("Show clusters"):
        try:
            from sklearn.manifold import TSNE
            vectors = np.array([model[w] for w in word_list])
            if len(vectors) < 2:
                # BUG FIX: TSNE requires perplexity >= 1, i.e. at least
                # two input words; the old code crashed on a single word.
                st.warning("Need at least 2 words for a 2D projection.")
            else:
                tsne = TSNE(n_components=2, perplexity=len(vectors) - 1, random_state=42)
                embedded = tsne.fit_transform(vectors)
                fig = px.scatter(x=embedded[:, 0], y=embedded[:, 1], text=word_list,
                                 title="words projection")
                fig.update_traces(textposition='top center')
                st.plotly_chart(fig)
        except KeyError as e:
            st.error(f"Word not found: {e}")