# vectorizers_analysis / src /streamlit_app.py
# theformatisvalid's picture
# Update src/streamlit_app.py
# 0b9dd2a verified
import fasttext
import streamlit as st
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import plotly.express as px
import plotly.graph_objects as go
from collections import Counter
import os
import glob
class UnifiedVectorModel:
    """Uniform lookup/similarity interface over a Word2Vec or fastText backend.

    With ``model_type="w2v"`` the backend must expose a ``.wv`` attribute
    providing ``key_to_index``, ``vector_size``, item lookup and
    ``most_similar``. With ``model_type="ft"`` the backend must provide
    ``get_words()``, ``get_dimension()`` and ``get_word_vector(word)``.

    Attributes set by the constructor:
        model: the backend object as passed in.
        model_type: lower-cased backend kind, ``"w2v"`` or ``"ft"``.
        key_to_index: mapping word -> integer index.
        vector_size: dimensionality of the word vectors.
    """

    def __init__(self, backend_model, model_type="w2v"):
        """Wrap ``backend_model``.

        Raises:
            ValueError: if ``model_type`` is neither ``"w2v"`` nor ``"ft"``.
        """
        self.model = backend_model
        self.model_type = model_type.lower()
        if self.model_type == "w2v":
            self.wv = backend_model.wv
            self.key_to_index = self.wv.key_to_index
            self.vector_size = self.wv.vector_size
        elif self.model_type == "ft":
            # For fasttext-wheel: the vocabulary is exposed through get_words().
            self.key_to_index = {word: i for i, word in enumerate(backend_model.get_words())}
            self.vector_size = backend_model.get_dimension()
        else:
            raise ValueError("model_type must be 'w2v' or 'ft'")
        self._words = set(self.key_to_index)
        # Lazily filled caches (see the index_to_key / vectors properties).
        self._index_to_key = None
        self._cached_vectors = None
        self._cached_norms = None

    def __contains__(self, word):
        """Return True when ``word`` is part of the model vocabulary."""
        return word in self._words

    def __getitem__(self, word):
        """Return the vector of ``word`` as produced by the backend.

        For ``"w2v"`` the lookup goes through ``self.wv``; for ``"ft"`` it is
        delegated to ``get_word_vector`` without a vocabulary check.
        """
        if self.model_type == "w2v":
            return self.wv[word]
        elif self.model_type == "ft":
            return self.model.get_word_vector(word)

    @staticmethod
    def _as_key_list(keys):
        """Normalise a ``positive``/``negative`` argument.

        A bare string is treated as ONE word. Iterating it would split the
        word into characters and make the later exclusion test a substring
        test. Any falsy value becomes an empty list; other values are
        returned unchanged.
        """
        if isinstance(keys, str):
            return [keys] if keys else []
        if not keys:
            return []
        return keys

    def _cosine_to_vocab(self, vector):
        """Return cosine similarities between ``vector`` and every vocabulary row.

        Row norms are computed once and cached; zero norms are replaced by 1
        so that zero vectors get similarity 0 instead of NaN.
        """
        matrix = self.vectors
        if matrix.shape[0] == 0:
            return np.zeros(0)
        if self._cached_norms is None:
            norms = np.linalg.norm(matrix, axis=1)
            norms[norms == 0] = 1.0
            self._cached_norms = norms
        query = np.asarray(vector, dtype=float).ravel()
        query_norm = np.linalg.norm(query) or 1.0
        return (matrix @ query) / (self._cached_norms * query_norm)

    def _most_similar_ft(self, positive, negative, topn):
        """fastText branch of ``most_similar``: rank the vocabulary by cosine similarity."""
        vec = np.zeros(self.vector_size)
        # Words outside the vocabulary are silently skipped.
        for w in positive:
            if w in self:
                vec += self[w]
        for w in negative:
            if w in self:
                vec -= self[w]
        if np.allclose(vec, 0):
            return []
        words = self.index_to_key
        sims = self._cosine_to_vocab(vec)
        given = set(positive) | set(negative)
        # Take extra candidates so that dropping the query words still leaves topn.
        best = np.argsort(sims)[::-1][:topn + len(positive) + len(negative)]
        result = []
        for i in best:
            word = words[i]
            if word in given:
                continue
            result.append((word, float(sims[i])))
            if len(result) >= topn:
                break
        return result

    def most_similar(self, positive=None, negative=None, topn=10):
        """Return up to ``topn`` ``(word, similarity)`` pairs closest to the query.

        The query is the sum of the ``positive`` word vectors minus the sum of
        the ``negative`` ones. Either argument may be a list of words or a
        single word given as a string.

        Any exception raised while computing is printed and an empty list is
        returned instead, so callers must be prepared for ``[]``.
        """
        positive = self._as_key_list(positive)
        negative = self._as_key_list(negative)
        try:
            if self.model_type == "w2v":
                return self.wv.most_similar(positive=positive, negative=negative, topn=topn)
            elif self.model_type == "ft":
                return self._most_similar_ft(positive, negative, topn)
        except Exception as e:
            print(f"Error in most_similar: {e}")
            return []

    def similar_by_vector(self, vector, topn=10):
        """Return the ``topn`` vocabulary words most cosine-similar to ``vector``."""
        words = self.index_to_key
        sims = self._cosine_to_vocab(vector)
        best = np.argsort(sims)[::-1][:topn]
        return [(words[i], float(sims[i])) for i in best]

    def get_words(self):
        """Return the vocabulary as a new list, ordered by word index."""
        return list(self.index_to_key)

    @property
    def vectors(self):
        """Matrix of all word vectors; row ``i`` belongs to ``index_to_key[i]``.

        Built on first access and cached afterwards.
        """
        if self._cached_vectors is None:
            keys = self.index_to_key
            if keys:
                self._cached_vectors = np.array([self[w] for w in keys])
            else:
                self._cached_vectors = np.zeros((0, self.vector_size))
        return self._cached_vectors

    @property
    def index_to_key(self):
        """Vocabulary words sorted by their ``key_to_index`` value (cached)."""
        if self._index_to_key is None:
            self._index_to_key = sorted(self.key_to_index, key=self.key_to_index.__getitem__)
        return self._index_to_key
@st.cache_resource
def load_model(model_path):
    """Load an embedding model from disk and wrap it in ``UnifiedVectorModel``.

    The loader is picked from the file extension:

    * ``.model`` - a saved gensim ``Word2Vec`` model;
    * ``.bin``   - a fastText binary model;
    * ``.vec``   - word vectors in the word2vec text format, read with gensim
      ``KeyedVectors``.

    Args:
        model_path: path to the model file.

    Returns:
        The wrapped model, or ``None`` when loading fails; in that case the
        error is reported through ``st.error``.
    """
    try:
        if model_path.endswith(".model"):
            raw_model = Word2Vec.load(model_path)
            current_model = UnifiedVectorModel(raw_model, model_type="w2v")
        elif model_path.endswith(".bin"):
            raw_model = fasttext.load_model(model_path)
            current_model = UnifiedVectorModel(raw_model, model_type="ft")
        elif model_path.endswith(".vec"):
            # The model picker lists *.vec files, so they must be loadable too.
            from types import SimpleNamespace

            from gensim.models import KeyedVectors

            keyed_vectors = KeyedVectors.load_word2vec_format(model_path, binary=False)
            # The "w2v" wrapper only reads the backend through its ``.wv``
            # attribute, so a bare KeyedVectors is exposed under that name.
            raw_model = SimpleNamespace(wv=keyed_vectors)
            current_model = UnifiedVectorModel(raw_model, model_type="w2v")
        else:
            raise ValueError("wrong path format")
        return current_model
    except Exception as e:
        st.error(f"error loading model {model_path}: {e}")
        return None
MODELS_DIR = "models"

# Nothing can be shown without the models folder.
if not os.path.exists(MODELS_DIR):
    st.error(f"Folder `{MODELS_DIR}` not found.")
    st.stop()

# Collect every regular file matching one of the supported extensions.
model_files = [
    candidate
    for pattern in ("*.bin", "*.model", "*.vec")
    for candidate in glob.glob(os.path.join(MODELS_DIR, pattern))
    if os.path.isfile(candidate)
]
model_names = list(map(os.path.basename, model_files))

if not model_names:
    st.error(f"No models in folder `{MODELS_DIR}` (.bin, .model, .vec).")
    st.info("Supported formats: Word2Vec (binary/text), FastText.")
    st.stop()

# Let the user pick a model in the sidebar, then load it.
selected_model_name = st.sidebar.selectbox("Choose pretrained model", model_names)
selected_model_path = os.path.join(MODELS_DIR, selected_model_name)
st.sidebar.info(f"loading: `{selected_model_name}`")
model = load_model(selected_model_path)

# load_model returns None on failure and has already reported the error.
if model is None:
    st.stop()

st.sidebar.success(f"Model '{selected_model_name}' loaded")
st.sidebar.write(f"Voc size: {len(model.key_to_index):,}")
st.sidebar.write(f"Vector size: {model.vector_size}")
def analogy_accuracy(model, file_name):
    """Evaluate word analogies read from a text file.

    Each usable line holds exactly four whitespace-separated words
    ``a b c d``. The model is queried with ``positive=[a, c]``,
    ``negative=[b]`` and the analogy counts as solved when ``d`` is among the
    10 returned words. Lines with a different word count are ignored.

    A line is also skipped (not counted) when the query raises ``KeyError`` or
    yields no predictions at all, which is what the model wrapper returns
    when it cannot answer.

    Args:
        model: object with a ``most_similar(positive=, negative=, topn=)`` method.
        file_name: path to the UTF-8 analogy file.

    Returns:
        ``(accuracy, results)``: ``accuracy`` is solved / counted (0 when
        nothing was counted) and ``results`` is a list of dicts with keys
        ``query``, ``target``, ``predicted``, ``rank`` and ``in_top10``.
    """
    right = 0
    count = 0
    results = []
    with open(file_name, encoding='utf-8') as file:
        for line in file:
            words = line.strip().split()
            if len(words) != 4:
                continue
            try:
                most_similar = model.most_similar(positive=[words[0], words[2]], negative=[words[1]], topn=10)
            except KeyError:
                continue
            predicted = [x[0] for x in most_similar]
            if not predicted:
                # No answer at all: there is no top prediction to report.
                continue
            correct = words[3]
            if correct in predicted:
                rank = predicted.index(correct) + 1
                right += 1
            else:
                rank = None
            count += 1
            results.append({
                "query": f"{words[0]} - {words[1]} + {words[2]}",
                "target": correct,
                "predicted": predicted[0],
                "rank": rank,
                "in_top10": bool(rank)
            })
    accuracy = right / count if count > 0 else 0
    return accuracy, results
def avg_similarity(model, file_name):
    """Return the mean pairwise cosine similarity over word groups in a file.

    Every line of the UTF-8 file is a whitespace-separated group of words.
    For each group the cosine similarity of every unordered word pair is
    collected; the result is the mean over all collected pairs.

    Skipped lines: those with fewer than two words (blank lines included,
    which would otherwise hand an empty array to ``cosine_similarity``) and
    those where a word lookup raises ``KeyError``.

    Args:
        model: object supporting ``model[word]`` vector lookup.
        file_name: path to the word-group file.

    Returns:
        The mean similarity, or 0 when no pair was collected.
    """
    res = []
    with open(file_name, encoding='utf-8') as file:
        for line in file:
            words = line.strip().split()
            if len(words) < 2:
                continue
            try:
                vectors = [model[word] for word in words]
            except KeyError:
                continue
            sims = cosine_similarity(vectors)
            # Strict upper triangle = each unordered pair exactly once.
            res.extend(sims[np.triu_indices(len(words), k=1)])
    return sum(res) / len(res) if res else 0
def projection(word_vec, axis):
    """Return the scalar projection of ``word_vec`` onto the direction of ``axis``."""
    axis_length = np.linalg.norm(axis)
    unit_axis = axis / axis_length
    return np.dot(word_vec, unit_axis)
def get_projection_row(model, axis):
    """Project every vocabulary word onto ``axis`` and sort by projection.

    Args:
        model: object exposing ``key_to_index`` (iterated for the words) and
            ``model[word]`` vector lookup.
        axis: direction vector defining the semantic axis.

    Returns:
        A list of ``(word, projection)`` pairs in ascending projection order.
    """
    # Normalise the axis once instead of once per vocabulary word.
    axis_norm = axis / np.linalg.norm(axis)
    projections = [(word, np.dot(model[word], axis_norm)) for word in model.key_to_index]
    projections.sort(key=lambda pair: pair[1])
    return projections
# Page title and the five top-level tabs of the app.
st.title("Vector embeddings")
tab_titles = [
    "Vector ariphmetics",
    "Semantic consistency",
    "Semantic axis",
    "Distribution analysis",
    "Report",
]
tab1, tab2, tab3, tab4, tab5 = st.tabs(tab_titles)
# Tab 1: evaluate an expression such as "a - b + c" in vector space.
with tab1:
    st.header("Vector ariphmetics")
    expr = st.text_input("Insert expression", value="рубль - россия + сша")
    if st.button("Compute"):
        # Split into words and +/- signs; a sign stays in effect until the next sign.
        words = expr.replace('+', ' + ').replace('-', ' - ').split()
        positive, negative = [], []
        current = 'pos'
        for token in words:
            if token == '+':
                current = 'pos'
            elif token == '-':
                current = 'neg'
            else:
                (positive if current == 'pos' else negative).append(token)
        missing = [term for term in positive + negative if term not in model]
        if missing:
            # No st.stop() here: it would halt the whole script and leave the
            # tabs rendered further down empty.
            st.warning(f"Words not found in voc: {', '.join(missing)}")
        elif not (positive or negative):
            st.warning("Expression contains no words.")
        else:
            try:
                similar = model.most_similar(
                    positive=positive,
                    negative=negative,
                    topn=10
                )
                if not similar:
                    st.warning("No similar words found.")
                else:
                    st.write("### Result:")
                    result_words = [f"{w} ({s:.3f})" for w, s in similar]
                    st.write("Nearest words: " + ", ".join(result_words))
                    st.write("### In-between steps")
                    # Running sum of the expression, one term at a time.
                    cum_vec = np.zeros(model.vector_size)
                    steps_data = []
                    for i, term in enumerate(positive):
                        cum_vec += model[term]
                        nearest = model.most_similar(positive=positive[:i + 1], topn=1)
                        nearest_word, nearest_sim = nearest[0] if nearest else (None, None)
                        steps_data.append({
                            "step": f"+ {term}",
                            "nearest word": nearest_word,
                            "similarity": nearest_sim
                        })
                    for i, term in enumerate(negative):
                        cum_vec -= model[term]
                        nearest = model.most_similar(positive=positive, negative=negative[:i + 1], topn=1)
                        nearest_word, nearest_sim = nearest[0] if nearest else (None, None)
                        steps_data.append({
                            "step": f"- {term}",
                            "nearest word": nearest_word,
                            "similarity": nearest_sim
                        })
                    df_steps = pd.DataFrame(steps_data)
                    st.dataframe(df_steps[["step", "nearest word", "similarity"]])
                    result_word = similar[0][0]
                    fig = px.scatter(
                        x=[cum_vec[0]], y=[cum_vec[1]],
                        text=[result_word],
                        title="Result (first 2 components)"
                    )
                    fig.update_traces(textposition='top center', marker=dict(size=12, color='red'))
                    st.plotly_chart(fig)
            except Exception as e:
                st.error(f"Error computing: {e}")
# Tab 2: cosine similarity of two words plus a graph of their nearest neighbours.
with tab2:
    st.header("Similarity calculator")
    col1, col2 = st.columns(2)
    with col1:
        word1 = st.text_input("word 1", value="мужчина")
    with col2:
        word2 = st.text_input("word 2", value="женщина")
    if st.button("Compute similarity"):
        try:
            v1, v2 = model[word1], model[word2]
            sim = cosine_similarity([v1], [v2])[0][0]
            st.metric("Cosine similarity", f"{sim:.4f}")
            st.write("### Nearest neighbors graph")
            # Query each word once, passing it as a one-element list so that
            # it is never interpreted as a sequence of characters.
            neighbors1 = model.most_similar(positive=[word1], topn=5)
            neighbors2 = model.most_similar(positive=[word2], topn=5)
            neighbors = neighbors1 + neighbors2
            nodes = list(dict.fromkeys([word1, word2] + [n[0] for n in neighbors]))
            node_index = {node: i for i, node in enumerate(nodes)}
            edges = [(word1, n[0]) for n in neighbors1] + \
                    [(word2, n[0]) for n in neighbors2]
            G = go.Figure()
            # Node positions are random in [-1, 1); the layout carries no meaning.
            pos = np.random.rand(len(nodes), 2) * 2 - 1
            node_x = pos[:, 0]
            node_y = pos[:, 1]
            for edge in edges:
                x0, y0 = pos[node_index[edge[0]]]
                x1, y1 = pos[node_index[edge[1]]]
                G.add_trace(go.Scatter(x=[x0, x1], y=[y0, y1], mode='lines', line=dict(width=1, color='gray'), showlegend=False))
            G.add_trace(go.Scatter(x=node_x, y=node_y, mode='text+markers',
                                   marker=dict(size=10, color='lightblue'),
                                   text=nodes, textposition="top center"))
            G.update_layout(title="Semantic links graph", showlegend=False)
            st.plotly_chart(G)
        except KeyError as e:
            st.error(f"Word not found: {e}")
# Tab 3: project the whole vocabulary onto the axis "positive word - negative word".
with tab3:
    st.header("Semantic axis projection")
    col1, col2 = st.columns(2)
    with col1:
        pos_axis = st.text_input("positive", value="мужчина")
    with col2:
        neg_axis = st.text_input("negative", value="женщина")
    if st.button("Build axis"):
        try:
            pos_vec = model[pos_axis]
            neg_vec = model[neg_axis]
            axis = pos_vec - neg_vec
            if not np.any(axis):
                # A zero axis cannot be normalised; every projection would be NaN.
                st.warning("Axis is undefined: both words have the same vector.")
            else:
                projections = get_projection_row(model, axis)
                # projections is sorted ascending: the tail is the positive end.
                top_pos = projections[-10:][::-1]
                top_neg = projections[:10]
                st.write(f"Axis: **{pos_axis}{neg_axis}**")
                st.write("### Top 10 positive:")
                st.write(", ".join([f"{w} ({p:.3f})" for w, p in top_pos]))
                st.write("### Top 10 negative:")
                st.write(", ".join([f"{w} ({p:.3f})" for w, p in top_neg]))
                df_proj = pd.DataFrame(top_pos + top_neg, columns=["word", "projection"])
                fig = px.bar(df_proj, x="projection", y="word", orientation='h', title=f"Projection on axis: {pos_axis}{neg_axis}")
                st.plotly_chart(fig)
        except KeyError as e:
            st.error(f"Error: {e}")
# Tab 4: distribution of cosine similarities between randomly sampled words.
with tab4:
    st.header("Distance distribution analysis")
    all_vectors = model.vectors
    # Sampling without replacement cannot draw more rows than exist.
    sample_size = min(1000, all_vectors.shape[0])
    if sample_size < 2:
        st.warning("Not enough words in the vocabulary to build a distribution.")
    else:
        sample = all_vectors[np.random.choice(all_vectors.shape[0], sample_size, replace=False)]
        dists = cosine_similarity(sample)
        # Strict upper triangle: every unordered pair once, no self-similarity,
        # and negative similarities are kept instead of being filtered out.
        pair_rows, pair_cols = np.triu_indices(sample_size, k=1)
        flat_dists = dists[pair_rows, pair_cols]
        fig = px.histogram(flat_dists, nbins=50, title="Cosine similarity distribution between random words")
        st.plotly_chart(fig)
        st.metric("Mean similarity", f"{np.mean(flat_dists):.3f}")
        st.metric("Std deviation", f"{np.std(flat_dists):.3f}")
# Tab 5: summary report - analogy accuracy, synonym/antonym similarity,
# a similarity heatmap and a 2D t-SNE projection of user-chosen words.
with tab5:
    st.header("Report")
    st.subheader("1. Analogy rate")
    analogies_file = "data/analogy.txt"
    if os.path.exists(analogies_file):
        acc, results = analogy_accuracy(model, analogies_file)
        st.metric("Analogy accuracy (in top 10)", f"{acc:.2%}")
        st.dataframe(pd.DataFrame(results))
    else:
        st.warning("File `analogy.txt` not found.")
    st.subheader("2. Average synonyms similarity")
    sim_file = "data/synonyms.txt"
    if os.path.exists(sim_file):
        avg_sim = avg_similarity(model, sim_file)
        st.metric("Average similarity", f"{avg_sim:.4f}")
    else:
        # Name the file that was actually looked for.
        st.warning("File `synonyms.txt` not found.")
    st.subheader("3. Average antonyms similarity")
    sim_file = "data/antonyms.txt"
    if os.path.exists(sim_file):
        avg_sim = avg_similarity(model, sim_file)
        st.metric("Average similarity", f"{avg_sim:.4f}")
    else:
        st.warning("File `antonyms.txt` not found.")
    st.subheader("4. Heatmap for nearest words")
    query_words = st.text_input("Enter words", value="мужчина женщина мальчик девочка").split()
    if st.button("Build heatmap"):
        if not query_words:
            # cosine_similarity cannot be computed on an empty list of vectors.
            st.warning("Enter at least one word.")
        else:
            try:
                vectors = [model[w] for w in query_words]
                sims = cosine_similarity(vectors)
                fig = px.imshow(sims, x=query_words, y=query_words, color_continuous_scale="Blues", title="Similarity heatmap")
                st.plotly_chart(fig)
            except KeyError as e:
                st.error(f"Error: {e}")
    st.subheader("5. 2D projection")
    sample_words = st.text_input("Input words", value="мужчина женщина мальчик девочка")
    word_list = sample_words.split()
    if st.button("Show clusters"):
        if len(word_list) < 2:
            # perplexity is len - 1 and must stay positive.
            st.warning("Enter at least two words.")
        else:
            try:
                from sklearn.manifold import TSNE
                vectors = np.array([model[w] for w in word_list])
                tsne = TSNE(n_components=2, perplexity=len(vectors) - 1, random_state=42)
                embedded = tsne.fit_transform(vectors)
                fig = px.scatter(x=embedded[:, 0], y=embedded[:, 1], text=word_list, title="words projection")
                fig.update_traces(textposition='top center')
                st.plotly_chart(fig)
            except KeyError as e:
                st.error(f"Word not found: {e}")