|
|
import fasttext |
|
|
import streamlit as st |
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
from gensim.models import Word2Vec |
|
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
import plotly.express as px |
|
|
import plotly.graph_objects as go |
|
|
from collections import Counter |
|
|
import os |
|
|
import glob |
|
|
|
|
|
|
|
|
class UnifiedVectorModel:
    """Uniform wrapper over a gensim Word2Vec model ('w2v') or a native
    fastText model ('ft').

    Exposes a small common interface regardless of backend: `word in model`,
    `model[word]`, `most_similar`, `similar_by_vector`, plus cached
    vocabulary arrays (`vectors`, `index_to_key`).
    """

    def __init__(self, backend_model, model_type="w2v"):
        """
        Args:
            backend_model: gensim Word2Vec instance (model_type='w2v') or
                a fasttext model object (model_type='ft').
            model_type: 'w2v' or 'ft' (case-insensitive).

        Raises:
            ValueError: if model_type is neither 'w2v' nor 'ft'.
        """
        self.model = backend_model
        self.model_type = model_type.lower()

        if self.model_type == "w2v":
            self.wv = backend_model.wv
            self.key_to_index = self.wv.key_to_index
            self.vector_size = self.wv.vector_size
            self._words = set(self.wv.key_to_index.keys())
        elif self.model_type == "ft":
            self.key_to_index = {word: i for i, word in enumerate(backend_model.get_words())}
            self.vector_size = backend_model.get_dimension()
            self._words = set(self.key_to_index.keys())
        else:
            raise ValueError("model_type must be 'w2v' or 'ft'")

    def __contains__(self, word):
        """Return True if *word* is in the vocabulary."""
        return word in self._words

    def __getitem__(self, word):
        """Return the embedding vector for *word*.

        'w2v' raises KeyError on OOV words; 'ft' synthesizes a subword
        vector and never raises.
        """
        if self.model_type == "w2v":
            return self.wv[word]
        return self.model.get_word_vector(word)

    @staticmethod
    def _cosine_sims(query, matrix):
        """Cosine similarity of one query vector against each row of *matrix*.

        Pure numpy replacement for sklearn's cosine_similarity for the
        1-vs-many case; zero-norm vectors yield similarity 0 (matching
        sklearn's normalize() behavior).
        """
        query = np.asarray(query, dtype=float)
        matrix = np.asarray(matrix, dtype=float)
        q_norm = np.linalg.norm(query)
        denom = np.linalg.norm(matrix, axis=1) * (q_norm if q_norm else 1.0)
        denom[denom == 0] = 1.0  # avoid division by zero; those sims become 0
        return matrix @ query / denom

    def most_similar(self, positive=None, negative=None, topn=10):
        """Return up to *topn* (word, similarity) pairs nearest to
        sum(positive) - sum(negative).

        Either argument may be a single word string (gensim allows this;
        previously the 'ft' branch iterated a string's *characters*).
        Query words are excluded from the result.  Returns [] on failure
        or when the query vector is zero.
        """
        # Normalize arguments to lists of words.
        if not positive:
            positive = []
        elif isinstance(positive, str):
            positive = [positive]
        if not negative:
            negative = []
        elif isinstance(negative, str):
            negative = [negative]

        try:
            if self.model_type == "w2v":
                return self.wv.most_similar(positive=positive, negative=negative, topn=topn)

            # fastText branch: build the query vector explicitly, skipping
            # out-of-vocabulary words.
            vec = np.zeros(self.vector_size)
            for w in positive:
                if w in self:
                    vec += self[w]
            for w in negative:
                if w in self:
                    vec -= self[w]

            if np.allclose(vec, 0):
                return []

            # Use the cached vocabulary matrix instead of rebuilding it
            # on every call (the old code re-embedded the whole vocab here).
            words = self.index_to_key
            sims = self._cosine_sims(vec, self.vectors)
            # Over-fetch so that excluding query words cannot shrink the result.
            best = np.argsort(sims)[::-1][:topn + len(positive) + len(negative)]

            result = []
            for i in best:
                word = words[i]
                if word not in positive and word not in negative:
                    result.append((word, float(sims[i])))
                    if len(result) >= topn:
                        break
            return result

        except Exception as e:
            print(f"Error in most_similar: {e}")
            return []

    def similar_by_vector(self, vector, topn=10):
        """Return the *topn* (word, similarity) pairs nearest to *vector*."""
        words = self.index_to_key
        sims = self._cosine_sims(vector, self.vectors)
        best = np.argsort(sims)[::-1][:topn]
        return [(words[i], float(sims[i])) for i in best]

    def get_words(self):
        """Return the vocabulary as a list of words."""
        return list(self._words)

    @property
    def index_to_key(self):
        """Vocabulary as a list; order matches the rows of `vectors`."""
        if not hasattr(self, '_index_to_key'):
            self._index_to_key = list(self._words)
        return self._index_to_key

    @property
    def vectors(self):
        """All vocabulary vectors as a 2-D array, built lazily and cached.

        Row i corresponds to index_to_key[i] (both caches are derived from
        the same list, so the ordering is guaranteed consistent).
        """
        if not hasattr(self, '_cached_vectors'):
            words = self.index_to_key
            self._cached_words = words  # kept for backward compatibility
            self._cached_vectors = np.array([self[w] for w in words])
        return self._cached_vectors
|
|
|
|
|
|
|
|
@st.cache_resource
def load_model(model_path):
    """Load a model from *model_path* and wrap it in UnifiedVectorModel.

    '.bin' files are loaded as fastText models, '.model' files as gensim
    Word2Vec models.  On any failure (unknown extension included) a
    Streamlit error is shown and None is returned.
    """
    try:
        if model_path.endswith(".bin"):
            return UnifiedVectorModel(fasttext.load_model(model_path), model_type="ft")
        if model_path.endswith(".model"):
            return UnifiedVectorModel(Word2Vec.load(model_path), model_type="w2v")
        raise ValueError("wrong path format")
    except Exception as e:
        st.error(f"error loading model {model_path}: {e}")
        return None
|
|
|
|
|
|
|
|
# ---- Model discovery & selection (sidebar) ----
MODELS_DIR = "models"

if not os.path.exists(MODELS_DIR):
    st.error(f"Folder `{MODELS_DIR}` not found.")
    st.stop()

# Scan only extensions load_model() can actually open (.bin -> fastText,
# .model -> gensim Word2Vec).  BUGFIX: "*.vec" used to be scanned too, but
# load_model() rejects that extension, so offering such files in the
# selectbox guaranteed a load error.
model_files = []
for ext in ["*.bin", "*.model"]:
    model_files.extend(glob.glob(os.path.join(MODELS_DIR, ext)))
model_files = [f for f in model_files if os.path.isfile(f)]
model_names = [os.path.basename(f) for f in model_files]

if len(model_names) == 0:
    st.error(f"No models in folder `{MODELS_DIR}` (.bin, .model).")
    st.info("Supported formats: Word2Vec (.model), fastText (.bin).")
    st.stop()

selected_model_name = st.sidebar.selectbox(
    "Choose pretrained model",
    model_names
)

selected_model_path = os.path.join(MODELS_DIR, selected_model_name)

st.sidebar.info(f"loading: `{selected_model_name}`")

model = load_model(selected_model_path)

# load_model() already reported the error; just halt the script.
if model is None:
    st.stop()
else:
    st.sidebar.success(f"Model '{selected_model_name}' loaded")
    st.sidebar.write(f"Voc size: {len(model.key_to_index):,}")
    st.sidebar.write(f"Vector size: {model.vector_size}")
|
|
|
|
|
def analogy_accuracy(model, file_name):
    """Score *model* on a 4-word analogy file.

    Each line of *file_name* holds four whitespace-separated words
    ``a b c d``, scored as the analogy ``a - b + c ≈ d``.  A line counts
    as correct when ``d`` appears in the model's top-10 candidates.

    Args:
        model: object with a gensim-style most_similar(positive, negative, topn).
        file_name: path to a UTF-8 text file of analogy quadruples.

    Returns:
        (accuracy, results): accuracy is the fraction of scored lines whose
        target landed in the top 10 (0 if nothing was scored); results is a
        list of per-line dicts with query/target/predicted/rank/in_top10.
    """
    right = 0
    count = 0
    results = []
    with open(file_name, encoding='utf-8') as file:
        for line in file:
            words = line.strip().split()
            if len(words) != 4:
                continue  # skip malformed lines silently
            try:
                most_similar = model.most_similar(positive=[words[0], words[2]], negative=[words[1]], topn=10)
            except KeyError:
                continue  # a query word is out of vocabulary
            if not most_similar:
                # BUGFIX: the model may return no candidates (e.g. zero
                # query vector on the fastText path); previously
                # predicted[0] raised IndexError here.
                continue
            predicted = [x[0] for x in most_similar]
            correct = words[3]
            if correct in predicted:
                rank = predicted.index(correct) + 1
                right += 1
            else:
                rank = None
            count += 1
            results.append({
                "query": f"{words[0]} - {words[1]} + {words[2]}",
                "target": correct,
                "predicted": predicted[0],
                "rank": rank,
                "in_top10": rank is not None
            })
    accuracy = right / count if count > 0 else 0
    return accuracy, results
|
|
|
|
|
|
|
|
def avg_similarity(model, file_name):
    """Average pairwise cosine similarity over word groups.

    Each line of *file_name* is a whitespace-separated group of words;
    every unordered pair inside a group contributes one similarity.
    A line containing any out-of-vocabulary word is skipped entirely.
    Returns 0 when no pair could be scored.
    """
    pair_sims = []
    with open(file_name, encoding='utf-8') as fh:
        for raw_line in fh:
            group = raw_line.strip().split()
            try:
                group_vectors = [model[w] for w in group]
            except KeyError:
                continue  # whole line dropped on first OOV word
            sim_matrix = cosine_similarity(group_vectors)
            size = len(group)
            for i in range(size - 1):
                # upper triangle only: each unordered pair counted once
                pair_sims.extend(sim_matrix[i][j] for j in range(i + 1, size))
    return sum(pair_sims) / len(pair_sims) if pair_sims else 0
|
|
|
|
|
|
|
|
def projection(word_vec, axis):
    """Scalar projection of *word_vec* onto the direction of *axis*.

    Equivalent to dot(word_vec, axis / ||axis||).
    """
    unit_axis = axis / np.linalg.norm(axis)
    return np.dot(word_vec, unit_axis)
|
|
|
|
|
|
|
|
def get_projection_row(model, axis):
    """Project every vocabulary word onto *axis* and sort ascending.

    Returns a list of (word, scalar_projection) pairs ordered from the
    most negative projection to the most positive one.
    """
    # Inlined scalar-projection helper: dot with the unit axis vector.
    unit_axis = axis / np.linalg.norm(axis)
    scored = [(word, np.dot(model[word], unit_axis)) for word in model.key_to_index.keys()]
    scored.sort(key=lambda pair: pair[1])
    return scored
|
|
|
|
|
|
|
|
# Page title and the five top-level tabs of the app.
st.title("Vector embeddings")

# NOTE(review): the tab labels below are user-facing runtime strings;
# "ariphmetics" is a misspelling of "arithmetics" left untouched here.
tab1, tab2, tab3, tab4, tab5 = st.tabs([
    "Vector ariphmetics",
    "Semantic consistency",
    "Semantic axis",
    "Distribution analysis",
    "Report"
])
|
|
|
|
|
with tab1:
    # Vector arithmetic tab: evaluate expressions like "a - b + c" and show
    # the nearest vocabulary words plus the intermediate steps.
    st.header("Vector ariphmetics")
    expr = st.text_input("Insert expression", value="рубль - россия + сша")

    if st.button("Compute"):
        # Tokenize: pad the operators with spaces so "a-b+c" splits cleanly.
        words = expr.replace('+', ' + ').replace('-', ' - ').split()
        positive, negative = [], []
        current = 'pos'  # the first term is implicitly positive

        for w in words:
            if w == '+':
                current = 'pos'
            elif w == '-':
                current = 'neg'
            else:
                (positive if current == 'pos' else negative).append(w)

        # Refuse to compute if any term is out of vocabulary.
        missing = [w for w in positive + negative if w not in model]
        if missing:
            st.warning(f"Words not found in voc: {', '.join(missing)}")
            st.stop()

        try:
            similar = model.most_similar(
                positive=positive,
                negative=negative,
                topn=10
            )

            st.write("### Result:")
            result_words = [f"{w} ({s:.3f})" for w, s in similar]
            st.write("Nearest words: " + ", ".join(result_words))

            st.write("### In-between steps")

            # Accumulate the expression term by term, showing the nearest
            # word after each partial sum.
            cum_vec = np.zeros(model.vector_size)

            steps_data = []

            for i in range(len(positive)):
                # BUGFIX: was `cum_vec += model[w]` — `w` was the stale
                # token left over from the parsing loop above, so every
                # step added the same (last-parsed) vector.
                cum_vec += model[positive[i]]
                nearest = model.most_similar(positive=positive[:i + 1], topn=1)
                if nearest:  # guard: fastText path may return []
                    steps_data.append({
                        "step": f"+ {positive[i]}",
                        "nearest word": nearest[0][0],
                        "similarity": nearest[0][1]
                    })

            for i in range(len(negative)):
                # BUGFIX: same stale-variable bug for the negative terms.
                cum_vec -= model[negative[i]]
                nearest = model.most_similar(positive=positive, negative=negative[:i + 1], topn=1)
                if nearest:
                    steps_data.append({
                        "step": f"- {negative[i]}",
                        "nearest word": nearest[0][0],
                        "similarity": nearest[0][1]
                    })

            df_steps = pd.DataFrame(steps_data)
            st.dataframe(df_steps[["step", "nearest word", "similarity"]])

            result_word = similar[0][0]
            # Quick visual: plots only the first two raw components of the
            # accumulated vector (not a real 2-D projection).
            fig = px.scatter(
                x=[cum_vec[0]], y=[cum_vec[1]],
                text=[result_word],
                title="Result (first 2 components)"
            )
            fig.update_traces(textposition='top center', marker=dict(size=12, color='red'))
            st.plotly_chart(fig)

        except Exception as e:
            st.error(f"Error computing: {e}")
|
|
|
|
|
with tab2:
    # Similarity calculator tab: cosine similarity between two words plus a
    # small neighbor graph around both of them.
    st.header("Similarity calculator")
    col1, col2 = st.columns(2)
    with col1:
        word1 = st.text_input("word 1", value="мужчина")
    with col2:
        word2 = st.text_input("word 2", value="женщина")

    if st.button("Compute similarity"):
        try:
            # model[...] raises KeyError for OOV words on the w2v backend;
            # that is the case the except below handles.
            v1, v2 = model[word1], model[word2]
            sim = cosine_similarity([v1], [v2])[0][0]
            st.metric("Cosine similarity", f"{sim:.4f}")

            st.write("### Nearest neighbors graph")
            # NOTE(review): most_similar is called twice per word here (once
            # for nodes, once for edges) — redundant but harmless.
            neighbors = model.most_similar(word1, topn=5) + model.most_similar(word2, topn=5)
            nodes = list(set([word1, word2] + [n[0] for n in neighbors]))
            edges = [(word1, n[0]) for n in model.most_similar(word1, topn=5)] + \
                    [(word2, n[0]) for n in model.most_similar(word2, topn=5)]

            G = go.Figure()
            # Node layout is purely random in [-1, 1]^2 — positions carry no
            # semantic meaning, only the edges do.
            pos = np.random.rand(len(nodes), 2) * 2 - 1
            node_x = pos[:, 0]
            node_y = pos[:, 1]

            # One scatter trace per edge (a gray line segment).
            for edge in edges:
                x0, y0 = pos[nodes.index(edge[0])]
                x1, y1 = pos[nodes.index(edge[1])]
                G.add_trace(go.Scatter(x=[x0, x1], y=[y0, y1], mode='lines', line=dict(width=1, color='gray'), showlegend=False))

            # Single trace with all labeled node markers on top of the edges.
            G.add_trace(go.Scatter(x=node_x, y=node_y, mode='text+markers',
                                   marker=dict(size=10, color='lightblue'),
                                   text=nodes, textposition="top center"))
            G.update_layout(title="Semantic links graph", showlegend=False)
            st.plotly_chart(G)

        except KeyError as e:
            st.error(f"Word not found: {e}")
|
|
|
|
|
with tab3:
    # Semantic-axis tab: project the vocabulary onto the direction defined
    # by the difference of two anchor words and show both extremes.
    st.header("Semantic axis projection")
    left_col, right_col = st.columns(2)
    with left_col:
        axis_pos_word = st.text_input("positive", value="мужчина")
    with right_col:
        axis_neg_word = st.text_input("negative", value="женщина")

    if st.button("Build axis"):
        try:
            # Axis direction = positive anchor minus negative anchor.
            axis_vec = model[axis_pos_word] - model[axis_neg_word]

            ranked = get_projection_row(model, axis_vec)  # ascending order
            extreme_pos = ranked[-10:][::-1]  # 10 largest, shown descending
            extreme_neg = ranked[:10]         # 10 smallest, shown ascending

            st.write(f"Axis: **{axis_pos_word} – {axis_neg_word}**")
            st.write("### Top 10 positive:")
            st.write(", ".join(f"{w} ({p:.3f})" for w, p in extreme_pos))

            st.write("### Top 10 negative:")
            st.write(", ".join(f"{w} ({p:.3f})" for w, p in extreme_neg))

            df_proj = pd.DataFrame(extreme_pos + extreme_neg, columns=["word", "projection"])
            fig = px.bar(
                df_proj,
                x="projection",
                y="word",
                orientation='h',
                title=f"Projection on axis: {axis_pos_word}–{axis_neg_word}"
            )
            st.plotly_chart(fig)

        except KeyError as e:
            st.error(f"Error: {e}")
|
|
|
|
|
with tab4:
    # Distribution tab: histogram of pairwise cosine similarities over a
    # random sample of vocabulary vectors.
    st.header("Distance distribution analysis")
    all_vectors = model.vectors
    # BUGFIX: np.random.choice(n, 1000, replace=False) raises ValueError
    # when the vocabulary holds fewer than 1000 words — clamp the sample.
    n_rows = all_vectors.shape[0]
    sample_size = min(1000, n_rows)
    sample = all_vectors[np.random.choice(n_rows, sample_size, replace=False)]

    dists = cosine_similarity(sample)
    # BUGFIX: take the strictly-upper-triangle entries.  This removes the
    # self-similarity diagonal and double-counted pairs without discarding
    # genuinely negative or zero similarities (the previous `> 0` filter
    # silently biased the whole distribution upward).
    upper = np.triu_indices(sample_size, k=1)
    flat_dists = dists[upper]

    fig = px.histogram(flat_dists, nbins=50, title="Cosine similarity distribution between random words")
    st.plotly_chart(fig)

    st.metric("Mean similarity", f"{np.mean(flat_dists):.3f}")
    st.metric("Std deviation", f"{np.std(flat_dists):.3f}")
|
|
|
|
|
with tab5:
    # Report tab: batch evaluation of the model on bundled data files plus
    # two interactive visualizations.
    st.header("Report")

    # --- 1. Analogy benchmark -------------------------------------------
    st.subheader("1. Analogy rate")
    analogies_file = "data/analogy.txt"
    if os.path.exists(analogies_file):
        acc, results = analogy_accuracy(model, analogies_file)
        st.metric("Analogy accuracy (in top 10)", f"{acc:.2%}")
        st.dataframe(pd.DataFrame(results))
    else:
        st.warning("File `analogy.txt` not found.")

    # --- 2. Synonyms ----------------------------------------------------
    st.subheader("2. Average synonyms similarity")
    sim_file = "data/synonyms.txt"
    if os.path.exists(sim_file):
        avg_sim = avg_similarity(model, sim_file)
        st.metric("Average similarity", f"{avg_sim:.4f}")
    else:
        # BUGFIX: the warning previously named `similarity_words.txt`,
        # which is not the file this section reads.
        st.warning("File `synonyms.txt` not found.")

    # --- 3. Antonyms ----------------------------------------------------
    st.subheader("3. Average antonyms similarity")
    sim_file = "data/antonyms.txt"
    if os.path.exists(sim_file):
        avg_sim = avg_similarity(model, sim_file)
        st.metric("Average similarity", f"{avg_sim:.4f}")
    else:
        # BUGFIX: same wrong filename as in section 2.
        st.warning("File `antonyms.txt` not found.")

    # --- 4. Heatmap -----------------------------------------------------
    st.subheader("4. Heatmap for nearest words")
    query_words = st.text_input("Enter words", value="мужчина женщина мальчик девочка").split()
    if st.button("Build heatmap"):
        try:
            vectors = [model[w] for w in query_words]
            sims = cosine_similarity(vectors)
            fig = px.imshow(sims, x=query_words, y=query_words, color_continuous_scale="Blues", title="Similarity heatmap")
            st.plotly_chart(fig)
        except KeyError as e:
            st.error(f"Error: {e}")

    # --- 5. t-SNE scatter -----------------------------------------------
    st.subheader("5. 2D projection")
    sample_words = st.text_input("Input words", value="мужчина женщина мальчик девочка")
    word_list = sample_words.split()
    if st.button("Show clusters"):
        if len(word_list) < 2:
            # BUGFIX: with a single word, perplexity = len - 1 = 0 made
            # TSNE raise an uncaught ValueError (only KeyError was handled).
            st.warning("Enter at least two words.")
        else:
            try:
                from sklearn.manifold import TSNE
                vectors = np.array([model[w] for w in word_list])
                # perplexity must be strictly below the number of samples
                tsne = TSNE(n_components=2, perplexity=len(vectors) - 1, random_state=42)
                embedded = tsne.fit_transform(vectors)

                fig = px.scatter(x=embedded[:, 0], y=embedded[:, 1], text=word_list, title="words projection")
                fig.update_traces(textposition='top center')
                st.plotly_chart(fig)
            except KeyError as e:
                st.error(f"Word not found: {e}")