theformatisvalid's picture
Upload 63 files
d965fd8 verified
raw
history blame
16.2 kB
import fasttext
import streamlit as st
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import plotly.express as px
import plotly.graph_objects as go
from collections import Counter
import os
import glob
class UnifiedVectorModel:
    """Uniform wrapper around gensim Word2Vec and fasttext-wheel models.

    Exposes one small interface — membership test, vector lookup,
    ``most_similar``, ``similar_by_vector`` — regardless of which backend
    is used, so the rest of the app never branches on the model type.
    """

    def __init__(self, backend_model, model_type="w2v"):
        """
        Args:
            backend_model: a trained gensim ``Word2Vec`` model ("w2v") or a
                model loaded with ``fasttext.load_model`` ("ft").
            model_type: "w2v" or "ft" (case-insensitive).

        Raises:
            ValueError: if ``model_type`` is not one of the supported values.
        """
        self.model = backend_model
        self.model_type = model_type.lower()
        if self.model_type == "w2v":
            self.wv = backend_model.wv
            self.key_to_index = self.wv.key_to_index
            self.vector_size = self.wv.vector_size
            self._words = set(self.wv.key_to_index.keys())
        elif self.model_type == "ft":
            # fasttext-wheel API: the vocabulary comes from get_words()
            self.key_to_index = {word: i for i, word in enumerate(backend_model.get_words())}
            self.vector_size = backend_model.get_dimension()
            self._words = set(self.key_to_index.keys())
        else:
            raise ValueError("model_type must be 'w2v' or 'ft'")

    def __contains__(self, word):
        return word in self._words

    def __getitem__(self, word):
        if self.model_type == "w2v":
            return self.wv[word]
        # "ft": fasttext can synthesize a vector even for OOV words
        return self.model.get_word_vector(word)

    @staticmethod
    def _as_word_list(words):
        # Accept a bare string where a list of words is expected; without
        # this, iterating a string would treat each CHARACTER as a word
        # on the fasttext path.
        if words is None:
            return []
        if isinstance(words, str):
            return [words]
        return list(words)

    def _cosine_to_all(self, vec):
        # Cosine similarity between `vec` and every vocabulary vector,
        # computed with numpy only (no per-call sklearn import needed).
        mat = self.vectors
        denom = np.linalg.norm(mat, axis=1) * np.linalg.norm(vec)
        denom[denom == 0] = 1.0  # zero vectors get similarity 0, not NaN
        return (mat @ vec) / denom

    def most_similar(self, positive=None, negative=None, topn=10):
        """Return up to ``topn`` (word, cosine_similarity) pairs for the
        query vector sum(positive) - sum(negative).

        Query words themselves are excluded from the result. Unknown words
        are silently skipped on the ft path; on any internal error the
        method prints the error and returns an empty list.
        """
        positive = self._as_word_list(positive)
        negative = self._as_word_list(negative)
        try:
            if self.model_type == "w2v":
                return self.wv.most_similar(positive=positive, negative=negative, topn=topn)
            # "ft": build the query vector manually, skipping unknown words.
            vec = np.zeros(self.vector_size)
            for w in positive:
                if w in self:
                    vec += self[w]
            for w in negative:
                if w in self:
                    vec -= self[w]
            if np.allclose(vec, 0):
                return []
            sims = self._cosine_to_all(vec)
            # Over-fetch so the query words can be filtered out below.
            best = np.argsort(sims)[::-1][:topn + len(positive) + len(negative)]
            exclude = set(positive) | set(negative)
            result = []
            words = self.index_to_key
            for i in best:
                if words[i] not in exclude:
                    result.append((words[i], float(sims[i])))
                    if len(result) >= topn:
                        break
            return result
        except Exception as e:
            # Best-effort API used by the UI: report and return empty.
            print(f"Error in most_similar: {e}")
            return []

    def similar_by_vector(self, vector, topn=10):
        """Return the ``topn`` (word, similarity) pairs nearest to ``vector``."""
        sims = self._cosine_to_all(np.asarray(vector, dtype=float))
        best = np.argsort(sims)[::-1][:topn]
        words = self.index_to_key
        return [(words[i], float(sims[i])) for i in best]

    def get_words(self):
        """Fresh list of all vocabulary words (order unspecified)."""
        return list(self._words)

    @property
    def vectors(self):
        # Full (vocab_size, vector_size) matrix; rows follow index_to_key.
        # Computed lazily and cached — building it is expensive.
        if not hasattr(self, '_cached_vectors'):
            self._cached_vectors = np.array([self[w] for w in self.index_to_key])
        return self._cached_vectors

    @property
    def index_to_key(self):
        # Stable word order matching the rows of `vectors`; cached once.
        if not hasattr(self, '_index_to_key'):
            self._index_to_key = list(self._words)
        return self._index_to_key
@st.cache_resource
def load_model(model_path):
    """Load a vector model from disk and wrap it in UnifiedVectorModel.

    Supported formats: ``.model`` (gensim Word2Vec), ``.bin`` (fasttext
    binary). Cached by Streamlit so a model is loaded at most once.
    Returns None (after showing a Streamlit error) on failure.
    """
    try:
        if model_path.endswith(".model"):
            raw_model = Word2Vec.load(model_path)
            return UnifiedVectorModel(raw_model, model_type="w2v")
        if model_path.endswith(".bin"):
            raw_model = fasttext.load_model(model_path)
            return UnifiedVectorModel(raw_model, model_type="ft")
        # Include the offending path in the message (the old f-string had
        # no placeholder at all).
        raise ValueError(f"unsupported model file format: {model_path}")
    except Exception as e:
        st.error(f"error loading model {model_path}: {e}")
        return None
# --- Model discovery and sidebar selection ---------------------------------
MODELS_DIR = "models"
if not os.path.exists(MODELS_DIR):
    st.error(f"Folder `{MODELS_DIR}` not found.")
    st.stop()

# Only list extensions load_model can actually open; `.vec` used to be
# globbed here but load_model raises on it, so offering it was a trap.
model_files = []
for ext in ["*.bin", "*.model"]:
    model_files.extend(glob.glob(os.path.join(MODELS_DIR, ext)))
model_files = [f for f in model_files if os.path.isfile(f)]
model_names = [os.path.basename(f) for f in model_files]

if not model_names:
    st.error(f"No models in folder `{MODELS_DIR}` (.bin, .model).")
    st.info("Supported formats: Word2Vec (.model), FastText (.bin).")
    st.stop()

selected_model_name = st.sidebar.selectbox(
    "Choose pretrained model",
    model_names
)
selected_model_path = os.path.join(MODELS_DIR, selected_model_name)
st.sidebar.info(f"loading: `{selected_model_name}`")

model = load_model(selected_model_path)
if model is None:
    st.stop()
st.sidebar.success(f"Model '{selected_model_name}' loaded")
st.sidebar.write(f"Voc size: {len(model.key_to_index):,}")
st.sidebar.write(f"Vector size: {model.vector_size}")
def analogy_accuracy(model, file_name):
    """Evaluate the model on a 4-word analogy file.

    Each line holds "a b c d", tested as: a - b + c ≈ d (positive = [a, c],
    negative = [b]).

    Args:
        model: object with a gensim-style ``most_similar`` method.
        file_name: path to a UTF-8 text file; malformed lines are skipped.

    Returns:
        (accuracy, results): accuracy is the share of scored lines whose
        target appears in the top-10 predictions; results is a per-line
        list of dicts (query, target, predicted, rank, in_top10).
    """
    right = 0
    count = 0
    results = []
    with open(file_name, encoding='utf-8') as file:
        for line in file:
            words = line.strip().split()
            if len(words) != 4:
                continue  # skip blank/malformed lines
            try:
                most_similar = model.most_similar(
                    positive=[words[0], words[2]], negative=[words[1]], topn=10
                )
            except KeyError:
                continue  # a query word is missing from the vocabulary
            predicted = [x[0] for x in most_similar]
            if not predicted:
                # most_similar may return [] (e.g. the fasttext backend
                # swallows errors) — don't index into an empty list.
                continue
            correct = words[3]
            rank = predicted.index(correct) + 1 if correct in predicted else None
            if rank is not None:
                right += 1
            count += 1
            results.append({
                "query": f"{words[0]} - {words[1]} + {words[2]}",
                "target": correct,
                "predicted": predicted[0],
                "rank": rank,
                "in_top10": rank is not None
            })
    accuracy = right / count if count > 0 else 0
    return accuracy, results
def avg_similarity(model, file_name):
    """Average pairwise cosine similarity over word groups in a file.

    Each line of ``file_name`` is a whitespace-separated group of words;
    every unordered pair within a group contributes one similarity. Lines
    containing any word missing from the vocabulary are skipped entirely,
    as are blank and single-word lines (they have no pairs — the previous
    version crashed on blank lines).

    Returns:
        float: mean similarity over all pairs, or 0 if none were scored.
    """
    res = []
    with open(file_name, encoding='utf-8') as file:
        for line in file:
            words = line.strip().split()
            if len(words) < 2:
                continue  # no pairs on this line
            try:
                vectors = np.array([model[word] for word in words], dtype=float)
            except KeyError:
                continue
            # Pairwise cosine similarity via normalized dot products
            # (numpy only — equivalent to sklearn's cosine_similarity).
            norms = np.linalg.norm(vectors, axis=1)
            norms[norms == 0] = 1.0
            unit = vectors / norms[:, None]
            sims = unit @ unit.T
            for i in range(len(words) - 1):
                for j in range(i + 1, len(words)):
                    res.append(sims[i][j])
    return sum(res) / len(res) if res else 0
def projection(word_vec, axis):
    """Scalar projection of ``word_vec`` onto the normalized ``axis``."""
    unit_axis = axis / np.linalg.norm(axis)
    return np.dot(word_vec, unit_axis)
def get_projection_row(model, axis):
    """Pair every vocabulary word with its scalar projection on ``axis``.

    Returns a list of (word, projection) tuples sorted ascending by the
    projection value.
    """
    unit_axis = axis / np.linalg.norm(axis)
    pairs = [(word, np.dot(model[word], unit_axis)) for word in model.key_to_index]
    pairs.sort(key=lambda item: item[1])
    return pairs
# Page title and the five analysis tabs; each tab body follows below.
st.title("Vector embeddings")
tab1, tab2, tab3, tab4, tab5 = st.tabs([
"Vector ariphmetics",
"Semantic consistency",
"Semantic axis",
"Distribution analysis",
"Report"
])
with tab1:
    st.header("Vector ariphmetics")
    expr = st.text_input("Insert expression", value="рубль - россия + сша")
    if st.button("Compute"):
        # Parse "a - b + c"-style expressions into positive/negative lists.
        words = expr.replace('+', ' + ').replace('-', ' - ').split()
        positive, negative = [], []
        current = 'pos'
        for w in words:
            if w == '+':
                current = 'pos'
            elif w == '-':
                current = 'neg'
            else:
                (positive if current == 'pos' else negative).append(w)
        missing = [w for w in positive + negative if w not in model]
        if missing:
            st.warning(f"Words not found in voc: {', '.join(missing)}")
            st.stop()
        try:
            similar = model.most_similar(
                positive=positive,
                negative=negative,
                topn=10
            )
            st.write("### Result:")
            result_words = [f"{w} ({s:.3f})" for w, s in similar]
            st.write("Nearest words: " + ", ".join(result_words))

            st.write("### In-between steps")
            cum_vec = np.zeros(model.vector_size)
            steps_data = []
            for i in range(len(positive)):
                # BUG FIX: accumulate the i-th positive word's vector —
                # the old code used the stale loop variable `w` left over
                # from the parsing loop above.
                cum_vec += model[positive[i]]
                nearest = model.most_similar(positive=positive[:i + 1], topn=1)
                steps_data.append({
                    "step": f"+ {positive[i]}",
                    "nearest word": nearest[0][0],
                    "similarity": nearest[0][1]
                })
            for i in range(len(negative)):
                cum_vec -= model[negative[i]]  # same fix for the negative part
                nearest = model.most_similar(positive=positive, negative=negative[:i + 1], topn=1)
                steps_data.append({
                    "step": f"- {negative[i]}",
                    "nearest word": nearest[0][0],
                    "similarity": nearest[0][1]
                })
            df_steps = pd.DataFrame(steps_data)
            st.dataframe(df_steps[["step", "nearest word", "similarity"]])

            result_word = similar[0][0]
            # NOTE: plots only the first two raw vector components, not a
            # proper 2D projection of the embedding space.
            fig = px.scatter(
                x=[cum_vec[0]], y=[cum_vec[1]],
                text=[result_word],
                title="Result (first 2 components)"
            )
            fig.update_traces(textposition='top center', marker=dict(size=12, color='red'))
            st.plotly_chart(fig)
        except Exception as e:
            st.error(f"Error computing: {e}")
with tab2:
    st.header("Similarity calculator")
    col1, col2 = st.columns(2)
    with col1:
        word1 = st.text_input("word 1", value="мужчина")
    with col2:
        word2 = st.text_input("word 2", value="женщина")
    if st.button("Compute similarity"):
        try:
            v1, v2 = model[word1], model[word2]
            sim = cosine_similarity([v1], [v2])[0][0]
            st.metric("Cosine similarity", f"{sim:.4f}")

            st.write("### Nearest neighbors graph")
            # BUG FIX: wrap each word in a list — the fasttext backend
            # iterates `positive`, so a bare string was treated as a
            # sequence of characters. Also query each word once instead
            # of twice (the old code repeated the most_similar calls).
            neighbors1 = model.most_similar(positive=[word1], topn=5)
            neighbors2 = model.most_similar(positive=[word2], topn=5)
            nodes = list(set([word1, word2] + [n[0] for n in neighbors1 + neighbors2]))
            edges = ([(word1, n[0]) for n in neighbors1] +
                     [(word2, n[0]) for n in neighbors2])

            G = go.Figure()
            # Random 2D layout in [-1, 1] for the graph nodes.
            pos = np.random.rand(len(nodes), 2) * 2 - 1
            node_x = pos[:, 0]
            node_y = pos[:, 1]
            for edge in edges:
                x0, y0 = pos[nodes.index(edge[0])]
                x1, y1 = pos[nodes.index(edge[1])]
                G.add_trace(go.Scatter(x=[x0, x1], y=[y0, y1], mode='lines',
                                       line=dict(width=1, color='gray'), showlegend=False))
            G.add_trace(go.Scatter(x=node_x, y=node_y, mode='text+markers',
                                   marker=dict(size=10, color='lightblue'),
                                   text=nodes, textposition="top center"))
            G.update_layout(title="Semantic links graph", showlegend=False)
            st.plotly_chart(G)
        except KeyError as e:
            st.error(f"Word not found: {e}")
with tab3:
    st.header("Semantic axis projection")
    col1, col2 = st.columns(2)
    with col1:
        pos_axis = st.text_input("positive", value="мужчина")
    with col2:
        neg_axis = st.text_input("negative", value="женщина")
    if st.button("Build axis"):
        try:
            # The semantic axis is the difference between the two poles.
            axis = model[pos_axis] - model[neg_axis]
            projections = get_projection_row(model, axis)
            # projections come back sorted ascending, so the head is the
            # most negative end and the tail the most positive.
            top_neg = projections[:10]
            top_pos = list(reversed(projections[-10:]))
            st.write(f"Axis: **{pos_axis}{neg_axis}**")
            st.write("### Top 10 positive:")
            st.write(", ".join(f"{w} ({p:.3f})" for w, p in top_pos))
            st.write("### Top 10 negative:")
            st.write(", ".join(f"{w} ({p:.3f})" for w, p in top_neg))
            df_proj = pd.DataFrame(top_pos + top_neg, columns=["word", "projection"])
            fig = px.bar(df_proj, x="projection", y="word", orientation='h',
                         title=f"Projection on axis: {pos_axis}{neg_axis}")
            st.plotly_chart(fig)
        except KeyError as e:
            st.error(f"Error: {e}")
with tab4:
    st.header("Distance distribution analysis")
    all_vectors = model.vectors
    # BUG FIX: never request more samples than the vocabulary holds —
    # np.random.choice(..., replace=False) raises when the sample size
    # exceeds the population.
    sample_size = min(1000, all_vectors.shape[0])
    sample = all_vectors[np.random.choice(all_vectors.shape[0], sample_size, replace=False)]
    dists = cosine_similarity(sample)
    np.fill_diagonal(dists, 0)
    flat_dists = dists.flatten()
    # Drop the zeroed diagonal; note this also drops non-positive
    # similarities, matching the original behavior.
    flat_dists = flat_dists[flat_dists > 0]
    fig = px.histogram(flat_dists, nbins=50, title="Cosine similarity distribution between random words")
    st.plotly_chart(fig)
    st.metric("Mean similarity", f"{np.mean(flat_dists):.3f}")
    st.metric("Std deviation", f"{np.std(flat_dists):.3f}")
with tab5:
    st.header("Report")

    st.subheader("1. Analogy rate")
    analogies_file = "data/analogy.txt"
    if os.path.exists(analogies_file):
        acc, results = analogy_accuracy(model, analogies_file)
        st.metric("Analogy accuracy (in top 10)", f"{acc:.2%}")
        st.dataframe(pd.DataFrame(results))
    else:
        st.warning("File `analogy.txt` not found.")

    st.subheader("2. Average synonyms similarity")
    sim_file = "data/synonyms.txt"
    if os.path.exists(sim_file):
        avg_sim = avg_similarity(model, sim_file)
        st.metric("Average similarity", f"{avg_sim:.4f}")
    else:
        # BUG FIX: the warning used to name a non-existent file
        # ("similarity_words.txt") instead of the one actually checked.
        st.warning("File `synonyms.txt` not found.")

    st.subheader("3. Average antonyms similarity")
    sim_file = "data/antonyms.txt"
    if os.path.exists(sim_file):
        avg_sim = avg_similarity(model, sim_file)
        st.metric("Average similarity", f"{avg_sim:.4f}")
    else:
        st.warning("File `antonyms.txt` not found.")  # fixed file name

    st.subheader("4. Heatmap for nearest words")
    query_words = st.text_input("Enter words", value="мужчина женщина мальчик девочка").split()
    if st.button("Build heatmap"):
        try:
            vectors = [model[w] for w in query_words]
            sims = cosine_similarity(vectors)
            fig = px.imshow(sims, x=query_words, y=query_words,
                            color_continuous_scale="Blues", title="Similarity heatmap")
            st.plotly_chart(fig)
        except KeyError as e:
            st.error(f"Error: {e}")

    st.subheader("5. 2D projection")
    sample_words = st.text_input("Input words", value="мужчина женщина мальчик девочка")
    word_list = sample_words.split()
    if st.button("Show clusters"):
        try:
            from sklearn.manifold import TSNE
            vectors = np.array([model[w] for w in word_list])
            if len(vectors) < 2:
                # BUG FIX: TSNE requires perplexity >= 1, i.e. at least
                # two input words; the old code crashed on a single word.
                st.warning("Need at least 2 words for a 2D projection.")
            else:
                tsne = TSNE(n_components=2, perplexity=len(vectors) - 1, random_state=42)
                embedded = tsne.fit_transform(vectors)
                fig = px.scatter(x=embedded[:, 0], y=embedded[:, 1], text=word_list,
                                 title="words projection")
                fig.update_traces(textposition='top center')
                st.plotly_chart(fig)
        except KeyError as e:
            st.error(f"Word not found: {e}")