import fasttext
import streamlit as st
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import plotly.express as px
import plotly.graph_objects as go
from collections import Counter
import os
import glob


class UnifiedVectorModel:
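    """Unified wrapper exposing one interface over gensim Word2Vec models and fasttext-wheel models."""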
    def __init__(self, backend_model, model_type="w2v"):
        self.model = backend_model
        self.model_type = model_type.lower()

        if self.model_type == "w2v":
            self.wv = backend_model.wv
            self.key_to_index = self.wv.key_to_index
            self.vector_size = self.wv.vector_size
            self._words = set(self.wv.key_to_index.keys())

        elif self.model_type == "ft":
            # For the fasttext-wheel backend
            self.key_to_index = {word: i for i, word in enumerate(backend_model.get_words())}
            self.vector_size = backend_model.get_dimension()
            self._words = set(self.key_to_index.keys())
        else:
            raise ValueError("model_type must be 'w2v' or 'ft'")

    def __contains__(self, word):
        return word in self._words

    def __getitem__(self, word):
        if self.model_type == "w2v":
            return self.wv[word]
        elif self.model_type == "ft":
            return self.model.get_word_vector(word)

    def most_similar(self, positive=None, negative=None, topn=10):
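        """Analogy-style query: return the topn words most similar to the positive words minus the negative words."""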
        if positive is None:
            positive = []
        elif isinstance(positive, str):
            positive = [positive]
        if negative is None:
            negative = []
        elif isinstance(negative, str):
            negative = [negative]

        try:
            if self.model_type == "w2v":
                return self.wv.most_similar(positive=positive, negative=negative, topn=topn)

            elif self.model_type == "ft":
                # Build the query vector: add positive word vectors, subtract negative ones
                vec = np.zeros(self.vector_size)
                for w in positive:
                    if w in self:
                        vec += self[w]
                for w in negative:
                    if w in self:
                        vec -= self[w]

                if np.allclose(vec, 0):
                    return []

                words = list(self._words)
                vectors = np.array([self[w] for w in words])

                sims = cosine_similarity([vec], vectors)[0]
                best = np.argsort(sims)[::-1][:topn + len(positive) + len(negative)]

                result = []
                for i in best:
                    word = words[i]
                    if word not in positive and word not in negative:
                        result.append((word, float(sims[i])))
                        if len(result) >= topn:
                            break
                return result

        except Exception as e:
            print(f"Error in most_similar: {e}")
            return []

    def similar_by_vector(self, vector, topn=10):
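        """Return the topn (word, cosine similarity) pairs closest to an arbitrary query vector."""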
        words = list(self._words)
        vectors = np.array([self[w] for w in words])
        sims = cosine_similarity([vector], vectors)[0]
        best = np.argsort(sims)[::-1][:topn]

        return [(words[i], float(sims[i])) for i in best]

    def get_words(self):
        return list(self._words)

    @property
    def vectors(self):
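        """All word vectors as one (vocab_size, vector_size) array, computed lazily and cached."""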
        if not hasattr(self, '_cached_vectors'):
            words = list(self._words)
            self._cached_words = words
            self._cached_vectors = np.array([self[w] for w in words])
        return self._cached_vectors

    @property
    def index_to_key(self):
        if not hasattr(self, '_index_to_key'):
            self._index_to_key = list(self._words)
        return self._index_to_key


@st.cache_resource
def load_model(model_path):
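    """Load a model from disk: .model files are loaded as gensim Word2Vec, .bin files as FastText binaries."""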
    try:
        if model_path.endswith(".model"):
            raw_model = Word2Vec.load(model_path)
            current_model = UnifiedVectorModel(raw_model, model_type="w2v")

        elif model_path.endswith(".bin"):
            raw_model = fasttext.load_model(model_path)
            current_model = UnifiedVectorModel(raw_model, model_type="ft")
        else:
            raise ValueError(f"Unsupported model format: {model_path}")
        return current_model
    except Exception as e:
        st.error(f"error loading model {model_path}: {e}")
        return None


MODELS_DIR = "models"

if not os.path.exists(MODELS_DIR):
    st.error(f"Folder `{MODELS_DIR}` not found.")
    st.stop()

model_files = []
for ext in ["*.bin", "*.model", "*.vec"]:
    model_files.extend(glob.glob(os.path.join(MODELS_DIR, ext)))
model_files = [f for f in model_files if os.path.isfile(f)]
model_names = [os.path.basename(f) for f in model_files]

if len(model_names) == 0:
    st.error(f"No models in folder `{MODELS_DIR}` (.bin, .model, .vec).")
    st.info("Supported formats: Word2Vec (binary/text), FastText.")
    st.stop()

selected_model_name = st.sidebar.selectbox(
    "Choose pretrained model",
    model_names
)

selected_model_path = os.path.join(MODELS_DIR, selected_model_name)

st.sidebar.info(f"Loading: `{selected_model_name}`")

model = load_model(selected_model_path)

if model is None:
    st.stop()
else:
    st.sidebar.success(f"Model '{selected_model_name}' loaded")
    st.sidebar.write(f"Voc size: {len(model.key_to_index):,}")
    st.sidebar.write(f"Vector size: {model.vector_size}")

def analogy_accuracy(model, file_name):
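    """Evaluate analogies from a file of four-word lines 'a b c d': the query a - b + c should rank d in the top 10."""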
    right = 0
    count = 0
    results = []
    with open(file_name, encoding='utf-8') as file:
        for line in file:
            words = line.strip().split()
            if len(words) != 4:
                continue
            try:
                most_similar = model.most_similar(positive=[words[0], words[2]], negative=[words[1]], topn=10)
                predicted = [x[0] for x in most_similar]
                correct = words[3]
                if correct in predicted:
                    rank = predicted.index(correct) + 1
                    right += 1
                else:
                    rank = None
                count += 1
                results.append({
                    "query": f"{words[0]} - {words[1]} + {words[2]}",
                    "target": correct,
                    "predicted": predicted[0],
                    "rank": rank,
                    "in_top10": bool(rank)
                })
            except KeyError as e:
                continue
    accuracy = right / count if count > 0 else 0
    return accuracy, results


def avg_similarity(model, file_name):
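    """Average pairwise cosine similarity within word groups, one whitespace-separated group per line."""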
    res = []
    with open(file_name, encoding='utf-8') as file:
        for line in file:
            words = line.strip().split()
            try:
                vectors = [model[word] for word in words]
            except KeyError:
                continue
            sims = cosine_similarity(vectors)
            for i in range(len(words) - 1):
                for j in range(i + 1, len(words)):
                    res.append(sims[i][j])
    return sum(res) / len(res) if res else 0


def projection(word_vec, axis):
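    """Scalar projection of a word vector onto the normalized axis vector."""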
    axis_norm = axis / np.linalg.norm(axis)
    return np.dot(word_vec, axis_norm)


def get_projection_row(model, axis):
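    """Project every vocabulary word onto the axis and return (word, projection) pairs sorted ascending."""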
    words = list(model.key_to_index.keys())
    projections = [(word, projection(model[word], axis)) for word in words]
    projections = sorted(projections, key=lambda x: x[1])
    return projections


st.title("Vector embeddings")

tab1, tab2, tab3, tab4, tab5 = st.tabs([
    "Vector ariphmetics",
    "Semantic consistency",
    "Semantic axis",
    "Distribution analysis",
    "Report"
])

with tab1:
    st.header("Vector ariphmetics")
    expr = st.text_input("Enter an expression", value="рубль - россия + сша")

    if st.button("Compute"):
        words = expr.replace('+', ' + ').replace('-', ' - ').split()
        positive, negative = [], []
        current = 'pos'

        for w in words:
            if w == '+':
                current = 'pos'
            elif w == '-':
                current = 'neg'
            else:
                (positive if current == 'pos' else negative).append(w)

        missing = [w for w in positive + negative if w not in model]
        if missing:
            st.warning(f"Words not found in voc: {', '.join(missing)}")
            st.stop()

        try:
            similar = model.most_similar(
                positive=positive,
                negative=negative,
                topn=10
            )

            st.write("### Result:")
            result_words = [f"{w} ({s:.3f})" for w, s in similar]
            st.write("Nearest words: " + ", ".join(result_words))

            st.write("### In-between steps")

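            # Accumulate the expression vector term by term and show the nearest word after each step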
            cum_vec = np.zeros(model.vector_size)

            steps_data = []

            for i in range(len(positive)):
                cum_vec += model[positive[i]]
                nearest = model.most_similar(positive=positive[:i + 1], topn=1)
                steps_data.append({
                    "step": f"+ {positive[i]}",
                    "nearest word": nearest[0][0],
                    "similarity": nearest[0][1]
                })

            for i in range(len(negative)):
                cum_vec -= model[negative[i]]
                nearest = model.most_similar(positive=positive, negative=negative[:i + 1], topn=1)
                steps_data.append({
                    "step": f"- {negative[i]}",
                    "nearest word": nearest[0][0],
                    "similarity": nearest[0][1]
                })

            df_steps = pd.DataFrame(steps_data)
            st.dataframe(df_steps[["step", "nearest word", "similarity"]])

            result_word = similar[0][0]
            fig = px.scatter(
                x=[cum_vec[0]], y=[cum_vec[1]],
                text=[result_word],
                title="Result (first 2 components)"
            )
            fig.update_traces(textposition='top center', marker=dict(size=12, color='red'))
            st.plotly_chart(fig)

        except Exception as e:
            st.error(f"Error computing: {e}")

with tab2:
    st.header("Similarity calculator")
    col1, col2 = st.columns(2)
    with col1:
        word1 = st.text_input("word 1", value="мужчина")
    with col2:
        word2 = st.text_input("word 2", value="женщина")

    if st.button("Compute similarity"):
        try:
            v1, v2 = model[word1], model[word2]
            sim = cosine_similarity([v1], [v2])[0][0]
            st.metric("Cosine similarity", f"{sim:.4f}")

            st.write("### Nearest neighbors graph")
            neighbors = model.most_similar(word1, topn=5) + model.most_similar(word2, topn=5)
            nodes = list(set([word1, word2] + [n[0] for n in neighbors]))
            edges = [(word1, n[0]) for n in model.most_similar(word1, topn=5)] + \
                    [(word2, n[0]) for n in model.most_similar(word2, topn=5)]

            G = go.Figure()
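            # Nodes get random 2D positions: a schematic layout, not a force-directed one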
            pos = np.random.rand(len(nodes), 2) * 2 - 1
            node_x = pos[:, 0]
            node_y = pos[:, 1]

            for edge in edges:
                x0, y0 = pos[nodes.index(edge[0])]
                x1, y1 = pos[nodes.index(edge[1])]
                G.add_trace(go.Scatter(x=[x0, x1], y=[y0, y1], mode='lines', line=dict(width=1, color='gray'), showlegend=False))

            G.add_trace(go.Scatter(x=node_x, y=node_y, mode='text+markers',
                                   marker=dict(size=10, color='lightblue'),
                                   text=nodes, textposition="top center"))
            G.update_layout(title="Semantic links graph", showlegend=False)
            st.plotly_chart(G)

        except KeyError as e:
            st.error(f"Word not found: {e}")

with tab3:
    st.header("Semantic axis projection")
    col1, col2 = st.columns(2)
    with col1:
        pos_axis = st.text_input("positive", value="мужчина")
    with col2:
        neg_axis = st.text_input("negative", value="женщина")

    if st.button("Build axis"):
        try:
            pos_vec = model[pos_axis]
            neg_vec = model[neg_axis]
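            # The semantic axis is the difference between the two anchor word vectors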
            axis = pos_vec - neg_vec

            projections = get_projection_row(model, axis)
            top_pos = projections[-10:][::-1]
            top_neg = projections[:10]

            st.write(f"Axis: **{pos_axis}{neg_axis}**")
            st.write("### Top 10 positive:")
            st.write(", ".join([f"{w} ({p:.3f})" for w, p in top_pos]))

            st.write("### Top 10 negative:")
            st.write(", ".join([f"{w} ({p:.3f})" for w, p in top_neg]))

            df_proj = pd.DataFrame(top_pos + top_neg, columns=["word", "projection"])
            fig = px.bar(df_proj, x="projection", y="word", orientation='h', title=f"Projection on axis: {pos_axis} → {neg_axis}")
            st.plotly_chart(fig)

        except KeyError as e:
            st.error(f"Error: {e}")

with tab4:
    st.header("Distance distribution analysis")
    all_vectors = model.vectors
    # Sample at most 1000 vectors so the pairwise similarity matrix stays small
    sample_size = min(1000, all_vectors.shape[0])
    sample = all_vectors[np.random.choice(all_vectors.shape[0], sample_size, replace=False)]

    sims = cosine_similarity(sample)
    # Take each unordered pair once (upper triangle, no diagonal); negative similarities are kept
    iu = np.triu_indices_from(sims, k=1)
    flat_dists = sims[iu]

    fig = px.histogram(flat_dists, nbins=50, title="Cosine similarity distribution between random words")
    st.plotly_chart(fig)

    st.metric("Mean similarity", f"{np.mean(flat_dists):.3f}")
    st.metric("Std deviation", f"{np.std(flat_dists):.3f}")

with tab5:
    st.header("Report")

    st.subheader("1. Analogy rate")
    analogies_file = "data/analogy.txt"
    if os.path.exists(analogies_file):
        acc, results = analogy_accuracy(model, analogies_file)
        st.metric("Analogy accuracy (in top 10)", f"{acc:.2%}")
        st.dataframe(pd.DataFrame(results))
    else:
        st.warning("File `analogy.txt` not found.")

    st.subheader("2. Average synonyms similarity")
    sim_file = "data/synonyms.txt"
    if os.path.exists(sim_file):
        avg_sim = avg_similarity(model, sim_file)
        st.metric("Average similarity", f"{avg_sim:.4f}")
    else:
        st.warning("File `similarity_words.txt` not found.")

    st.subheader("3. Average antonyms similarity")
    sim_file = "data/antonyms.txt"
    if os.path.exists(sim_file):
        avg_sim = avg_similarity(model, sim_file)
        st.metric("Average similarity", f"{avg_sim:.4f}")
    else:
        st.warning("File `similarity_words.txt` not found.")

    st.subheader("4. Heatmap for nearest words")
    query_words = st.text_input("Enter words", value="мужчина женщина мальчик девочка").split()
    if st.button("Build heatmap"):
        try:
            vectors = [model[w] for w in query_words]
            sims = cosine_similarity(vectors)
            fig = px.imshow(sims, x=query_words, y=query_words, color_continuous_scale="Blues", title="Similarity heatmap")
            st.plotly_chart(fig)
        except KeyError as e:
            st.error(f"Error: {e}")

    st.subheader("5. 2D projection")
    sample_words = st.text_input("Input words", value="мужчина женщина мальчик девочка")
    word_list = sample_words.split()
    if st.button("Show clusters"):
        try:
            from sklearn.manifold import TSNE
            vectors = np.array([model[w] for w in word_list])
            tsne = TSNE(n_components=2, perplexity=len(vectors) - 1, random_state=42)
            embedded = tsne.fit_transform(vectors)

            fig = px.scatter(x=embedded[:, 0], y=embedded[:, 1], text=word_list, title="Word projection (t-SNE)")
            fig.update_traces(textposition='top center')
            st.plotly_chart(fig)
        except KeyError as e:
            st.error(f"Word not found: {e}")