theformatisvalid committed on
Commit
0b9dd2a
·
verified ·
1 Parent(s): 084e1b8

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +449 -37
src/streamlit_app.py CHANGED
@@ -1,40 +1,452 @@
1
- import altair as alt
 
2
  import numpy as np
3
  import pandas as pd
4
- import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
1
+ import fasttext
2
+ import streamlit as st
3
  import numpy as np
4
  import pandas as pd
5
+ from gensim.models import Word2Vec
6
+ from sklearn.metrics.pairwise import cosine_similarity
7
+ import plotly.express as px
8
+ import plotly.graph_objects as go
9
+ from collections import Counter
10
+ import os
11
+ import glob
12
+
13
+
14
+ class UnifiedVectorModel:
15
+ def __init__(self, backend_model, model_type="w2v"):
16
+ self.model = backend_model
17
+ self.model_type = model_type.lower()
18
+
19
+ if self.model_type == "w2v":
20
+ self.wv = backend_model.wv
21
+ self.key_to_index = self.wv.key_to_index
22
+ self.vector_size = self.wv.vector_size
23
+ self._words = set(self.wv.key_to_index.keys())
24
+
25
+ elif self.model_type == "ft":
26
+ # Для fasttext-wheel
27
+ self.key_to_index = {word: i for i, word in enumerate(backend_model.get_words())}
28
+ self.vector_size = backend_model.get_dimension()
29
+ self._words = set(self.key_to_index.keys())
30
+ else:
31
+ raise ValueError("model_type must be 'w2v' or 'ft'")
32
+
33
+ def __contains__(self, word):
34
+ return word in self._words
35
+
36
+ def __getitem__(self, word):
37
+ if self.model_type == "w2v":
38
+ return self.wv[word]
39
+ elif self.model_type == "ft":
40
+ return self.model.get_word_vector(word)
41
+
42
+ def most_similar(self, positive=None, negative=None, topn=10):
43
+ from sklearn.metrics.pairwise import cosine_similarity
44
+
45
+ if not positive:
46
+ positive = []
47
+ if not negative:
48
+ negative = []
49
+
50
+ try:
51
+ if self.model_type == "w2v":
52
+ return self.wv.most_similar(positive=positive, negative=negative, topn=topn)
53
+
54
+ elif self.model_type == "ft":
55
+ vec = np.zeros(self.vector_size)
56
+ for w in positive:
57
+ if w in self:
58
+ vec += self[w]
59
+ else:
60
+ continue
61
+ for w in negative:
62
+ if w in self:
63
+ vec -= self[w]
64
+ else:
65
+ continue
66
+
67
+ if np.allclose(vec, 0):
68
+ return []
69
+
70
+ words = list(self._words)
71
+ vectors = np.array([self[w] for w in words])
72
+
73
+ sims = cosine_similarity([vec], vectors)[0]
74
+ best = np.argsort(sims)[::-1][:topn + len(positive) + len(negative)]
75
+
76
+ result = []
77
+ for i in best:
78
+ word = words[i]
79
+ if word not in positive and word not in negative:
80
+ result.append((word, float(sims[i])))
81
+ if len(result) >= topn:
82
+ break
83
+ return result
84
+
85
+ except Exception as e:
86
+ print(f"Error in most_similar: {e}")
87
+ return []
88
+
89
+ def similar_by_vector(self, vector, topn=10):
90
+ from sklearn.metrics.pairwise import cosine_similarity
91
+
92
+ words = list(self._words)
93
+ vectors = np.array([self[w] for w in words])
94
+ sims = cosine_similarity([vector], vectors)[0]
95
+ best = np.argsort(sims)[::-1][:topn]
96
+
97
+ return [(words[i], float(sims[i])) for i in best]
98
+
99
+ def get_words(self):
100
+ return list(self._words)
101
+
102
+ @property
103
+ def vectors(self):
104
+ if not hasattr(self, '_cached_vectors'):
105
+ words = list(self._words)
106
+ self._cached_words = words
107
+ self._cached_vectors = np.array([self[w] for w in words])
108
+ return self._cached_vectors
109
+
110
+ @property
111
+ def index_to_key(self):
112
+ if not hasattr(self, '_index_to_key'):
113
+ self._index_to_key = list(self._words)
114
+ return self._index_to_key
115
+
116
+
117
@st.cache_resource
def load_model(model_path):
    """Load a word-vector model from disk, cached across Streamlit reruns.

    ``.model`` files are loaded as gensim Word2Vec, ``.bin`` as fasttext.
    Returns a UnifiedVectorModel, or None (after showing a Streamlit error)
    when loading fails.
    """
    try:
        if model_path.endswith(".model"):
            raw_model = Word2Vec.load(model_path)
            current_model = UnifiedVectorModel(raw_model, model_type="w2v")

        elif model_path.endswith(".bin"):
            raw_model = fasttext.load_model(model_path)
            current_model = UnifiedVectorModel(raw_model, model_type="ft")
        else:
            # NOTE(review): the sidebar globs *.vec files too, but there is no
            # loader for them yet — they end up here.
            raise ValueError(f"unsupported model format: {model_path}")
        return current_model
    except Exception as e:
        st.error(f"error loading model {model_path}: {e}")
        return None
133
+
134
+
135
# ---- Model discovery and sidebar selection -------------------------------
MODELS_DIR = "models"

if not os.path.exists(MODELS_DIR):
    st.error(f"Folder `{MODELS_DIR}` not found.")
    st.stop()

# Collect candidate model files for every supported extension.
model_files = [
    path
    for pattern in ("*.bin", "*.model", "*.vec")
    for path in glob.glob(os.path.join(MODELS_DIR, pattern))
    if os.path.isfile(path)
]
model_names = [os.path.basename(path) for path in model_files]

if not model_names:
    st.error(f"No models in folder `{MODELS_DIR}` (.bin, .model, .vec).")
    st.info("Supported formats: Word2Vec (binary/text), FastText.")
    st.stop()

selected_model_name = st.sidebar.selectbox(
    "Choose pretrained model",
    model_names
)

selected_model_path = os.path.join(MODELS_DIR, selected_model_name)

st.sidebar.info(f"loading: `{selected_model_name}`")

model = load_model(selected_model_path)

if model is None:
    st.stop()
else:
    st.sidebar.success(f"Model '{selected_model_name}' loaded")
    st.sidebar.write(f"Voc size: {len(model.key_to_index):,}")
    st.sidebar.write(f"Vector size: {model.vector_size}")
169
+
170
def analogy_accuracy(model, file_name):
    """Evaluate word analogies of the form ``a b c d`` (a - b + c ≈ d).

    Each valid line of `file_name` contributes one record. Returns
    ``(accuracy, results)`` where accuracy is the fraction of analogies whose
    target word appears among the top-10 predictions.
    """
    right = 0
    count = 0
    results = []
    with open(file_name, encoding='utf-8') as file:
        for line in file:
            words = line.strip().split()
            if len(words) != 4:
                continue  # malformed line
            try:
                most_similar = model.most_similar(positive=[words[0], words[2]], negative=[words[1]], topn=10)
            except KeyError:
                continue  # out-of-vocabulary query word
            if not most_similar:
                # Fix: an empty prediction list (the model returns [] on
                # internal errors) previously raised an uncaught IndexError
                # at predicted[0].
                continue
            predicted = [x[0] for x in most_similar]
            correct = words[3]
            if correct in predicted:
                rank = predicted.index(correct) + 1
                right += 1
            else:
                rank = None
            count += 1
            results.append({
                "query": f"{words[0]} - {words[1]} + {words[2]}",
                "target": correct,
                "predicted": predicted[0],
                "rank": rank,
                "in_top10": bool(rank)
            })
    accuracy = right / count if count > 0 else 0
    return accuracy, results
200
+
201
+
202
def avg_similarity(model, file_name):
    """Mean pairwise cosine similarity over word groups listed in `file_name`.

    Each line is a whitespace-separated group of words. Lines with fewer than
    two words, or containing any out-of-vocabulary word, are skipped.
    Returns 0 when no pair could be evaluated.
    """
    res = []
    with open(file_name, encoding='utf-8') as file:
        for line in file:
            words = line.strip().split()
            if len(words) < 2:
                # Fix: blank/single-word lines previously reached
                # cosine_similarity with <2 rows and crashed.
                continue
            try:
                vectors = [model[word] for word in words]
            except KeyError:
                continue
            # Pairwise cosine matrix in numpy (equivalent to sklearn's
            # cosine_similarity); zero-norm vectors guarded against 0/0.
            vecs = np.asarray(vectors, dtype=float)
            norms = np.linalg.norm(vecs, axis=1)
            norms = np.where(norms == 0, 1.0, norms)
            sims = (vecs @ vecs.T) / np.outer(norms, norms)
            for i in range(len(words) - 1):
                for j in range(i + 1, len(words)):
                    res.append(sims[i][j])
    return sum(res) / len(res) if res else 0
216
+
217
+
218
def projection(word_vec, axis):
    """Scalar projection of `word_vec` onto the direction of `axis`."""
    unit = axis / np.linalg.norm(axis)
    return np.dot(word_vec, unit)
221
+
222
+
223
def get_projection_row(model, axis):
    """Every vocabulary word paired with its projection onto `axis`, sorted ascending."""
    pairs = [(w, projection(model[w], axis)) for w in model.key_to_index.keys()]
    pairs.sort(key=lambda item: item[1])
    return pairs
228
+
229
+
230
st.title("Vector embeddings")

# Top-level navigation: one tab per analysis tool.
tab1, tab2, tab3, tab4, tab5 = st.tabs([
    "Vector arithmetic",  # fix: label was misspelled "Vector ariphmetics"
    "Semantic consistency",
    "Semantic axis",
    "Distribution analysis",
    "Report"
])
239
+
240
with tab1:
    # Tab 1: evaluate expressions like "king - man + woman" against the model.
    st.header("Vector arithmetic")  # fix: was misspelled "Vector ariphmetics"
    expr = st.text_input("Insert expression", value="рубль - россия + сша")

    if st.button("Compute"):
        # Tokenize "a - b + c" into positive / negative word lists.
        words = expr.replace('+', ' + ').replace('-', ' - ').split()
        positive, negative = [], []
        current = 'pos'

        for w in words:
            if w == '+':
                current = 'pos'
            elif w == '-':
                current = 'neg'
            else:
                (positive if current == 'pos' else negative).append(w)

        missing = [w for w in positive + negative if w not in model]
        if missing:
            st.warning(f"Words not found in voc: {', '.join(missing)}")
            st.stop()

        try:
            similar = model.most_similar(
                positive=positive,
                negative=negative,
                topn=10
            )

            st.write("### Result:")
            result_words = [f"{w} ({s:.3f})" for w, s in similar]
            st.write("Nearest words: " + ", ".join(result_words))

            st.write("### In-between steps")

            cum_vec = np.zeros(model.vector_size)

            steps_data = []

            for i in range(len(positive)):
                # Fix: accumulate the current word's vector. The original used
                # model[w], where `w` was a stale variable left over from the
                # tokenizing loop above (always the last token).
                cum_vec += model[positive[i]]
                nearest = model.most_similar(positive=positive[:i + 1], topn=1)
                steps_data.append({
                    "step": f"+ {positive[i]}",
                    "nearest word": nearest[0][0],
                    "similarity": nearest[0][1]
                })

            for i in range(len(negative)):
                # Fix: subtract the current negative word (was model[w], see above).
                cum_vec -= model[negative[i]]
                nearest = model.most_similar(positive=positive, negative=negative[:i + 1], topn=1)
                steps_data.append({
                    "step": f"- {negative[i]}",
                    "nearest word": nearest[0][0],
                    "similarity": nearest[0][1]
                })

            df_steps = pd.DataFrame(steps_data)
            st.dataframe(df_steps[["step", "nearest word", "similarity"]])

            # Scatter of the first two raw components — illustrative only.
            result_word = similar[0][0]
            fig = px.scatter(
                x=[cum_vec[0]], y=[cum_vec[1]],
                text=[result_word],
                title="Result (first 2 components)"
            )
            fig.update_traces(textposition='top center', marker=dict(size=12, color='red'))
            st.plotly_chart(fig)

        except Exception as e:
            st.error(f"Error computing: {e}")
311
+
312
with tab2:
    # Tab 2: cosine similarity of a word pair plus a small neighbor graph.
    st.header("Similarity calculator")
    col1, col2 = st.columns(2)
    with col1:
        word1 = st.text_input("word 1", value="мужчина")
    with col2:
        word2 = st.text_input("word 2", value="женщина")

    if st.button("Compute similarity"):
        try:
            v1, v2 = model[word1], model[word2]
            sim = cosine_similarity([v1], [v2])[0][0]
            st.metric("Cosine similarity", f"{sim:.4f}")

            st.write("### Nearest neighbors graph")
            # Fix: query each word's neighbors once and reuse the result —
            # the original called most_similar twice per word.
            neighbors1 = model.most_similar(word1, topn=5)
            neighbors2 = model.most_similar(word2, topn=5)
            neighbors = neighbors1 + neighbors2
            nodes = list(set([word1, word2] + [n[0] for n in neighbors]))
            edges = [(word1, n[0]) for n in neighbors1] + \
                    [(word2, n[0]) for n in neighbors2]

            G = go.Figure()
            # Random 2-D layout in [-1, 1]; node positions are decorative only.
            pos = np.random.rand(len(nodes), 2) * 2 - 1
            node_x = pos[:, 0]
            node_y = pos[:, 1]

            for edge in edges:
                x0, y0 = pos[nodes.index(edge[0])]
                x1, y1 = pos[nodes.index(edge[1])]
                G.add_trace(go.Scatter(x=[x0, x1], y=[y0, y1], mode='lines', line=dict(width=1, color='gray'), showlegend=False))

            G.add_trace(go.Scatter(x=node_x, y=node_y, mode='text+markers',
                                   marker=dict(size=10, color='lightblue'),
                                   text=nodes, textposition="top center"))
            G.update_layout(title="Semantic links graph", showlegend=False)
            st.plotly_chart(G)

        except KeyError as e:
            st.error(f"Word not found: {e}")
350
+
351
with tab3:
    # Tab 3: project the whole vocabulary onto a word-difference axis.
    st.header("Semantic axis projection")
    col1, col2 = st.columns(2)
    with col1:
        pos_axis = st.text_input("positive", value="мужчина")
    with col2:
        neg_axis = st.text_input("negative", value="женщина")

    if st.button("Build axis"):
        try:
            # The semantic axis is the difference of the two pole vectors.
            axis = model[pos_axis] - model[neg_axis]

            ranked = get_projection_row(model, axis)
            top_pos = ranked[-10:][::-1]  # largest projections, descending
            top_neg = ranked[:10]         # smallest projections, ascending

            st.write(f"Axis: **{pos_axis} – {neg_axis}**")
            st.write("### Top 10 positive:")
            st.write(", ".join([f"{w} ({p:.3f})" for w, p in top_pos]))

            st.write("### Top 10 negative:")
            st.write(", ".join([f"{w} ({p:.3f})" for w, p in top_neg]))

            df_proj = pd.DataFrame(top_pos + top_neg, columns=["word", "projection"])
            fig = px.bar(df_proj, x="projection", y="word", orientation='h', title=f"Projection on axis: {pos_axis}–{neg_axis}")
            st.plotly_chart(fig)

        except KeyError as e:
            st.error(f"Error: {e}")
382
+
383
with tab4:
    # Tab 4: histogram of pairwise cosine similarities over a random sample.
    st.header("Distance distribution analysis")
    all_vectors = model.vectors
    # Fix: cap the sample at the vocabulary size — np.random.choice with
    # replace=False raises when asked for more items than exist.
    sample_size = min(1000, all_vectors.shape[0])
    sample = all_vectors[np.random.choice(all_vectors.shape[0], sample_size, replace=False)]

    sims = cosine_similarity(sample)
    # Fix: keep the true off-diagonal entries. Zeroing the diagonal and then
    # filtering `> 0` also discarded genuinely negative and zero
    # similarities, biasing the histogram and statistics upward.
    off_diagonal = ~np.eye(sample_size, dtype=bool)
    flat_dists = sims[off_diagonal]

    fig = px.histogram(flat_dists, nbins=50, title="Cosine similarity distribution between random words")
    st.plotly_chart(fig)

    st.metric("Mean similarity", f"{np.mean(flat_dists):.3f}")
    st.metric("Std deviation", f"{np.std(flat_dists):.3f}")
398
+
399
with tab5:
    # Tab 5: batch evaluation report over files in data/.
    st.header("Report")

    st.subheader("1. Analogy rate")
    analogies_file = "data/analogy.txt"
    if os.path.exists(analogies_file):
        acc, results = analogy_accuracy(model, analogies_file)
        st.metric("Analogy accuracy (in top 10)", f"{acc:.2%}")
        st.dataframe(pd.DataFrame(results))
    else:
        # Fix: warnings now name the actual path that was checked (the
        # originals referenced `analogy.txt` / `similarity_words.txt`).
        st.warning(f"File `{analogies_file}` not found.")

    st.subheader("2. Average synonyms similarity")
    sim_file = "data/synonyms.txt"
    if os.path.exists(sim_file):
        avg_sim = avg_similarity(model, sim_file)
        st.metric("Average similarity", f"{avg_sim:.4f}")
    else:
        st.warning(f"File `{sim_file}` not found.")

    st.subheader("3. Average antonyms similarity")
    sim_file = "data/antonyms.txt"
    if os.path.exists(sim_file):
        avg_sim = avg_similarity(model, sim_file)
        st.metric("Average similarity", f"{avg_sim:.4f}")
    else:
        st.warning(f"File `{sim_file}` not found.")

    st.subheader("4. Heatmap for nearest words")
    query_words = st.text_input("Enter words", value="мужчина женщина мальчик девочка").split()
    if st.button("Build heatmap"):
        try:
            vectors = [model[w] for w in query_words]
            sims = cosine_similarity(vectors)
            fig = px.imshow(sims, x=query_words, y=query_words, color_continuous_scale="Blues", title="Similarity heatmap")
            st.plotly_chart(fig)
        except KeyError as e:
            st.error(f"Error: {e}")

    st.subheader("5. 2D projection")
    sample_words = st.text_input("Input words", value="мужчина женщина мальчик девочка")
    word_list = sample_words.split()
    if st.button("Show clusters"):
        if len(word_list) < 2:
            # Fix: t-SNE requires perplexity >= 1, i.e. at least two points;
            # a single word previously crashed with an uncaught error.
            st.warning("Enter at least two words.")
        else:
            try:
                from sklearn.manifold import TSNE
                vectors = np.array([model[w] for w in word_list])
                tsne = TSNE(n_components=2, perplexity=len(vectors) - 1, random_state=42)
                embedded = tsne.fit_transform(vectors)

                fig = px.scatter(x=embedded[:, 0], y=embedded[:, 1], text=word_list, title="words projection")
                fig.update_traces(textposition='top center')
                st.plotly_chart(fig)
            except KeyError as e:
                st.error(f"Word not found: {e}")