theformatisvalid committed on
Commit
43137ec
·
verified ·
1 Parent(s): 0b9dd2a

Delete main.py

Browse files
Files changed (1) hide show
  1. main.py +0 -452
main.py DELETED
@@ -1,452 +0,0 @@
1
- import fasttext
2
- import streamlit as st
3
- import numpy as np
4
- import pandas as pd
5
- from gensim.models import Word2Vec
6
- from sklearn.metrics.pairwise import cosine_similarity
7
- import plotly.express as px
8
- import plotly.graph_objects as go
9
- from collections import Counter
10
- import os
11
- import glob
12
-
13
-
14
class UnifiedVectorModel:
    """Uniform wrapper over a gensim Word2Vec model ("w2v") or a
    fasttext-wheel model ("ft"), exposing one lookup/similarity API.

    The wrapped model is assumed static: the vocabulary and the full
    vector matrix are cached on first use and never invalidated.
    """

    def __init__(self, backend_model, model_type="w2v"):
        self.model = backend_model
        self.model_type = model_type.lower()

        if self.model_type == "w2v":
            self.wv = backend_model.wv
            self.key_to_index = self.wv.key_to_index
            self.vector_size = self.wv.vector_size
            self._words = set(self.wv.key_to_index.keys())
        elif self.model_type == "ft":
            # fasttext-wheel has no key_to_index; build one from get_words().
            self.key_to_index = {word: i for i, word in enumerate(backend_model.get_words())}
            self.vector_size = backend_model.get_dimension()
            self._words = set(self.key_to_index.keys())
        else:
            raise ValueError("model_type must be 'w2v' or 'ft'")

    def __contains__(self, word):
        return word in self._words

    def __getitem__(self, word):
        """Return the vector for `word`.

        w2v raises KeyError for OOV words; fasttext synthesizes a vector
        for any string from subwords.
        """
        if self.model_type == "w2v":
            return self.wv[word]
        elif self.model_type == "ft":
            return self.model.get_word_vector(word)

    def _materialized(self):
        # Cache (words, vectors) together so their row orders always match.
        # Fix: the old code cached `vectors` and `index_to_key` separately
        # and rebuilt the full matrix on every most_similar() call.
        if not hasattr(self, '_cached_vectors'):
            words = list(self._words)
            self._cached_words = words
            self._cached_vectors = np.array([self[w] for w in words])
        return self._cached_words, self._cached_vectors

    @staticmethod
    def _cosine_row(vec, matrix):
        # Cosine similarity of `vec` against every row of `matrix`,
        # implemented with numpy (removes the per-call sklearn import).
        vec = np.asarray(vec, dtype=float)
        norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(vec)
        norms[norms == 0] = 1e-12  # guard zero vectors against div-by-zero
        return matrix @ vec / norms

    def most_similar(self, positive=None, negative=None, topn=10):
        """Analogy query: nearest words to sum(positive) - sum(negative).

        Accepts a single word (str) or a list for either argument — a bare
        string used to be iterated character-by-character in the ft branch.
        Returns up to `topn` (word, similarity) pairs, query words excluded;
        returns [] on failure or an all-zero query vector.
        """
        if isinstance(positive, str):
            positive = [positive]
        if isinstance(negative, str):
            negative = [negative]
        positive = positive or []
        negative = negative or []

        try:
            if self.model_type == "w2v":
                return self.wv.most_similar(positive=positive, negative=negative, topn=topn)

            # ft branch: build the query vector manually, skipping OOV words.
            vec = np.zeros(self.vector_size)
            for w in positive:
                if w in self:
                    vec += self[w]
            for w in negative:
                if w in self:
                    vec -= self[w]

            if np.allclose(vec, 0):
                return []

            words, vectors = self._materialized()
            sims = self._cosine_row(vec, vectors)
            # Over-fetch so the query words themselves can be dropped.
            best = np.argsort(sims)[::-1][:topn + len(positive) + len(negative)]

            result = []
            for i in best:
                word = words[i]
                if word not in positive and word not in negative:
                    result.append((word, float(sims[i])))
                    if len(result) >= topn:
                        break
            return result

        except Exception as e:
            # Best-effort API: log and return no neighbors on failure.
            print(f"Error in most_similar: {e}")
            return []

    def similar_by_vector(self, vector, topn=10):
        """Return the `topn` (word, similarity) pairs nearest to `vector`."""
        words, vectors = self._materialized()
        sims = self._cosine_row(vector, vectors)
        best = np.argsort(sims)[::-1][:topn]
        return [(words[i], float(sims[i])) for i in best]

    def get_words(self):
        return list(self._words)

    @property
    def vectors(self):
        # Full vocabulary matrix; row i corresponds to index_to_key[i].
        return self._materialized()[1]

    @property
    def index_to_key(self):
        return self._materialized()[0]
115
-
116
-
117
@st.cache_resource
def load_model(model_path):
    """Load a vector model, dispatching on the file extension.

    ``.model`` -> gensim Word2Vec, ``.bin`` -> fasttext. Returns a
    UnifiedVectorModel, or None (with a streamlit error shown) on failure.
    """
    try:
        if model_path.endswith(".model"):
            raw_model = Word2Vec.load(model_path)
            current_model = UnifiedVectorModel(raw_model, model_type="w2v")
        elif model_path.endswith(".bin"):
            raw_model = fasttext.load_model(model_path)
            current_model = UnifiedVectorModel(raw_model, model_type="ft")
        else:
            # was an f-string with no placeholders (lint F541)
            raise ValueError("wrong path format")
        return current_model
    except Exception as e:
        # UI boundary: surface the failure instead of crashing the app.
        st.error(f"error loading model {model_path}: {e}")
        return None
133
-
134
-
135
# --- Model discovery and selection (sidebar) ---
MODELS_DIR = "models"

if not os.path.exists(MODELS_DIR):
    st.error(f"Folder `{MODELS_DIR}` not found.")
    st.stop()

# Fix: only glob extensions load_model() actually supports — "*.vec" was
# listed before but always failed with "wrong path format".
model_files = []
for ext in ["*.bin", "*.model"]:
    model_files.extend(glob.glob(os.path.join(MODELS_DIR, ext)))
model_files = [f for f in model_files if os.path.isfile(f)]
model_names = [os.path.basename(f) for f in model_files]

if not model_names:
    st.error(f"No models in folder `{MODELS_DIR}` (.bin, .model).")
    st.info("Supported formats: Word2Vec (.model), FastText (.bin).")
    st.stop()

selected_model_name = st.sidebar.selectbox(
    "Choose pretrained model",
    model_names
)

selected_model_path = os.path.join(MODELS_DIR, selected_model_name)

st.sidebar.info(f"loading: `{selected_model_name}`")

model = load_model(selected_model_path)

if model is None:
    st.stop()
else:
    st.sidebar.success(f"Model '{selected_model_name}' loaded")
    st.sidebar.write(f"Voc size: {len(model.key_to_index):,}")
    st.sidebar.write(f"Vector size: {model.vector_size}")
169
-
170
def analogy_accuracy(model, file_name):
    """Score word analogies read from a file of 'A B C D' lines.

    For each line the query is A - B + C and the expected answer is D;
    a line counts as correct when D appears among the top-10 neighbors.
    Lines that are malformed or contain out-of-vocabulary query words
    are skipped. Returns (accuracy, per-line result dicts).
    """
    hits = 0
    total = 0
    rows = []
    with open(file_name, encoding='utf-8') as handle:
        for raw in handle:
            parts = raw.strip().split()
            if len(parts) != 4:
                continue  # malformed line
            a, b, c, expected = parts
            try:
                neighbors = model.most_similar(positive=[a, c], negative=[b], topn=10)
            except KeyError:
                continue  # a query word is out of vocabulary
            predicted = [word for word, _ in neighbors]
            rank = predicted.index(expected) + 1 if expected in predicted else None
            if rank:
                hits += 1
            total += 1
            rows.append({
                "query": f"{a} - {b} + {c}",
                "target": expected,
                "predicted": predicted[0],
                "rank": rank,
                "in_top10": bool(rank)
            })
    accuracy = hits / total if total > 0 else 0
    return accuracy, rows
200
-
201
-
202
def avg_similarity(model, file_name):
    """Average pairwise cosine similarity of word groups.

    Each line of `file_name` is a whitespace-separated word group; lines
    containing any out-of-vocabulary word are skipped entirely. Returns
    the mean cosine similarity over all within-line word pairs, or 0
    when no pairs were collected.
    """
    pair_sims = []
    with open(file_name, encoding='utf-8') as file:
        for line in file:
            words = line.strip().split()
            # Fix: blank or single-word lines previously crashed the
            # pairwise-similarity call; they contribute no pairs anyway.
            if len(words) < 2:
                continue
            try:
                vectors = [model[word] for word in words]
            except KeyError:
                continue  # skip lines with unknown words
            # Cosine similarity via numpy (replaces the sklearn call, so
            # this helper depends only on numpy).
            mat = np.array(vectors, dtype=float)
            norms = np.linalg.norm(mat, axis=1)
            norms[norms == 0] = 1e-12  # avoid division by zero
            sims = (mat @ mat.T) / np.outer(norms, norms)
            for i in range(len(words) - 1):
                for j in range(i + 1, len(words)):
                    pair_sims.append(sims[i][j])
    return sum(pair_sims) / len(pair_sims) if pair_sims else 0
216
-
217
-
218
def projection(word_vec, axis):
    """Scalar projection of `word_vec` onto the direction of `axis`."""
    unit = axis / np.linalg.norm(axis)
    return np.dot(word_vec, unit)


def get_projection_row(model, axis):
    """Project every vocabulary word onto `axis`.

    Returns (word, projection) pairs sorted ascending, i.e. from the most
    negative to the most positive end of the axis.
    """
    pairs = [(word, projection(model[word], axis)) for word in model.key_to_index.keys()]
    pairs.sort(key=lambda item: item[1])
    return pairs
228
-
229
-
230
st.title("Vector embeddings")

# One tab per analysis mode.
tab1, tab2, tab3, tab4, tab5 = st.tabs([
    "Vector arithmetic",  # typo fix: was "ariphmetics"
    "Semantic consistency",
    "Semantic axis",
    "Distribution analysis",
    "Report"
])
239
-
240
with tab1:
    st.header("Vector arithmetic")  # typo fix: was "ariphmetics"
    expr = st.text_input("Insert expression", value="рубль - россия + сша")

    if st.button("Compute"):
        # Pad operators with spaces so "a-b" also tokenizes correctly.
        words = expr.replace('+', ' + ').replace('-', ' - ').split()
        positive, negative = [], []
        current = 'pos'

        for w in words:
            if w == '+':
                current = 'pos'
            elif w == '-':
                current = 'neg'
            else:
                (positive if current == 'pos' else negative).append(w)

        missing = [w for w in positive + negative if w not in model]
        if missing:
            st.warning(f"Words not found in voc: {', '.join(missing)}")
            st.stop()

        try:
            similar = model.most_similar(
                positive=positive,
                negative=negative,
                topn=10
            )

            st.write("### Result:")
            result_words = [f"{w} ({s:.3f})" for w, s in similar]
            st.write("Nearest words: " + ", ".join(result_words))

            st.write("### In-between steps")

            cum_vec = np.zeros(model.vector_size)

            steps_data = []

            for i in range(len(positive)):
                # Bug fix: this used to add model[w] — the stale last token
                # of the parsing loop — instead of the i-th positive word.
                cum_vec += model[positive[i]]
                nearest = model.most_similar(positive=positive[:i + 1], topn=1)
                steps_data.append({
                    "step": f"+ {positive[i]}",
                    "nearest word": nearest[0][0],
                    "similarity": nearest[0][1]
                })

            for i in range(len(negative)):
                # Bug fix: same stale-variable bug on the negative side.
                cum_vec -= model[negative[i]]
                nearest = model.most_similar(positive=positive, negative=negative[:i + 1], topn=1)
                steps_data.append({
                    "step": f"- {negative[i]}",
                    "nearest word": nearest[0][0],
                    "similarity": nearest[0][1]
                })

            df_steps = pd.DataFrame(steps_data)
            st.dataframe(df_steps[["step", "nearest word", "similarity"]])

            result_word = similar[0][0]
            # NOTE(review): plotting raw components 0 and 1 is only a rough
            # visualization, not a proper 2-D projection of the vector.
            fig = px.scatter(
                x=[cum_vec[0]], y=[cum_vec[1]],
                text=[result_word],
                title="Result (first 2 components)"
            )
            fig.update_traces(textposition='top center', marker=dict(size=12, color='red'))
            st.plotly_chart(fig)

        except Exception as e:
            st.error(f"Error computing: {e}")
311
-
312
with tab2:
    st.header("Similarity calculator")
    col1, col2 = st.columns(2)
    with col1:
        word1 = st.text_input("word 1", value="мужчина")
    with col2:
        word2 = st.text_input("word 2", value="женщина")

    if st.button("Compute similarity"):
        try:
            v1, v2 = model[word1], model[word2]
            sim = cosine_similarity([v1], [v2])[0][0]
            st.metric("Cosine similarity", f"{sim:.4f}")

            st.write("### Nearest neighbors graph")
            # Bug fix + hoist: pass each word as a list (a bare string is
            # iterated character-by-character by the fasttext backend) and
            # query each word's neighbors once instead of twice.
            neighbors1 = model.most_similar(positive=[word1], topn=5)
            neighbors2 = model.most_similar(positive=[word2], topn=5)
            neighbors = neighbors1 + neighbors2
            nodes = list(set([word1, word2] + [n[0] for n in neighbors]))
            edges = [(word1, n[0]) for n in neighbors1] + \
                    [(word2, n[0]) for n in neighbors2]

            G = go.Figure()
            # Random 2-D layout in [-1, 1]; positions carry no semantics.
            pos = np.random.rand(len(nodes), 2) * 2 - 1
            node_x = pos[:, 0]
            node_y = pos[:, 1]

            for edge in edges:
                x0, y0 = pos[nodes.index(edge[0])]
                x1, y1 = pos[nodes.index(edge[1])]
                G.add_trace(go.Scatter(x=[x0, x1], y=[y0, y1], mode='lines', line=dict(width=1, color='gray'), showlegend=False))

            G.add_trace(go.Scatter(x=node_x, y=node_y, mode='text+markers',
                                   marker=dict(size=10, color='lightblue'),
                                   text=nodes, textposition="top center"))
            G.update_layout(title="Semantic links graph", showlegend=False)
            st.plotly_chart(G)

        except KeyError as e:
            st.error(f"Word not found: {e}")
350
-
351
with tab3:
    st.header("Semantic axis projection")
    col1, col2 = st.columns(2)
    with col1:
        pos_axis = st.text_input("positive", value="мужчина")
    with col2:
        neg_axis = st.text_input("negative", value="женщина")

    if st.button("Build axis"):
        try:
            # The semantic axis is the difference of the two pole vectors.
            axis = model[pos_axis] - model[neg_axis]

            ranked = get_projection_row(model, axis)  # ascending order
            top_pos = ranked[-10:][::-1]  # most positive first
            top_neg = ranked[:10]

            st.write(f"Axis: **{pos_axis} – {neg_axis}**")
            st.write("### Top 10 positive:")
            st.write(", ".join(f"{w} ({p:.3f})" for w, p in top_pos))

            st.write("### Top 10 negative:")
            st.write(", ".join(f"{w} ({p:.3f})" for w, p in top_neg))

            df_proj = pd.DataFrame(top_pos + top_neg, columns=["word", "projection"])
            fig = px.bar(df_proj, x="projection", y="word", orientation='h', title=f"Projection on axis: {pos_axis}–{neg_axis}")
            st.plotly_chart(fig)

        except KeyError as e:
            st.error(f"Error: {e}")
382
-
383
with tab4:
    st.header("Distance distribution analysis")
    all_vectors = model.vectors
    # Bug fix: np.random.choice(n, 1000, replace=False) raises ValueError
    # when the vocabulary has fewer than 1000 words; clamp the sample size.
    sample_size = min(1000, all_vectors.shape[0])
    sample = all_vectors[np.random.choice(all_vectors.shape[0], sample_size, replace=False)]

    dists = cosine_similarity(sample)
    np.fill_diagonal(dists, 0)
    flat_dists = dists.flatten()
    # NOTE(review): this drops negative similarities along with the zeroed
    # diagonal entries — confirm whether that is intended.
    flat_dists = flat_dists[flat_dists > 0]

    fig = px.histogram(flat_dists, nbins=50, title="Cosine similarity distribution between random words")
    st.plotly_chart(fig)

    st.metric("Mean similarity", f"{np.mean(flat_dists):.3f}")
    st.metric("Std deviation", f"{np.std(flat_dists):.3f}")
398
-
399
with tab5:
    st.header("Report")

    st.subheader("1. Analogy rate")
    analogies_file = "data/analogy.txt"
    if os.path.exists(analogies_file):
        acc, results = analogy_accuracy(model, analogies_file)
        st.metric("Analogy accuracy (in top 10)", f"{acc:.2%}")
        st.dataframe(pd.DataFrame(results))
    else:
        # Fix: warning now names the actual path that was checked.
        st.warning(f"File `{analogies_file}` not found.")

    st.subheader("2. Average synonyms similarity")
    sim_file = "data/synonyms.txt"
    if os.path.exists(sim_file):
        avg_sim = avg_similarity(model, sim_file)
        st.metric("Average similarity", f"{avg_sim:.4f}")
    else:
        # Fix: was a stale reference to `similarity_words.txt`.
        st.warning(f"File `{sim_file}` not found.")

    st.subheader("3. Average antonyms similarity")
    sim_file = "data/antonyms.txt"
    if os.path.exists(sim_file):
        avg_sim = avg_similarity(model, sim_file)
        st.metric("Average similarity", f"{avg_sim:.4f}")
    else:
        # Fix: was a stale reference to `similarity_words.txt`.
        st.warning(f"File `{sim_file}` not found.")

    st.subheader("4. Heatmap for nearest words")
    query_words = st.text_input("Enter words", value="мужчина женщина мальчик девочка").split()
    if st.button("Build heatmap"):
        try:
            vectors = [model[w] for w in query_words]
            sims = cosine_similarity(vectors)
            fig = px.imshow(sims, x=query_words, y=query_words, color_continuous_scale="Blues", title="Similarity heatmap")
            st.plotly_chart(fig)
        except KeyError as e:
            st.error(f"Error: {e}")

    st.subheader("5. 2D projection")
    sample_words = st.text_input("Input words", value="мужчина женщина мальчик девочка")
    word_list = sample_words.split()
    if st.button("Show clusters"):
        if len(word_list) < 3:
            # Robustness: TSNE needs 1 <= perplexity < n_samples, so
            # perplexity=len-1 is invalid for fewer than 3 words.
            st.warning("Need at least 3 words for a 2D projection.")
        else:
            try:
                from sklearn.manifold import TSNE
                vectors = np.array([model[w] for w in word_list])
                tsne = TSNE(n_components=2, perplexity=len(vectors) - 1, random_state=42)
                embedded = tsne.fit_transform(vectors)

                fig = px.scatter(x=embedded[:, 0], y=embedded[:, 1], text=word_list, title="words projection")
                fig.update_traces(textposition='top center')
                st.plotly_chart(fig)
            except KeyError as e:
                st.error(f"Word not found: {e}")