# Mini Streamlit app to explore Word2Vec embeddings (gensim KeyedVectors).
from __future__ import annotations

import os
import re
from typing import List, Optional, Tuple

import numpy as np
import pandas as pd
import streamlit as st

# Optional third-party deps: keep the app importable and show a friendly
# error at use time instead of crashing at startup.
try:
    from gensim.models import KeyedVectors, Word2Vec
except Exception:
    KeyedVectors = None
    Word2Vec = None

try:
    from sklearn.decomposition import PCA
except Exception:
    PCA = None

try:
    import plotly.express as px
except Exception:
    px = None

st.set_page_config(page_title="Word2Vec Playground", layout="wide")


# -----------------------------
# Helpers
# -----------------------------
def _norm_spaces(s: str) -> str:
    """Collapse runs of whitespace to single spaces and strip the ends."""
    return re.sub(r"\s+", " ", (s or "").strip())


def parse_word_list(s: str) -> List[str]:
    """
    Parse "king, woman" or "king woman" or one-per-line into a list of tokens.
    """
    s = _norm_spaces(s.replace("\n", " ").replace("\t", " "))
    if not s:
        return []
    # split on comma OR spaces
    parts = re.split(r"[,\s]+", s)
    return [p for p in (p.strip() for p in parts) if p]


def in_vocab(model, word: str) -> bool:
    """Return True when *word* is a key of the loaded KeyedVectors model."""
    if model is None or not word:
        return False
    return word in model.key_to_index


def safe_most_similar(model, positive: List[str], negative: List[str], topn: int = 10):
    """
    Wrap model.most_similar, returning (results, errors) instead of raising.

    results is a list of (word, cosine similarity) pairs; errors is a list of
    user-facing (French) messages shown by the UI.
    """
    if model is None:
        return [], ["Aucun modèle chargé."]
    missing = [w for w in positive + negative if w and not in_vocab(model, w)]
    if missing:
        return [], [f"Mots hors vocabulaire : {', '.join(missing)}"]
    try:
        # `or None` so an empty list is passed as None (gensim's expected default)
        res = model.most_similar(positive=positive or None, negative=negative or None, topn=topn)
        return res, []
    except Exception as e:
        return [], [f"Erreur: {e}"]


def safe_similarity(model, w1: str, w2: str) -> Tuple[Optional[float], List[str]]:
    """Wrap model.similarity, returning (value, errors) instead of raising."""
    if model is None:
        return None, ["Aucun modèle chargé."]
    missing = [w for w in [w1, w2] if w and not in_vocab(model, w)]
    if missing:
        return None, [f"Mots hors vocabulaire : {', '.join(missing)}"]
    try:
        return float(model.similarity(w1, w2)), []
    except Exception as e:
        return None, [f"Erreur: {e}"]


def to_df_similar(items: List[Tuple[str, float]]) -> pd.DataFrame:
    """Turn (word, similarity) pairs into a DataFrame for st.dataframe display."""
    if not items:
        return pd.DataFrame(columns=["mot", "similarité_cosine"])
    return pd.DataFrame(items, columns=["mot", "similarité_cosine"])


def tokenizer(text) -> List[str]:
    """Remove ^, lowercase, strip punctuation, tokenize (words 2+ chars)."""
    if pd.isna(text):
        return []
    s = str(text).replace("^", " ").lower()
    return re.findall(r"\b\w{2,}\b", s)


def load_uploaded_file(uploaded_file) -> Optional[pd.DataFrame]:
    """Load CSV, Excel, or JSON file from Streamlit upload.

    Returns None on unknown extension or parse failure; the caller turns
    that into a user-facing error message (deliberate best-effort).
    """
    if uploaded_file is None:
        return None
    name = (uploaded_file.name or "").lower()
    try:
        if name.endswith(".csv"):
            return pd.read_csv(uploaded_file)
        if name.endswith((".xlsx", ".xls")):
            return pd.read_excel(uploaded_file)
        if name.endswith(".json"):
            return pd.read_json(uploaded_file)
    except Exception:
        pass
    return None


# -----------------------------
# Model loading
# -----------------------------
@st.cache_resource(show_spinner=False)
def load_local_kv(path: str, binary: bool):
    """Load word vectors from *path*, cached across Streamlit reruns.

    Heuristic: .kv/.model extensions use gensim's native pickle loader;
    anything else is treated as word2vec interchange format (txt or bin,
    per the *binary* flag).
    """
    if KeyedVectors is None:
        raise RuntimeError("gensim n'est pas disponible. Installez `gensim`.")
    lower = path.lower()
    if lower.endswith(".kv") or lower.endswith(".model"):
        obj = KeyedVectors.load(path)
        # FIX: a .model file may unpickle as a full Word2Vec model, which
        # lacks key_to_index (the rest of the app needs it). Expose its
        # KeyedVectors via .wv; plain KeyedVectors objects pass through.
        return obj.wv if hasattr(obj, "wv") else obj
    # Otherwise try the word2vec interchange format (txt or bin)
    return KeyedVectors.load_word2vec_format(path, binary=binary)


def _get_local_model_files() -> List[str]:
    """Return paths to .model files in the script's directory."""
    root = os.path.dirname(os.path.abspath(__file__))
    if not os.path.isdir(root):
        return []
    return sorted(
        os.path.join(root, f)
        for f in os.listdir(root)
        if f.lower().endswith(".model")
    )


# -----------------------------
# UI
# -----------------------------
st.title("Word2Vec Playground")
st.caption("Exploration de modèles d'embeddings word2vec (gensim KeyedVectors) : vecteur d'un mot, similarité, voisins, analogies.")

with st.sidebar:
    st.header("1) Choisir un modèle")
    model_files = _get_local_model_files()
    if not model_files:
        st.warning("Aucun fichier .model trouvé dans le répertoire du script.")
        st.session_state["kv_model"] = None
        st.session_state["kv_label"] = None
    else:
        labels = [os.path.basename(p) for p in model_files]
        selected_label = st.radio("Modèle", labels, index=0)
        selected_path = model_files[labels.index(selected_label)]
        # Only (re)load when the selection actually changed.
        if st.session_state.get("kv_model_path") != selected_path:
            with st.spinner(f"Chargement de {selected_label}…"):
                try:
                    model = load_local_kv(selected_path, binary=False)
                    st.session_state["kv_model"] = model
                    st.session_state["kv_label"] = selected_label
                    st.session_state["kv_model_path"] = selected_path
                except Exception as e:
                    st.error(str(e))
                    st.session_state["kv_model"] = None
                    st.session_state["kv_label"] = None
                    st.session_state["kv_model_path"] = None

model = st.session_state.get("kv_model")
model_label = st.session_state.get("kv_label")

# Header model info
info_cols = st.columns([2, 2, 3])
with info_cols[0]:
    st.metric("Modèle", model_label or "—")
with info_cols[1]:
    if model is None:
        st.metric("Vocabulaire", "—")
    else:
        # French-style thousands separator: "12 345" instead of "12,345"
        st.metric("Vocabulaire", f"{len(model.key_to_index):,}".replace(",", " "))
with info_cols[2]:
    if model is None:
        st.metric("Dimension", "—")
    else:
        st.metric("Dimension", f"{model.vector_size}")

st.divider()

# Tabs for intro operations
tab1, tab2, tab3, tab4, tab5 = st.tabs([
    "Vecteur d'un mot",
    "Similarité (cosine)",
    "Mots les plus similaires",
    "Analogie sémantique",
    "Modèle word2vec local",
])

with tab1:
    st.subheader("Obtenir le vecteur d'un mot")
    st.write("Donne le vecteur (embedding) associé à un mot, et quelques stats simples.")
    colA, colB = st.columns([2, 3])
    with colA:
        word = st.text_input("Mot", value="queen")
        show_n = st.slider("Afficher les n premières dimensions", 5, 50, 15)
    with colB:
        if model is None:
            st.info("Charger un modèle dans la barre latérale.")
        else:
            if not in_vocab(model, word):
                st.warning("Mot hors vocabulaire pour ce modèle.")
            else:
                vec = model[word]
                st.write(f"Norme L2: `{np.linalg.norm(vec):.4f}` | min: `{vec.min():.4f}` | max: `{vec.max():.4f}`")
                st.code(np.array2string(vec[:show_n], precision=4, separator=", "), language="text")

with tab2:
    st.subheader("Mesurer la similarité entre 2 mots (cosine)")
    colA, colB, colC = st.columns([2, 2, 2])
    with colA:
        w1 = st.text_input("Mot 1", value="paris")
    with colB:
        w2 = st.text_input("Mot 2", value="book")
    with colC:
        topn = st.slider("Top-N (optionnel pour contexte)", 3, 20, 8)
    if model is None:
        st.info("Charge un modèle dans la barre latérale.")
    else:
        sim, errs = safe_similarity(model, w1, w2)
        if errs:
            for e in errs:
                st.warning(e)
        else:
            st.success(f"Similarité cosine({w1}, {w2}) = **{sim:.4f}**")
            # small bonus: show each word's neighbours for context
            with st.expander("Voir les voisins de chaque mot"):
                a, _ = safe_most_similar(model, [w1], [], topn=topn)
                b, _ = safe_most_similar(model, [w2], [], topn=topn)
                c1, c2 = st.columns(2)
                with c1:
                    st.write(f"Voisins de **{w1}**")
                    st.dataframe(to_df_similar(a), use_container_width=True, hide_index=True)
                with c2:
                    st.write(f"Voisins de **{w2}**")
                    st.dataframe(to_df_similar(b), use_container_width=True, hide_index=True)

with tab3:
    st.subheader("Trouver les mots les plus similaires à un mot donné")
    colA, colB = st.columns([2, 2])
    with colA:
        query = st.text_input("Mot cible", value="queen")
    with colB:
        topn = st.slider("Top-N", 3, 50, 10)
    if model is None:
        st.info("Charge un modèle dans la barre latérale.")
    else:
        res, errs = safe_most_similar(model, [query], [], topn=topn)
        if errs:
            for e in errs:
                st.warning(e)
        else:
            st.dataframe(to_df_similar(res), use_container_width=True, hide_index=True)

with tab4:
    st.subheader("Analogie sémantique (positive - negative)")
    st.write(
        "Exemple classique : `king - man + woman ≈ queen`.\n\n"
        "Saisir plusieurs mots (séparés par virgule ou espaces)."
    )
    colA, colB, colC = st.columns([2, 2, 1])
    with colA:
        positive_s = st.text_area("Positive (+)", value="king, woman", height=80)
    with colB:
        negative_s = st.text_area("Negative (-)", value="man", height=80)
    with colC:
        topn = st.slider("Top-N", 3, 20, 5)
    positive = parse_word_list(positive_s)
    negative = parse_word_list(negative_s)
    if model is None:
        st.info("Charge un modèle dans la barre latérale.")
    else:
        res, errs = safe_most_similar(model, positive, negative, topn=topn)
        if errs:
            for e in errs:
                st.warning(e)
        else:
            st.dataframe(to_df_similar(res), use_container_width=True, hide_index=True)

with tab5:
    st.subheader("Modèle Word2Vec local depuis CSV / Excel / JSON")
    st.write(
        "Uploader un fichier, sélectionner les colonnes à encoder, "
        "entraîner un Word2Vec et visualiser les embeddings (PCA)."
    )
    uploaded = st.file_uploader(
        "Fichier CSV, Excel ou JSON",
        type=["csv", "xlsx", "xls", "json"],
        key="w2v_local_upload",
    )
    df_upload = load_uploaded_file(uploaded)
    if uploaded is not None and df_upload is None:
        st.error("Impossible de charger le fichier. Vérifie le format (CSV, Excel, JSON).")
    elif df_upload is not None and not df_upload.empty:
        st.dataframe(df_upload.head(100), use_container_width=True, hide_index=True)
        st.caption(f"Aperçu des {min(100, len(df_upload))} premières lignes sur {len(df_upload)}.")
        # Prefer text-like columns; fall back to everything if none found.
        text_cols = [c for c in df_upload.columns if df_upload[c].dtype == "object" or df_upload[c].dtype.name == "string"]
        if not text_cols:
            text_cols = list(df_upload.columns)
        selected_cols = st.multiselect(
            "Colonnes à encoder (concaténées)",
            options=text_cols,
            default=text_cols[:1] if text_cols else [],
            key="w2v_local_cols",
        )
        col_a, col_b = st.columns(2)
        with col_a:
            vector_size = st.slider("Dimension des vecteurs", 50, 300, 100)
            min_count = st.slider("min_count", 1, 10, 2)
        with col_b:
            window = st.slider("window", 2, 15, 5)
            sg = st.radio("Algorithme", ["CBOW (sg=0)", "Skip-gram (sg=1)"], index=1, horizontal=True)
        if selected_cols and st.button("Entraîner Word2Vec et visualiser PCA", type="primary", key="w2v_train_btn"):
            # One "sentence" per row: selected columns concatenated then tokenized.
            concat = df_upload[selected_cols].fillna("").astype(str).agg(" ".join, axis=1)
            tokenized = concat.apply(tokenizer)
            tokenized = [t for t in tokenized if t]
            if len(tokenized) < 2:
                st.warning("Pas assez de texte tokenisé. Vérifie les colonnes sélectionnées.")
            elif Word2Vec is None:
                st.error("gensim Word2Vec non disponible.")
            elif PCA is None:
                st.error("scikit-learn PCA non disponible.")
            elif px is None:
                st.error("plotly non disponible.")
            else:
                with st.spinner("Entraînement Word2Vec…"):
                    w2v = Word2Vec(
                        sentences=tokenized,
                        vector_size=vector_size,
                        window=window,
                        min_count=min_count,
                        sg=1 if "Skip-gram" in sg else 0,
                        workers=4,
                    )
                words = list(w2v.wv.index_to_key)
                if len(words) < 2:
                    st.warning("Vocabulaire trop petit après entraînement.")
                else:
                    vectors = np.array([w2v.wv[w] for w in words])
                    with st.spinner("Réduction PCA…"):
                        pca = PCA(n_components=2)
                        reduced = pca.fit_transform(vectors)
                    fig = px.scatter(
                        x=reduced[:, 0],
                        y=reduced[:, 1],
                        text=words,
                        title="Embeddings Word2Vec (PCA 2D)",
                    )
                    fig.update_traces(textposition="top center", mode="markers+text", textfont_size=10)
                    fig.update_layout(
                        xaxis_title="PC1",
                        yaxis_title="PC2",
                        height=600,
                        showlegend=False,
                    )
                    st.plotly_chart(fig, use_container_width=True)
                    st.caption(f"Vocabulaire : {len(words)} mots | variance expliquée : {pca.explained_variance_ratio_.sum():.1%}")
    elif df_upload is not None and df_upload.empty:
        st.warning("Le fichier est vide.")
    else:
        st.info("Uploader un fichier CSV, Excel ou JSON pour commencer.")

st.divider()
with st.expander("Idées d'usage 'métier'"):
    st.markdown(
        """
- **Enrichir un vocabulaire** : donner un terme (ex. *biologie*, *archives*, *catalogage*) et explorer les termes voisins.
- **Détecter des variantes** : synonymes, quasi-synonymes, noms propres proches, formes dérivées.
- **Comparaison de modèles** : utiliser 2 modèles successivement et comparer les différences : l'espace dépend du corpus !
"""
    )

st.divider()
with st.expander("Code source"):
    code = '''
#!uv pip install -U gensim
import gensim.downloader as api

#======================================================
# Liste des modèles disponibles
print(list(api.info()['models'].keys()))
#['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']

#======================================================
# Charger un modèle
model = api.load("glove-wiki-gigaword-50")

# Obtenir le vecteur d'un mot
vec = model["queen"]
print(vec)

# Obtenir les voisins d'un mot
neighbors = model.most_similar("queen")
print(neighbors)

# Obtenir la similarité entre 2 mots
similarity = model.similarity("queen", "woman")
print(similarity)

# Obtenir les analogies sémantiques
analogies = model.most_similar(positive=["king", "woman"], negative=["man"])
print(analogies)
'''
    st.code(code, language="python")