# NOTE(review): Hugging Face Spaces page residue ("Spaces / Sleeping") removed —
# not part of the program.
# Mini Streamlit app to explore Word2Vec models (gensim KeyedVectors).
from __future__ import annotations

import os
import re
from typing import List, Optional, Tuple

import numpy as np
import pandas as pd
import streamlit as st

# Optional dependencies: the app degrades gracefully when one is missing —
# each feature reports its own error instead of crashing at import time.
try:
    from gensim.models import KeyedVectors, Word2Vec
except Exception:
    KeyedVectors = None
    Word2Vec = None
try:
    from sklearn.decomposition import PCA
except Exception:
    PCA = None
try:
    import plotly.express as px
except Exception:
    px = None

st.set_page_config(page_title="Word2Vec Playground", layout="wide")
# -----------------------------
# Helpers
# -----------------------------
| def _norm_spaces(s: str) -> str: | |
| return re.sub(r"\s+", " ", (s or "").strip()) | |
def parse_word_list(s: str) -> List[str]:
    """Parse "king, woman", "king woman", or one-token-per-line input into tokens.

    Splitting on ``[,\\s]+`` already covers commas, spaces, tabs and newlines,
    so no pre-normalization pass is needed.  Also tolerates ``None`` input
    (the original would raise AttributeError on ``None.replace``).

    Returns:
        List of non-empty tokens, in input order.
    """
    return [p for p in re.split(r"[,\s]+", (s or "").strip()) if p]
def in_vocab(model, word: str) -> bool:
    """Return True when *word* is in *model*'s vocabulary.

    Args:
        model: gensim KeyedVectors-like object exposing ``key_to_index``,
            or ``None`` when no model is loaded.
        word: token to look up; empty/falsy words are never in vocabulary.
    """
    if model is None or not word:
        return False
    return word in model.key_to_index
def safe_most_similar(model, positive: List[str], negative: List[str], topn: int = 10):
    """Call ``model.most_similar`` defensively.

    Args:
        model: gensim KeyedVectors-like object, or ``None`` when nothing loaded.
        positive: words contributing positively to the query.
        negative: words contributing negatively to the query.
        topn: number of neighbours to return.

    Returns:
        ``(results, errors)`` where ``results`` is a list of
        ``(word, cosine)`` tuples and ``errors`` a list of user-facing
        (French) messages; ``results`` is empty whenever ``errors`` is not.
    """
    if model is None:
        return [], ["Aucun modèle chargé."]
    # Report every out-of-vocabulary word at once instead of failing on the first.
    missing = [w for w in positive + negative if w and not in_vocab(model, w)]
    if missing:
        return [], [f"Mots hors vocabulaire : {', '.join(missing)}"]
    try:
        # gensim expects None (not []) when one side of the query is empty.
        res = model.most_similar(positive=positive or None, negative=negative or None, topn=topn)
        return res, []
    except Exception as e:  # surface the gensim error as a UI message
        return [], [f"Erreur: {e}"]
def safe_similarity(model, w1: str, w2: str) -> Tuple[Optional[float], List[str]]:
    """Compute cosine similarity between *w1* and *w2* defensively.

    Args:
        model: gensim KeyedVectors-like object, or ``None`` when nothing loaded.
        w1: first word.
        w2: second word.

    Returns:
        ``(similarity, errors)``: the similarity as ``float`` on success
        (``None`` otherwise), plus a list of user-facing (French) messages.
    """
    if model is None:
        return None, ["Aucun modèle chargé."]
    # Report both missing words in a single message.
    missing = [w for w in [w1, w2] if w and not in_vocab(model, w)]
    if missing:
        return None, [f"Mots hors vocabulaire : {', '.join(missing)}"]
    try:
        # gensim returns numpy float32; convert for clean formatting downstream.
        return float(model.similarity(w1, w2)), []
    except Exception as e:  # surface the gensim error as a UI message
        return None, [f"Erreur: {e}"]
def to_df_similar(items: List[Tuple[str, float]]) -> pd.DataFrame:
    """Convert gensim ``(word, cosine)`` pairs into a display DataFrame.

    An empty (or ``None``) input yields an empty frame that still carries the
    two expected columns, so downstream ``st.dataframe`` renders headers.
    """
    # pd.DataFrame([], columns=...) and pd.DataFrame(columns=...) are
    # equivalent, so a single construction covers both branches.
    return pd.DataFrame(items or [], columns=["mot", "similarité_cosine"])
def tokenizer(text) -> List[str]:
    """Tokenize *text*: drop ``^`` separators, lowercase, keep words of 2+ chars.

    Args:
        text: any cell value; NaN/None yields an empty token list.

    Returns:
        List of lowercase word tokens (``\\w`` runs of length >= 2).
    """
    if pd.isna(text):
        return []
    # '^' is used as a field separator in the source data; treat it as a space.
    s = str(text).replace("^", " ").lower()
    return re.findall(r"\b\w{2,}\b", s)
def load_uploaded_file(uploaded_file) -> Optional[pd.DataFrame]:
    """Load a Streamlit-uploaded CSV, Excel, or JSON file into a DataFrame.

    Args:
        uploaded_file: Streamlit ``UploadedFile`` (file-like with ``.name``),
            or ``None`` when nothing was uploaded.

    Returns:
        The parsed DataFrame, or ``None`` on missing input, unknown
        extension, or parse failure (best effort by design — the caller
        shows a user-facing error when the result is ``None``).
    """
    if uploaded_file is None:
        return None
    name = (uploaded_file.name or "").lower()
    try:
        if name.endswith(".csv"):
            return pd.read_csv(uploaded_file)
        if name.endswith((".xlsx", ".xls")):
            return pd.read_excel(uploaded_file)
        if name.endswith(".json"):
            return pd.read_json(uploaded_file)
    except Exception:
        # Deliberate best-effort: a malformed file is reported by the caller.
        pass
    return None
# -----------------------------
# Model loading
# -----------------------------
def load_local_kv(path: str, binary: bool):
    """Load word vectors from *path*.

    Args:
        path: model file path; ``.kv``/``.model`` files are treated as native
            gensim saves, anything else as word2vec text/binary format.
        binary: passed to ``load_word2vec_format`` for the non-native case.

    Returns:
        A gensim ``KeyedVectors`` instance.

    Raises:
        RuntimeError: when gensim is not installed.
    """
    if KeyedVectors is None:
        raise RuntimeError("gensim n'est pas disponible. Installez `gensim`.")
    # Heuristic: .kv/.model extensions mean a native gensim save.
    if path.lower().endswith((".kv", ".model")):
        return KeyedVectors.load(path)
    # Otherwise try the word2vec interchange format (text or binary).
    return KeyedVectors.load_word2vec_format(path, binary=binary)
| def _get_local_model_files() -> List[str]: | |
| """Return paths to .model files in the script's directory.""" | |
| root = os.path.dirname(os.path.abspath(__file__)) | |
| if not os.path.isdir(root): | |
| return [] | |
| return sorted( | |
| os.path.join(root, f) | |
| for f in os.listdir(root) | |
| if f.lower().endswith(".model") | |
| ) | |
# -----------------------------
# UI
# -----------------------------
st.title("Word2Vec Playground")
st.caption("Exploration de modèles d'embeddings word2vec (gensim KeyedVectors) : vecteur d'un mot, similarité, voisins, analogies.")

# Sidebar: pick one of the .model files found next to the script and cache
# the loaded model in session_state so reruns don't reload it.
with st.sidebar:
    st.header("1) Choisir un modèle")
    model_files = _get_local_model_files()
    if not model_files:
        st.warning("Aucun fichier .model trouvé dans le répertoire du script.")
        st.session_state["kv_model"] = None
        st.session_state["kv_label"] = None
    else:
        labels = [os.path.basename(p) for p in model_files]
        selected_label = st.radio("Modèle", labels, index=0)
        selected_path = model_files[labels.index(selected_label)]
        # Only (re)load when the selection actually changed.
        if st.session_state.get("kv_model_path") != selected_path:
            with st.spinner(f"Chargement de {selected_label}…"):
                try:
                    model = load_local_kv(selected_path, binary=False)
                    st.session_state["kv_model"] = model
                    st.session_state["kv_label"] = selected_label
                    st.session_state["kv_model_path"] = selected_path
                except Exception as e:
                    st.error(str(e))
                    st.session_state["kv_model"] = None
                    st.session_state["kv_label"] = None
                    st.session_state["kv_model_path"] = None

# Active model for the rest of the page (None when nothing is loaded).
model = st.session_state.get("kv_model")
model_label = st.session_state.get("kv_label")
# Header: quick facts about the loaded model (name, vocab size, dimension).
info_cols = st.columns([2, 2, 3])
with info_cols[0]:
    st.metric("Modèle", model_label or "—")
with info_cols[1]:
    if model is None:
        st.metric("Vocabulaire", "—")
    else:
        # French-style thousands separator: format with "," then swap for spaces.
        st.metric("Vocabulaire", f"{len(model.key_to_index):,}".replace(",", " "))
with info_cols[2]:
    if model is None:
        st.metric("Dimension", "—")
    else:
        st.metric("Dimension", f"{model.vector_size}")
st.divider()
# One tab per exploration feature.
tab1, tab2, tab3, tab4, tab5 = st.tabs([
    "Vecteur d'un mot",
    "Similarité (cosine)",
    "Mots les plus similaires",
    "Analogie sémantique",
    "Modèle word2vec local",
])
with tab1:
    # Show the raw embedding of a single word plus simple statistics.
    st.subheader("Obtenir le vecteur d'un mot")
    st.write("Donne le vecteur (embedding) associé à un mot, et quelques stats simples.")
    colA, colB = st.columns([2, 3])
    with colA:
        word = st.text_input("Mot", value="queen")
        show_n = st.slider("Afficher les n premières dimensions", 5, 50, 15)
    with colB:
        if model is None:
            st.info("Charger un modèle dans la barre latérale.")
        elif not in_vocab(model, word):
            st.warning("Mot hors vocabulaire pour ce modèle.")
        else:
            vec = model[word]
            st.write(f"Norme L2: `{np.linalg.norm(vec):.4f}` | min: `{vec.min():.4f}` | max: `{vec.max():.4f}`")
            # Only display the first show_n dimensions to keep the output readable.
            st.code(np.array2string(vec[:show_n], precision=4, separator=", "), language="text")
with tab2:
    # Cosine similarity between two words, with optional neighbour context.
    st.subheader("Mesurer la similarité entre 2 mots (cosine)")
    colA, colB, colC = st.columns([2, 2, 2])
    with colA:
        w1 = st.text_input("Mot 1", value="paris")
    with colB:
        w2 = st.text_input("Mot 2", value="book")
    with colC:
        topn = st.slider("Top-N (optionnel pour contexte)", 3, 20, 8)
    if model is None:
        st.info("Charge un modèle dans la barre latérale.")
    else:
        sim, errs = safe_similarity(model, w1, w2)
        if errs:
            for e in errs:
                st.warning(e)
        else:
            st.success(f"Similarité cosine({w1}, {w2}) = **{sim:.4f}**")
            # Bonus: show each word's nearest neighbours side by side.
            with st.expander("Voir les voisins de chaque mot"):
                a, _ = safe_most_similar(model, [w1], [], topn=topn)
                b, _ = safe_most_similar(model, [w2], [], topn=topn)
                c1, c2 = st.columns(2)
                with c1:
                    st.write(f"Voisins de **{w1}**")
                    st.dataframe(to_df_similar(a), use_container_width=True, hide_index=True)
                with c2:
                    st.write(f"Voisins de **{w2}**")
                    st.dataframe(to_df_similar(b), use_container_width=True, hide_index=True)
with tab3:
    # Nearest neighbours of a single query word.
    st.subheader("Trouver les mots les plus similaires à un mot donné")
    colA, colB = st.columns([2, 2])
    with colA:
        query = st.text_input("Mot cible", value="queen")
    with colB:
        topn = st.slider("Top-N", 3, 50, 10)
    if model is None:
        st.info("Charge un modèle dans la barre latérale.")
    else:
        res, errs = safe_most_similar(model, [query], [], topn=topn)
        if errs:
            for e in errs:
                st.warning(e)
        else:
            st.dataframe(to_df_similar(res), use_container_width=True, hide_index=True)
with tab4:
    # Word analogies: most_similar(positive=..., negative=...).
    st.subheader("Analogie sémantique (positive - negative)")
    st.write(
        "Exemple classique : `king - man + woman ≈ queen`.\n\n"
        "Saisir plusieurs mots (séparés par virgule ou espaces)."
    )
    colA, colB, colC = st.columns([2, 2, 1])
    with colA:
        positive_s = st.text_area("Positive (+)", value="king, woman", height=80)
    with colB:
        negative_s = st.text_area("Negative (-)", value="man", height=80)
    with colC:
        topn = st.slider("Top-N", 3, 20, 5)
    positive = parse_word_list(positive_s)
    negative = parse_word_list(negative_s)
    if model is None:
        st.info("Charge un modèle dans la barre latérale.")
    else:
        res, errs = safe_most_similar(model, positive, negative, topn=topn)
        if errs:
            for e in errs:
                st.warning(e)
        else:
            st.dataframe(to_df_similar(res), use_container_width=True, hide_index=True)
with tab5:
    # Train a local Word2Vec model on uploaded tabular text and plot a 2D PCA.
    st.subheader("Modèle Word2Vec local depuis CSV / Excel / JSON")
    st.write(
        "Uploader un fichier, sélectionner les colonnes à encoder, "
        "entraîner un Word2Vec et visualiser les embeddings (PCA)."
    )
    uploaded = st.file_uploader(
        "Fichier CSV, Excel ou JSON",
        type=["csv", "xlsx", "xls", "json"],
        key="w2v_local_upload",
    )
    df_upload = load_uploaded_file(uploaded)
    if uploaded is not None and df_upload is None:
        st.error("Impossible de charger le fichier. Vérifie le format (CSV, Excel, JSON).")
    elif df_upload is not None and not df_upload.empty:
        st.dataframe(df_upload.head(100), use_container_width=True, hide_index=True)
        st.caption(f"Aperçu des {min(100, len(df_upload))} premières lignes sur {len(df_upload)}.")
        # Offer text-like columns by default; fall back to all columns.
        text_cols = [c for c in df_upload.columns if df_upload[c].dtype == "object" or df_upload[c].dtype.name == "string"]
        if not text_cols:
            text_cols = list(df_upload.columns)
        selected_cols = st.multiselect(
            "Colonnes à encoder (concaténées)",
            options=text_cols,
            default=text_cols[:1] if text_cols else [],
            key="w2v_local_cols",
        )
        # Word2Vec hyperparameters.
        col_a, col_b = st.columns(2)
        with col_a:
            vector_size = st.slider("Dimension des vecteurs", 50, 300, 100)
            min_count = st.slider("min_count", 1, 10, 2)
        with col_b:
            window = st.slider("window", 2, 15, 5)
            sg = st.radio("Algorithme", ["CBOW (sg=0)", "Skip-gram (sg=1)"], index=1, horizontal=True)
        if selected_cols and st.button("Entraîner Word2Vec et visualiser PCA", type="primary", key="w2v_train_btn"):
            # One "sentence" per row: concatenate selected columns, then tokenize.
            concat = df_upload[selected_cols].fillna("").astype(str).agg(" ".join, axis=1)
            tokenized = concat.apply(tokenizer)
            tokenized = [t for t in tokenized if t]
            if len(tokenized) < 2:
                st.warning("Pas assez de texte tokenisé. Vérifie les colonnes sélectionnées.")
            elif Word2Vec is None:
                st.error("gensim Word2Vec non disponible.")
            elif PCA is None:
                st.error("scikit-learn PCA non disponible.")
            elif px is None:
                st.error("plotly non disponible.")
            else:
                with st.spinner("Entraînement Word2Vec…"):
                    w2v = Word2Vec(
                        sentences=tokenized,
                        vector_size=vector_size,
                        window=window,
                        min_count=min_count,
                        sg=1 if "Skip-gram" in sg else 0,
                        workers=4,
                    )
                words = list(w2v.wv.index_to_key)
                if len(words) < 2:
                    st.warning("Vocabulaire trop petit après entraînement.")
                else:
                    vectors = np.array([w2v.wv[w] for w in words])
                    with st.spinner("Réduction PCA…"):
                        pca = PCA(n_components=2)
                        reduced = pca.fit_transform(vectors)
                    fig = px.scatter(
                        x=reduced[:, 0],
                        y=reduced[:, 1],
                        text=words,
                        title="Embeddings Word2Vec (PCA 2D)",
                    )
                    fig.update_traces(textposition="top center", mode="markers+text", textfont_size=10)
                    fig.update_layout(
                        xaxis_title="PC1",
                        yaxis_title="PC2",
                        height=600,
                        showlegend=False,
                    )
                    st.plotly_chart(fig, use_container_width=True)
                    st.caption(f"Vocabulaire : {len(words)} mots | variance expliquée : {pca.explained_variance_ratio_.sum():.1%}")
    elif df_upload is not None and df_upload.empty:
        st.warning("Le fichier est vide.")
    else:
        st.info("Uploader un fichier CSV, Excel ou JSON pour commencer.")
st.divider()
# Closing notes: business-oriented usage ideas.
with st.expander("Idées d'usage 'métier'"):
    st.markdown(
        """
- **Enrichir un vocabulaire** : donner un terme (ex. *biologie*, *archives*, *catalogage*) et explorer les termes voisins.
- **Détecter des variantes** : synonymes, quasi-synonymes, noms propres proches, formes dérivées.
- **Comparaison de modèles** : utiliser 2 modèles successivement et comparer les différences : l'espace dépend du corpus !
"""
    )
st.divider()
# Show a standalone gensim snippet the user can copy-paste elsewhere.
with st.expander("Code source"):
    code = '''
#!uv pip install -U gensim
import gensim.downloader as api
#======================================================
# Liste des modèles disponibles
print(list(api.info()['models'].keys()))
#['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']
#======================================================
# Charger un modèle
model = api.load("glove-wiki-gigaword-50")
# Obtenir le vecteur d'un mot
vec = model["queen"]
print(vec)
# Obtenir les voisins d'un mot
neighbors = model.most_similar("queen")
print(neighbors)
# Obtenir la similarité entre 2 mots
similarity = model.similarity("queen", "woman")
print(similarity)
# Obtenir les analogies sémantiques
analogies = model.most_similar(positive=["king", "woman"], negative=["man"])
print(analogies)
'''
    st.code(code, language="python")