# demo-word2vec — src/streamlit_app.py
# Mini Streamlit app to explore Word2Vec models (gensim KeyedVectors).
from __future__ import annotations
import os
import re
from typing import List, Optional, Tuple
import numpy as np
import pandas as pd
import streamlit as st

# Optional dependencies: each one is imported defensively so the app can
# still start and degrade gracefully (feature-specific error messages are
# shown later) when a package is missing from the environment.
try:
    from gensim.models import KeyedVectors, Word2Vec
except Exception:
    KeyedVectors = None
    Word2Vec = None
try:
    from sklearn.decomposition import PCA
except Exception:
    PCA = None
try:
    import plotly.express as px
except Exception:
    px = None

# Must be the first Streamlit call in the script.
st.set_page_config(page_title="Word2Vec Playground", layout="wide")
# -----------------------------
# Helpers
# -----------------------------
def _norm_spaces(s: str) -> str:
return re.sub(r"\s+", " ", (s or "").strip())
def parse_word_list(s: str) -> List[str]:
    """
    Parse "king, woman" or "king woman" or one-per-line into a list of tokens.
    """
    # Normalize newlines/tabs to spaces, then collapse whitespace
    # (inlined from _norm_spaces) so the split sees one clean line.
    flat = s.replace("\n", " ").replace("\t", " ")
    flat = re.sub(r"\s+", " ", (flat or "").strip())
    if not flat:
        return []
    # Commas and spaces are both accepted as token separators.
    return [tok for tok in re.split(r"[,\s]+", flat) if tok]
def in_vocab(model, word: str) -> bool:
    """Return True when *word* is a non-empty key of the model's vocabulary."""
    if model is None:
        return False
    if not word:
        return False
    return word in model.key_to_index
def safe_most_similar(model, positive: List[str], negative: List[str], topn: int = 10):
    """Run ``model.most_similar`` defensively.

    Returns a ``(results, errors)`` pair: *results* is a list of
    (word, cosine) tuples, *errors* a list of user-facing messages
    (French, matching the rest of the UI). Exactly one of the two is
    non-empty.
    """
    if model is None:
        return [], ["Aucun modèle chargé."]
    # Report every queried word absent from the vocabulary in one message.
    missing = [w for w in positive + negative if w and not in_vocab(model, w)]
    if missing:
        return [], [f"Mots hors vocabulaire : {', '.join(missing)}"]
    try:
        hits = model.most_similar(
            positive=positive or None,
            negative=negative or None,
            topn=topn,
        )
    except Exception as exc:
        return [], [f"Erreur: {exc}"]
    return hits, []
def safe_similarity(model, w1: str, w2: str) -> Tuple[Optional[float], List[str]]:
    """Cosine similarity between two words, or ``(None, [messages])`` on failure."""
    if model is None:
        return None, ["Aucun modèle chargé."]
    missing = [w for w in [w1, w2] if w and not in_vocab(model, w)]
    if missing:
        return None, [f"Mots hors vocabulaire : {', '.join(missing)}"]
    try:
        score = float(model.similarity(w1, w2))
    except Exception as exc:
        return None, [f"Erreur: {exc}"]
    return score, []
def to_df_similar(items: List[Tuple[str, float]]) -> pd.DataFrame:
    """Turn (word, score) pairs into a two-column DataFrame for display."""
    columns = ["mot", "similarité_cosine"]
    if items:
        return pd.DataFrame(items, columns=columns)
    # Empty result still gets the headers so the UI table renders cleanly.
    return pd.DataFrame(columns=columns)
def tokenizer(text) -> List[str]:
    """Remove ^, lowercase, strip punctuation, tokenize (words 2+ chars)."""
    # NaN/None cells from pandas become an empty token list.
    if pd.isna(text):
        return []
    cleaned = str(text).replace("^", " ").lower()
    # \w{2,} keeps only word characters and drops single-letter tokens.
    return re.findall(r"\b\w{2,}\b", cleaned)
def load_uploaded_file(uploaded_file) -> Optional[pd.DataFrame]:
    """Load CSV, Excel, or JSON file from Streamlit upload.

    Returns None for an absent upload, an unrecognized extension, or any
    parse failure — the caller is responsible for the user-facing error.
    """
    if uploaded_file is None:
        return None
    suffix = (uploaded_file.name or "").lower()
    try:
        if suffix.endswith(".csv"):
            return pd.read_csv(uploaded_file)
        if suffix.endswith((".xlsx", ".xls")):
            return pd.read_excel(uploaded_file)
        if suffix.endswith(".json"):
            return pd.read_json(uploaded_file)
    except Exception:
        # Deliberate best-effort: signal failure via None rather than crash.
        pass
    return None
# -----------------------------
# Model loading
# -----------------------------
@st.cache_resource(show_spinner=False)
def load_local_kv(path: str, binary: bool):
    """Load word vectors from *path*, cached across Streamlit reruns.

    ``.kv``/``.model`` extensions are treated as native gensim saves;
    anything else is read as the word2vec interchange format (text or
    binary, per the *binary* flag).

    Raises RuntimeError when gensim is not installed.
    """
    if KeyedVectors is None:
        raise RuntimeError("gensim n'est pas disponible. Installez `gensim`.")
    native_save = path.lower().endswith((".kv", ".model"))
    if native_save:
        return KeyedVectors.load(path)
    return KeyedVectors.load_word2vec_format(path, binary=binary)
def _get_local_model_files() -> List[str]:
"""Return paths to .model files in the script's directory."""
root = os.path.dirname(os.path.abspath(__file__))
if not os.path.isdir(root):
return []
return sorted(
os.path.join(root, f)
for f in os.listdir(root)
if f.lower().endswith(".model")
)
# -----------------------------
# UI
# -----------------------------
st.title("Word2Vec Playground")
st.caption("Exploration de modèles d'embeddings word2vec (gensim KeyedVectors) : vecteur d'un mot, similarité, voisins, analogies.")

# Sidebar: pick a local .model file; the loaded KeyedVectors object is kept
# in session_state so it survives Streamlit reruns.
with st.sidebar:
    st.header("1) Choisir un modèle")
    model_files = _get_local_model_files()
    if not model_files:
        st.warning("Aucun fichier .model trouvé dans le répertoire du script.")
        st.session_state["kv_model"] = None
        st.session_state["kv_label"] = None
    else:
        labels = [os.path.basename(p) for p in model_files]
        selected_label = st.radio("Modèle", labels, index=0)
        selected_path = model_files[labels.index(selected_label)]
        # Only reload when the selection actually changed since the last run.
        if st.session_state.get("kv_model_path") != selected_path:
            with st.spinner(f"Chargement de {selected_label}…"):
                try:
                    model = load_local_kv(selected_path, binary=False)
                    st.session_state["kv_model"] = model
                    st.session_state["kv_label"] = selected_label
                    st.session_state["kv_model_path"] = selected_path
                except Exception as e:
                    # Clearing kv_model_path makes the next rerun retry the load.
                    st.error(str(e))
                    st.session_state["kv_model"] = None
                    st.session_state["kv_label"] = None
                    st.session_state["kv_model_path"] = None

# Canonical handles used by every tab below.
model = st.session_state.get("kv_model")
model_label = st.session_state.get("kv_label")
# Header: summary metrics for the currently loaded model (— when none).
info_cols = st.columns([2, 2, 3])
with info_cols[0]:
    st.metric("Modèle", model_label or "—")
with info_cols[1]:
    if model is None:
        st.metric("Vocabulaire", "—")
    else:
        # Replace the "," thousands separator with a space (French convention).
        st.metric("Vocabulaire", f"{len(model.key_to_index):,}".replace(",", " "))
with info_cols[2]:
    if model is None:
        st.metric("Dimension", "—")
    else:
        st.metric("Dimension", f"{model.vector_size}")
st.divider()
# Tabs for intro operations: one per word2vec exploration feature.
tab1, tab2, tab3, tab4, tab5 = st.tabs([
    "Vecteur d'un mot",
    "Similarité (cosine)",
    "Mots les plus similaires",
    "Analogie sémantique",
    "Modèle word2vec local",
])
# Tab 1: show a single word's embedding vector and basic statistics.
with tab1:
    st.subheader("Obtenir le vecteur d'un mot")
    st.write("Donne le vecteur (embedding) associé à un mot, et quelques stats simples.")
    colA, colB = st.columns([2, 3])
    with colA:
        word = st.text_input("Mot", value="queen")
        show_n = st.slider("Afficher les n premières dimensions", 5, 50, 15)
    with colB:
        if model is None:
            st.info("Charger un modèle dans la barre latérale.")
        else:
            if not in_vocab(model, word):
                st.warning("Mot hors vocabulaire pour ce modèle.")
            else:
                vec = model[word]
                st.write(f"Norme L2: `{np.linalg.norm(vec):.4f}` | min: `{vec.min():.4f}` | max: `{vec.max():.4f}`")
                # Only the first show_n dimensions are printed to keep it readable.
                st.code(np.array2string(vec[:show_n], precision=4, separator=", "), language="text")
# Tab 2: cosine similarity between two words, plus their neighbours for context.
with tab2:
    st.subheader("Mesurer la similarité entre 2 mots (cosine)")
    colA, colB, colC = st.columns([2, 2, 2])
    with colA:
        w1 = st.text_input("Mot 1", value="paris")
    with colB:
        w2 = st.text_input("Mot 2", value="book")
    with colC:
        topn = st.slider("Top-N (optionnel pour contexte)", 3, 20, 8)
    if model is None:
        st.info("Charge un modèle dans la barre latérale.")
    else:
        sim, errs = safe_similarity(model, w1, w2)
        if errs:
            for e in errs:
                st.warning(e)
        else:
            st.success(f"Similarité cosine({w1}, {w2}) = **{sim:.4f}**")
            # Small bonus: show each word's nearest neighbours side by side.
            with st.expander("Voir les voisins de chaque mot"):
                a, _ = safe_most_similar(model, [w1], [], topn=topn)
                b, _ = safe_most_similar(model, [w2], [], topn=topn)
                c1, c2 = st.columns(2)
                with c1:
                    st.write(f"Voisins de **{w1}**")
                    st.dataframe(to_df_similar(a), use_container_width=True, hide_index=True)
                with c2:
                    st.write(f"Voisins de **{w2}**")
                    st.dataframe(to_df_similar(b), use_container_width=True, hide_index=True)
# Tab 3: top-N nearest neighbours of a single query word.
with tab3:
    st.subheader("Trouver les mots les plus similaires à un mot donné")
    colA, colB = st.columns([2, 2])
    with colA:
        query = st.text_input("Mot cible", value="queen")
    with colB:
        topn = st.slider("Top-N", 3, 50, 10)
    if model is None:
        st.info("Charge un modèle dans la barre latérale.")
    else:
        res, errs = safe_most_similar(model, [query], [], topn=topn)
        if errs:
            for e in errs:
                st.warning(e)
        else:
            st.dataframe(to_df_similar(res), use_container_width=True, hide_index=True)
# Tab 4: vector analogies (positive terms added, negative terms subtracted).
with tab4:
    st.subheader("Analogie sémantique (positive - negative)")
    st.write(
        "Exemple classique : `king - man + woman ≈ queen`.\n\n"
        "Saisir plusieurs mots (séparés par virgule ou espaces)."
    )
    colA, colB, colC = st.columns([2, 2, 1])
    with colA:
        positive_s = st.text_area("Positive (+)", value="king, woman", height=80)
    with colB:
        negative_s = st.text_area("Negative (-)", value="man", height=80)
    with colC:
        topn = st.slider("Top-N", 3, 20, 5)
    # Free-text inputs are tokenized into word lists before querying.
    positive = parse_word_list(positive_s)
    negative = parse_word_list(negative_s)
    if model is None:
        st.info("Charge un modèle dans la barre latérale.")
    else:
        res, errs = safe_most_similar(model, positive, negative, topn=topn)
        if errs:
            for e in errs:
                st.warning(e)
        else:
            st.dataframe(to_df_similar(res), use_container_width=True, hide_index=True)
# Tab 5: upload a tabular file, train a small Word2Vec on selected text
# columns, and plot the resulting embeddings in 2D via PCA.
with tab5:
    st.subheader("Modèle Word2Vec local depuis CSV / Excel / JSON")
    st.write(
        "Uploader un fichier, sélectionner les colonnes à encoder, "
        "entraîner un Word2Vec et visualiser les embeddings (PCA)."
    )
    uploaded = st.file_uploader(
        "Fichier CSV, Excel ou JSON",
        type=["csv", "xlsx", "xls", "json"],
        key="w2v_local_upload",
    )
    df_upload = load_uploaded_file(uploaded)
    if uploaded is not None and df_upload is None:
        # A file was provided but could not be parsed.
        st.error("Impossible de charger le fichier. Vérifie le format (CSV, Excel, JSON).")
    elif df_upload is not None and not df_upload.empty:
        st.dataframe(df_upload.head(100), use_container_width=True, hide_index=True)
        st.caption(f"Aperçu des {min(100, len(df_upload))} premières lignes sur {len(df_upload)}.")
        # Offer string-typed columns by default; fall back to all columns.
        text_cols = [c for c in df_upload.columns if df_upload[c].dtype == "object" or df_upload[c].dtype.name == "string"]
        if not text_cols:
            text_cols = list(df_upload.columns)
        selected_cols = st.multiselect(
            "Colonnes à encoder (concaténées)",
            options=text_cols,
            default=text_cols[:1] if text_cols else [],
            key="w2v_local_cols",
        )
        # Word2Vec hyperparameters exposed in the UI.
        col_a, col_b = st.columns(2)
        with col_a:
            vector_size = st.slider("Dimension des vecteurs", 50, 300, 100)
            min_count = st.slider("min_count", 1, 10, 2)
        with col_b:
            window = st.slider("window", 2, 15, 5)
            sg = st.radio("Algorithme", ["CBOW (sg=0)", "Skip-gram (sg=1)"], index=1, horizontal=True)
        if selected_cols and st.button("Entraîner Word2Vec et visualiser PCA", type="primary", key="w2v_train_btn"):
            # One "sentence" per row: selected columns concatenated, tokenized,
            # and empty token lists dropped.
            concat = df_upload[selected_cols].fillna("").astype(str).agg(" ".join, axis=1)
            tokenized = concat.apply(tokenizer)
            tokenized = [t for t in tokenized if t]
            if len(tokenized) < 2:
                st.warning("Pas assez de texte tokenisé. Vérifie les colonnes sélectionnées.")
            elif Word2Vec is None:
                st.error("gensim Word2Vec non disponible.")
            elif PCA is None:
                st.error("scikit-learn PCA non disponible.")
            elif px is None:
                st.error("plotly non disponible.")
            else:
                with st.spinner("Entraînement Word2Vec…"):
                    w2v = Word2Vec(
                        sentences=tokenized,
                        vector_size=vector_size,
                        window=window,
                        min_count=min_count,
                        sg=1 if "Skip-gram" in sg else 0,
                        workers=4,
                    )
                words = list(w2v.wv.index_to_key)
                if len(words) < 2:
                    # min_count can prune the vocabulary below a plottable size.
                    st.warning("Vocabulaire trop petit après entraînement.")
                else:
                    vectors = np.array([w2v.wv[w] for w in words])
                    with st.spinner("Réduction PCA…"):
                        pca = PCA(n_components=2)
                        reduced = pca.fit_transform(vectors)
                    fig = px.scatter(
                        x=reduced[:, 0],
                        y=reduced[:, 1],
                        text=words,
                        title="Embeddings Word2Vec (PCA 2D)",
                    )
                    fig.update_traces(textposition="top center", mode="markers+text", textfont_size=10)
                    fig.update_layout(
                        xaxis_title="PC1",
                        yaxis_title="PC2",
                        height=600,
                        showlegend=False,
                    )
                    st.plotly_chart(fig, use_container_width=True)
                    st.caption(f"Vocabulaire : {len(words)} mots | variance expliquée : {pca.explained_variance_ratio_.sum():.1%}")
    elif df_upload is not None and df_upload.empty:
        st.warning("Le fichier est vide.")
    else:
        st.info("Uploader un fichier CSV, Excel ou JSON pour commencer.")
st.divider()
# Footer: business-oriented usage ideas (markdown content is user-facing
# French text and is rendered verbatim).
with st.expander("Idées d'usage 'métier'"):
    st.markdown(
        """
- **Enrichir un vocabulaire** : donner un terme (ex. *biologie*, *archives*, *catalogage*) et explorer les termes voisins.
- **Détecter des variantes** : synonymes, quasi-synonymes, noms propres proches, formes dérivées.
- **Comparaison de modèles** : utiliser 2 modèles successivement et comparer les différences : l'espace dépend du corpus !
"""
    )
st.divider()
# Footer: a standalone gensim snippet shown to the user (the string content
# is displayed verbatim with st.code, so it is kept untouched).
with st.expander("Code source"):
    code = '''
#!uv pip install -U gensim
import gensim.downloader as api
#======================================================
# Liste des modèles disponibles
print(list(api.info()['models'].keys()))
#['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']
#======================================================
# Charger un modèle
model = api.load("glove-wiki-gigaword-50")
# Obtenir le vecteur d'un mot
vec = model["queen"]
print(vec)
# Obtenir les voisins d'un mot
neighbors = model.most_similar("queen")
print(neighbors)
# Obtenir la similarité entre 2 mots
similarity = model.similarity("queen", "woman")
print(similarity)
# Obtenir les analogies sémantiques
analogies = model.most_similar(positive=["king", "woman"], negative=["man"])
print(analogies)
'''
    st.code(code, language="python")