import json
from typing import Optional, Tuple, Dict, Any, List

import numpy as np
import pandas as pd
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModel, pipeline

# Optional dependencies: the names are set to None when the package is
# missing so the code that needs them can test for availability.
try:
    from sklearn.decomposition import PCA
except ImportError:
    PCA = None
try:
    import plotly.express as px
except ImportError:
    px = None

# Configure the browser tab title and the wide layout, then render the
# page heading. These are the first Streamlit calls in the script.
st.set_page_config(page_title="BERT – Tokenizer & Embeddings Demo", layout="wide")

st.title("BERT – Architecture, Tokenizer, ID↔Token, Fill-Mask, Embeddings, PCA Map")


def _device() -> torch.device:
    """Return the CUDA device when one is available, otherwise the CPU."""
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")


def count_params(model: torch.nn.Module) -> Tuple[int, int]:
    """Count the parameters of *model*.

    Args:
        model: Module whose ``parameters()`` are counted.

    Returns:
        ``(total, trainable)`` where ``total`` is the number of elements over
        all parameters and ``trainable`` is the subset whose
        ``requires_grad`` flag is set.
    """
    total = 0
    trainable = 0
    # Single pass so each parameter tensor is visited only once.
    for param in model.parameters():
        size = param.numel()
        total += size
        if param.requires_grad:
            trainable += size
    return total, trainable


def safe_json(obj: Any) -> str:
    """Render *obj* as pretty-printed JSON for display.

    Values that JSON cannot encode are stringified via ``default=str``.
    If serialization still fails (for example non-string dictionary keys
    or a circular reference), ``str(obj)`` is returned instead.

    Args:
        obj: Any object to render.

    Returns:
        A 2-space indented JSON string with non-ASCII characters kept
        as-is, or ``str(obj)`` as a fallback.
    """
    try:
        return json.dumps(obj, indent=2, ensure_ascii=False, default=str)
    except Exception:
        # Deliberate best effort: this helper only feeds a display widget,
        # so any serialization failure degrades to the plain repr text.
        return str(obj)


# Sidebar controls: model id, hidden-state output flag and the page size
# used by the vocabulary viewer.
st.sidebar.header("⚙️ Settings")
# Strip stray whitespace so a pasted model id with a trailing space or
# newline still resolves.
model_name = st.sidebar.text_input(
    "Hugging Face model name", value="google-bert/bert-base-uncased"
).strip()
use_hidden_states = st.sidebar.checkbox("output_hidden_states", value=False)
max_vocab_rows = st.sidebar.slider("Rows per page (vocab viewer)", 50, 2000, 500, step=50)

device = _device()
st.sidebar.write("Device:", str(device))

# An empty model id cannot be loaded; stop this run with a clear message
# instead of letting the loader fail further down.
if not model_name:
    st.sidebar.error("Please enter a Hugging Face model name.")
    st.stop()


@st.cache_resource(show_spinner=False)
def load_tokenizer_and_model(model_name: str, output_hidden_states: bool):
    """Load the tokenizer and base model for *model_name*.

    The result is cached with ``st.cache_resource``, keyed on both
    arguments.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        output_hidden_states: Forwarded to ``AutoModel.from_pretrained``.

    Returns:
        ``(tokenizer, model)``; the model has been switched to eval mode.
    """
    tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    mdl = AutoModel.from_pretrained(model_name, output_hidden_states=output_hidden_states)
    mdl.eval()
    return tok, mdl


@st.cache_resource(show_spinner=False)
def load_fill_mask(model_name: str):
    """Build the ``fill-mask`` pipeline for *model_name*.

    The result is cached with ``st.cache_resource``, keyed on the model
    name.

    Args:
        model_name: Model identifier passed to ``pipeline``.

    Returns:
        The object returned by ``pipeline("fill-mask", model=model_name)``.
    """
    return pipeline("fill-mask", model=model_name)


# Load (or fetch from cache) the tokenizer and model, then move the model
# to the selected device.
with st.spinner("Loading tokenizer + model…"):
    try:
        tokenizer, model = load_tokenizer_and_model(model_name, use_hidden_states)
    except (OSError, ValueError) as e:
        # Show a readable message for an unknown or unsupported model id
        # instead of a raw traceback, and end this script run.
        st.error(f"Could not load model '{model_name}': {e}")
        st.stop()
    model = model.to(device)


# Labels of the sections the user can switch between.
TAB_LABELS = [
    "Architecture",
    "Tokenizer vocab",
    "ID ↔ Token",
    "Fill-mask",
    "Embeddings output",
    "Embeddings map",
]

# Fall back to the first section when nothing is stored yet, or when the
# stored label is not one of TAB_LABELS (TAB_LABELS.index would otherwise
# raise ValueError).
if st.session_state.get("active_tab") not in TAB_LABELS:
    st.session_state["active_tab"] = TAB_LABELS[0]

active_tab = st.radio(
    "Section",
    TAB_LABELS,
    index=TAB_LABELS.index(st.session_state["active_tab"]),
    horizontal=True,
    key="main_tab_selector",
)
st.session_state["active_tab"] = active_tab


# "Architecture" section: general information, config and module tree.
if active_tab == "Architecture":
    col1, col2 = st.columns([1, 1])

    with col1:
        st.subheader("Infos générales")
        total, trainable = count_params(model)
        # Fetch the first parameter once; None when the model has no
        # parameters, so dtype/device are reported as "n/a" instead of
        # raising StopIteration.
        first_param = next(model.parameters(), None)
        st.write(
            {
                "model_id": model_name,
                "model_class": model.__class__.__name__,
                "total_params": total,
                "trainable_params": trainable,
                "dtype": str(first_param.dtype) if first_param is not None else "n/a",
                "device": str(first_param.device) if first_param is not None else "n/a",
            }
        )

        st.subheader("model.eval()")
        st.write("✅ Le modèle est en mode évaluation (`eval()`).")

        st.subheader("config (model.config)")
        try:
            cfg = model.config.to_dict()
        except Exception:
            # Best effort: fall back to the raw attribute dict of the config.
            cfg = vars(model.config)
        st.code(safe_json(cfg), language="json")

    with col2:
        st.subheader("Architecture (str(model))")
        # Cap the printed module tree at 12000 characters.
        model_str = str(model)
        if len(model_str) > 12000:
            model_str = model_str[:12000] + "\n...\n[tronqué]"
        st.code(model_str)

        st.subheader("Couche d’input embeddings")
        try:
            emb_layer = model.get_input_embeddings()
            w = emb_layer.weight
            st.write(
                {
                    "embedding_weight_shape": list(w.shape),
                    "vocab_size (weight)": int(w.shape[0]),
                    "hidden_dim": int(w.shape[1]),
                }
            )
        except Exception as e:
            st.warning(f"Impossible d’accéder à get_input_embeddings(): {e}")


# "Tokenizer vocab" section: paged view of the vocabulary by token id.
if active_tab == "Tokenizer vocab":
    st.subheader("Tokenizer vocabulary")
    st.write({"len(tokenizer)": len(tokenizer), "model": model_name})

    total = len(tokenizer)
    if total == 0:
        st.warning("Tokenizer vocabulary appears empty.")
    else:
        max_start = max(total - max_vocab_rows, 0)
        if max_start == 0:
            # The whole vocabulary fits on one page; a slider whose minimum
            # equals its maximum is not usable, so skip it.
            start = 0
        else:
            # Keep the default on a multiple of the step so it is a value
            # the slider can actually take.
            default_start = (min(1000, max_start) // max_vocab_rows) * max_vocab_rows
            start = st.slider("Start ID", 0, max_start, default_start, step=max_vocab_rows)
        end = min(start + max_vocab_rows, total)

        ids = list(range(start, end))
        tokens = [tokenizer.decode(i) for i in ids]

        df = pd.DataFrame({"ID": ids, "token": tokens})
        st.dataframe(df, use_container_width=True, height=520)

    with st.expander("Special tokens"):
        st.write("special_tokens_map:", tokenizer.special_tokens_map)
        st.write("all_special_tokens:", getattr(tokenizer, "all_special_tokens", []))
        st.write("all_special_ids:", getattr(tokenizer, "all_special_ids", []))


# "ID ↔ Token" section: tokenize free text and convert single ids/tokens.
if active_tab == "ID ↔ Token":
    st.subheader("Convert text → ids/tokens and ids → text")

    text = st.text_area(
        "Text to tokenize",
        value="Sustainable thermal insulation biocomposites from rice husk",
        height=100,
    )

    enc = tokenizer(text, return_tensors="pt")
    ids = enc["input_ids"][0].tolist()
    toks = tokenizer.convert_ids_to_tokens(ids)
    decoded_list = tokenizer.decode(ids, skip_special_tokens=False)
    decoded_clean = tokenizer.decode(ids, skip_special_tokens=True)

    c1, c2 = st.columns(2)
    with c1:
        st.markdown("**input_ids**")
        st.code(ids)
        st.markdown("**tokens**")
        st.code(toks)
    with c2:
        st.markdown("**decode(ids) (keep specials)**")
        st.code(decoded_list)
        st.markdown("**decode(ids) (skip specials)**")
        st.code(decoded_clean)

    st.divider()
    st.subheader("Single conversions")
    cc1, cc2 = st.columns(2)

    with cc1:
        st.markdown("**ID → token**")
        # Largest valid id, clamped at 0 for an empty tokenizer.
        max_id = max(len(tokenizer) - 1, 0)
        id_in = st.number_input("ID", min_value=0, max_value=max_id, value=min(101, max_id))
        st.write({"id": int(id_in), "token": tokenizer.decode([int(id_in)])})

    with cc2:
        st.markdown("**token → ID**")
        tok_in = st.text_input("Token (as in vocab, e.g. 'insulation' or '##ing')", value="insulation")
        if tok_in:
            tok_id = tokenizer.convert_tokens_to_ids(tok_in)
            # Guard against a None result so int() does not raise TypeError;
            # None is shown as-is in that case.
            st.write({"token": tok_in, "id": None if tok_id is None else int(tok_id)})


# "Embeddings output" section: run one forward pass and show the
# per-token rows of last_hidden_state.
if active_tab == "Embeddings output":
    st.subheader("Model forward → last_hidden_state")

    text2 = st.text_area(
        "Text for embeddings",
        value="Sustainable thermal insulation biocomposites from rice husk",
        height=90,
    )

    # Truncate over-long input, as the "Embeddings map" section does, so a
    # long text does not exceed what the model accepts.
    inputs = tokenizer(text2, return_tensors="pt", truncation=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    last_hidden = getattr(outputs, "last_hidden_state", None)
    if last_hidden is None:
        st.warning("This model output has no last_hidden_state (unexpected for AutoModel). Try another model.")
    else:
        toks = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
        # First (only) sequence of the batch, moved to CPU for pandas.
        emb = last_hidden[0].detach().cpu().numpy()

        df = pd.DataFrame(
            emb,
            index=[f"{i} {t}" for i, t in enumerate(toks)],
            columns=[f"d{j}" for j in range(emb.shape[1])],
        )
        st.dataframe(df, use_container_width=True, height=520)


# "Embeddings map" section: embed several sentences and project the
# vectors to 2D with PCA for a scatter plot.
if active_tab == "Embeddings map":
    st.subheader("Multi-sentence embeddings → PCA map")
    st.write("Enter several sentences (one per line). Embeddings are computed and projected to 2D for visualization.")

    default_sentences = "Sustainable thermal insulation biocomposites.\nRice husk and natural fibers.\nEnergy-efficient building materials.\nRecycled plastic composites.\nWood fiber insulation."
    sentences_text = st.text_area("Sentences (one per line)", value=default_sentences, height=120, key="embed_map_sentences")

    level = st.radio("Embedding level", ["Token level", "Sentence level"], horizontal=True, key="embed_map_level")

    if st.button("Compute embeddings and plot", type="primary", key="embed_map_btn"):
        # One sentence per non-blank line, stripped once.
        lines = [s for s in (raw.strip() for raw in sentences_text.split("\n")) if s]
        if not lines:
            st.warning("Enter at least one sentence.")
        elif PCA is None:
            st.error("scikit-learn is required for PCA. Install it with `pip install scikit-learn`.")
        elif px is None:
            st.error("plotly is required. Install it with `pip install plotly`.")
        else:
            with st.spinner("Computing embeddings…"):
                all_embeddings: List[np.ndarray] = []
                all_labels: List[str] = []

                # Built once: common special-token spellings plus whatever
                # the loaded tokenizer itself declares as special.
                special = {"[CLS]", "[SEP]", "[PAD]", "<s>", "</s>", "<pad>", "<unk>"}
                special |= set(getattr(tokenizer, "all_special_tokens", []))

                for sent in lines:
                    inputs = tokenizer(sent, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
                    with torch.no_grad():
                        out = model(**inputs)

                    last_hidden = out.last_hidden_state[0].detach().cpu().numpy()
                    ids = inputs["input_ids"][0].tolist()
                    tokens = tokenizer.convert_ids_to_tokens(ids)

                    if level == "Token level":
                        # Sentence snippet appended to every token label.
                        snippet = f"{sent[:20]}…" if len(sent) > 20 else sent
                        for i, tok in enumerate(tokens):
                            # Skip special tokens and any bracketed token.
                            if tok in special or (tok.startswith("[") and tok.endswith("]")):
                                continue
                            all_embeddings.append(last_hidden[i])
                            all_labels.append(f"{tok}|{snippet}")
                    else:
                        # Mean-pool the attended positions, leaving out the
                        # first one and (when at least two remain) the last.
                        mask = (inputs["attention_mask"][0].cpu().numpy() == 1)
                        mask[0] = False
                        idx = np.where(mask)[0]
                        if len(idx) >= 2:
                            mask[idx[-1]] = False
                        pooled = last_hidden[mask].mean(axis=0) if mask.any() else last_hidden[0]
                        all_embeddings.append(pooled)
                        all_labels.append(sent[:80] + "…" if len(sent) > 80 else sent)

            if len(all_embeddings) < 2:
                st.warning("Not enough points to plot (need at least 2). Try more sentences or token-level mode.")
            else:
                X = np.array(all_embeddings)
                pca = PCA(n_components=2)
                reduced = pca.fit_transform(X)

                fig = px.scatter(
                    x=reduced[:, 0],
                    y=reduced[:, 1],
                    text=all_labels,
                    title="BERT embeddings (PCA 2D)",
                )
                fig.update_traces(textposition="top center", mode="markers+text", textfont_size=9)
                fig.update_layout(
                    xaxis_title="PC1",
                    yaxis_title="PC2",
                    height=600,
                    showlegend=False,
                )
                st.plotly_chart(fig, use_container_width=True)
                st.caption(f"Points: {len(all_labels)} | Variance explained: {pca.explained_variance_ratio_.sum():.1%}")


# "Fill-mask" section: run the fill-mask pipeline on a user prompt.
if active_tab == "Fill-mask":
    st.subheader("Masked language modeling (pipeline: fill-mask)")
    st.caption("For English BERT, use [MASK]. For RoBERTa-like models, mask token differs (e.g. <mask>).")

    with st.spinner("Loading fill-mask pipeline…"):
        fill_mask = load_fill_mask(model_name)

    mask_token = getattr(fill_mask.tokenizer, "mask_token", "[MASK]")
    st.write({"mask_token": mask_token})

    default_prompt = f"Peintre officiel de la marine et fondateur de la société {mask_token} des artistes français"
    prompt = st.text_area("Prompt with a mask token", value=default_prompt, height=90)

    top_k = st.slider("top_k", 1, 20, 5)

    if st.button("Run fill-mask"):
        try:
            results = fill_mask(prompt, top_k=top_k)
            # A prompt with one mask yields a list of dicts; a prompt with
            # several masks yields one such list per mask. Normalize to a
            # list of groups so both shapes can be tabulated.
            nested = bool(results) and isinstance(results[0], list)
            groups = results if nested else [results]

            rows = []
            for mask_idx, group in enumerate(groups):
                for r in group:
                    row = {
                        "sequence": r.get("sequence"),
                        "score": float(r.get("score", 0.0)),
                        "token_str": r.get("token_str"),
                    }
                    if nested:
                        # Identify which mask each candidate belongs to.
                        row = {"mask_index": mask_idx, **row}
                    rows.append(row)

            out_df = pd.DataFrame(rows)
            st.dataframe(out_df, use_container_width=True, height=300)
        except Exception as e:
            st.error(f"fill-mask failed: {e}")
            st.info("Tip: make sure your prompt uses the right mask token for the selected model.")