Spaces:
Runtime error
Runtime error
| import streamlit as st | |
| from transformers import BertTokenizer, BertModel | |
| import tensorflow_hub as hub | |
| import torch | |
| import tensorflow as tf | |
| import pandas as pd | |
| import numpy as np | |
| import io | |
# ---------------- Model options ----------------
# Display name -> checkpoint handle. The four BERT entries are model ids passed
# to `from_pretrained`; the ELMo entry is a TensorFlow Hub URL.
models = {
    "BERT Base Uncased": "bert-base-uncased",
    "BERT Base Cased": "bert-base-cased",
    "BERT Large Uncased": "bert-large-uncased",
    "BERT Large Cased": "bert-large-cased",
    "ELMo": "https://tfhub.dev/google/elmo/3",
}
# Display names in declaration order; these populate the model selector.
keys = list(models)
# ---------------- Streamlit UI ----------------
st.title("π Generate Sentence Embeddings (BERT + ELMo)")

# Model selector; the first entry of `keys` is preselected.
choice = st.selectbox("Choose Model:", options=keys, index=0)

# The markdown heading serves as the visible label for the text area.
# Streamlit warns when a widget label is empty, so a real label is supplied
# and hidden with `label_visibility="collapsed"` instead of passing "".
st.markdown("### Enter a sentence (or upload CSV below):")
text = st.text_area("Sentence", height=150, label_visibility="collapsed")

st.markdown("---")

# Optional bulk input: a CSV file; only the .csv extension is accepted.
file = st.file_uploader(
    "π Upload a CSV with a 'sentence' column for bulk analysis",
    type=["csv"],
)
# ---------------- Cached loaders ----------------
@st.cache_resource
def load_bert(model_name: str):
    """Load a BERT tokenizer and model, cached per ``model_name``.

    Streamlit re-executes the script on every widget interaction; without
    ``st.cache_resource`` the weights would be reloaded on each rerun.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple. The model is switched to eval mode
        before being returned.
    """
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)
    model.eval()
    return tokenizer, model
@st.cache_resource
def load_elmo():
    """Load the ELMo module as a non-trainable ``hub.KerasLayer``, cached.

    The handle is read from ``models["ELMo"]``. Caching with
    ``st.cache_resource`` keeps a single layer instance across Streamlit
    reruns instead of rebuilding it on every interaction.

    Returns:
        The ``hub.KerasLayer`` wrapping the ELMo module.
    """
    return hub.KerasLayer(models["ELMo"], trainable=False)
# ---------------- Load selected model ----------------
# Only the backend matching the current selection is loaded. The names bound
# here (`elmo`, or `tokenizer` and `model`) are the ones the embedding step
# reads. The two conditions are mutually exclusive, so their order is free.
if choice == "ELMo":
    elmo = load_elmo()
    st.write("β Loaded ELMo")
elif "BERT" in choice:
    tokenizer, model = load_bert(models[choice])
    st.write(f"β Loaded {choice}")
# ---------------- Analyze ----------------
# Sentences sent through a model per forward pass; bounds peak memory when a
# large CSV is uploaded.
_BATCH_SIZE = 32


def _collect_sentences(text, file):
    """Gather the sentences to embed together with their origins.

    Args:
        text: Raw contents of the text area.
        file: Uploaded CSV file object, or a falsy value when none was given.

    Returns:
        ``(sentences, sources)``: two lists of equal length. Each source is
        either ``"Single sentence"`` or ``"CSV"``.

    Shows an error and halts the script run via ``st.stop()`` when the CSV
    cannot be parsed or has no ``sentence`` column.
    """
    sentences = []
    sources = []

    stripped = text.strip()
    if stripped:
        sentences.append(stripped)
        sources.append("Single sentence")

    if file:
        try:
            df = pd.read_csv(file)
        except (
            pd.errors.EmptyDataError,
            pd.errors.ParserError,
            UnicodeDecodeError,
        ) as err:
            st.error(f"Could not read CSV: {err}")
            st.stop()
        if "sentence" not in df.columns:
            st.error("CSV must have a 'sentence' column")
            st.stop()
        csv_sentences = df["sentence"].dropna().astype(str).tolist()
        sentences.extend(csv_sentences)
        sources.extend(["CSV"] * len(csv_sentences))

    return sentences, sources


def _bert_embeddings(sentences, bert_tokenizer, bert_model, batch_size=_BATCH_SIZE):
    """Return one mean-pooled BERT vector per sentence as a 2-D NumPy array.

    Pooling is weighted by the tokenizer's attention mask so padding added to
    align sentences within a batch does not contribute to the average. For a
    lone sentence (no padding) this equals a plain mean over tokens.
    """
    pooled = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        inputs = bert_tokenizer(
            batch, return_tensors="pt", padding=True, truncation=True
        )
        with torch.no_grad():
            hidden = bert_model(**inputs).last_hidden_state
        # Add a trailing axis so the mask broadcasts over the hidden dimension.
        mask = inputs["attention_mask"].unsqueeze(-1).type_as(hidden)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against divide-by-zero
        pooled.append(((hidden * mask).sum(dim=1) / token_counts).numpy())
    return np.concatenate(pooled, axis=0)


def _elmo_embeddings(sentences, elmo_layer, batch_size=_BATCH_SIZE):
    """Return the ELMo layer's output for each sentence as a NumPy array.

    If the layer returns a dict, its ``"default"`` entry is used.
    """
    pooled = []
    for start in range(0, len(sentences), batch_size):
        batch = tf.convert_to_tensor(
            sentences[start:start + batch_size], dtype=tf.string
        )
        output = elmo_layer(batch)
        if isinstance(output, dict):
            output = output["default"]
        pooled.append(output.numpy())
    return np.concatenate(pooled, axis=0)


if st.button("π Generate Embeddings"):
    sentences, sources = _collect_sentences(text, file)

    if not sentences:
        # Covers both "nothing entered" and a CSV whose 'sentence' column is
        # empty; the models cannot be called on an empty batch.
        st.warning(
            "No sentences to embed: enter a sentence or upload a CSV "
            "with a non-empty 'sentence' column."
        )
    else:
        # --------- Generate embeddings ---------
        if "BERT" in choice:
            embeddings = _bert_embeddings(sentences, tokenizer, model)
            st.success(f"β Generated embeddings for {len(sentences)} sentence(s) with {choice}")
        else:
            # The selector only offers BERT variants and ELMo.
            embeddings = _elmo_embeddings(sentences, elmo)
            st.success(f"β Generated embeddings for {len(sentences)} sentence(s) with ELMo")

        # --------- Show summary ---------
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # summary line is stable across runs (a set is not).
        st.write(f"**Source:** {', '.join(dict.fromkeys(sources))}")
        st.write(f"**Model:** {choice}")
        st.write(f"**Embeddings shape:** {embeddings.shape}")

        # One frame backs both the preview and the download.
        df_emb = pd.DataFrame(embeddings)
        df_emb.insert(0, "sentence", sentences)
        df_emb.insert(1, "source", sources)

        # --------- Preview embeddings (first 5 rows) ---------
        st.markdown("### πΉ Preview of Embeddings (first 5 rows)")
        st.dataframe(df_emb.head())

        # --------- Prepare CSV for download (UTF-8) ---------
        # to_csv() without a path returns a str and ignores `encoding`, so
        # encode explicitly to hand the browser UTF-8 bytes.
        csv_data = df_emb.to_csv(index=False).encode("utf-8")
        st.download_button(
            label="πΎ Download Sentences + Embeddings",
            data=csv_data,
            file_name="sentences_embeddings.csv",
            mime="text/csv",
        )