Daksh0505's picture
Update app.py
e10d8b0 verified
import streamlit as st
from transformers import BertTokenizer, BertModel
import tensorflow_hub as hub
import torch
import tensorflow as tf
import pandas as pd
import numpy as np
import io
# ---------------- Model options ----------------
# Display name -> identifier handed to the matching loader. The four BERT
# entries cover every size/casing pair, in the order they are listed in the UI.
models = {
    f"BERT {size} {casing}": f"bert-{size.lower()}-{casing.lower()}"
    for size in ("Base", "Large")
    for casing in ("Uncased", "Cased")
}
# ELMo is addressed by its TF Hub URL rather than a checkpoint name.
models["ELMo"] = "https://tfhub.dev/google/elmo/3"
# Display names in insertion order, used as the select-box options.
keys = [*models]
# ---------------- Streamlit UI ----------------
# Page layout: title, model picker, free-text input, then an optional CSV upload.
st.title("πŸ“ Generate Sentence Embeddings (BERT + ELMo)")
# The first entry of `keys` is preselected.
choice = st.selectbox("Choose Model:", options=keys, index=0)
# The markdown heading acts as the visible label for the text area below.
st.markdown("### Enter a sentence (or upload CSV below):")
# Streamlit's docs advise against empty widget labels (accessibility), so the
# widget gets a real label that is hidden instead of an empty string.
text = st.text_area("Sentence", height=150, label_visibility="collapsed")
st.markdown("---")
# `file` is falsy until the user uploads something.
file = st.file_uploader("πŸ“‚ Upload a CSV with a 'sentence' column for bulk analysis", type=["csv"])
# ---------------- Cached loaders ----------------
@st.cache_resource
def load_bert(model_name: str):
    """Load the BERT tokenizer and model for ``model_name``.

    The result is cached through ``st.cache_resource``.

    Args:
        model_name: Name passed to ``from_pretrained`` for both the tokenizer
            and the model.

    Returns:
        A ``(tokenizer, model)`` tuple; the model has been put in eval mode.
    """
    bert_tokenizer = BertTokenizer.from_pretrained(model_name)
    # ``eval()`` returns the module itself, so it can be chained onto loading.
    bert_model = BertModel.from_pretrained(model_name).eval()
    return bert_tokenizer, bert_model
@st.cache_resource
def load_elmo():
    """Build the ELMo layer from the TF Hub URL stored under ``models["ELMo"]``.

    The result is cached through ``st.cache_resource``.

    Returns:
        A ``hub.KerasLayer`` created with ``trainable=False``.
    """
    elmo_url = models["ELMo"]
    elmo_layer = hub.KerasLayer(elmo_url, trainable=False)
    return elmo_layer
# ---------------- Load selected model ----------------
# Only the backend picked in the select box is loaded. Both loaders are wrapped
# in st.cache_resource, and the two conditions below are mutually exclusive.
if choice == "ELMo":
    elmo = load_elmo()
    st.write("βœ… Loaded ELMo")
elif "BERT" in choice:
    bert_checkpoint = models[choice]
    tokenizer, model = load_bert(bert_checkpoint)
    st.write(f"βœ… Loaded {choice}")
# ---------------- Analyze ----------------
# Sentences are embedded in fixed-size batches so that a large CSV is not pushed
# through the model in one forward pass.
_EMBED_BATCH_SIZE = 32


def _masked_mean_pool(hidden_states, attention_mask):
    """Average token vectors per sentence, ignoring padding positions.

    A plain ``mean(dim=1)`` also averages the padding tokens added when
    sentences of different lengths share a batch, which skews the embedding of
    every sentence shorter than the longest one.

    Args:
        hidden_states: Token-level model output; dim 1 is reduced (the token
            axis, as in the original ``mean(dim=1)``).
        attention_mask: Tokenizer mask aligned with the first two dims of
            ``hidden_states`` (non-zero for real tokens, zero for padding).

    Returns:
        One pooled vector per sentence.
    """
    weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
    token_sum = (hidden_states * weights).sum(dim=1)
    # Guard against division by zero should a row ever contain no real tokens.
    token_count = weights.sum(dim=1).clamp(min=1.0)
    return token_sum / token_count


def _embed_with_bert(batch):
    """Return a NumPy array of padding-aware mean-pooled BERT embeddings."""
    encoded = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**encoded)
        pooled = _masked_mean_pool(outputs.last_hidden_state, encoded["attention_mask"])
    return pooled.numpy()


def _embed_with_elmo(batch):
    """Return a NumPy array of ELMo embeddings for a list of sentences."""
    emb_tensor = elmo(tf.convert_to_tensor(batch, dtype=tf.string))
    # The layer may hand back a dict of outputs; "default" is the one used here.
    if isinstance(emb_tensor, dict):
        emb_tensor = emb_tensor["default"]
    return emb_tensor.numpy()


if st.button("πŸ” Generate Embeddings"):
    sentences = []
    sources = []

    # Single sentence input
    typed_sentence = text.strip()
    if typed_sentence:
        sentences.append(typed_sentence)
        sources.append("Single sentence")

    # CSV input
    if file:
        try:
            df = pd.read_csv(file)
        except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as err:
            # Report unreadable uploads in the UI instead of a raw traceback.
            st.error(f"Could not read the uploaded CSV: {err}")
            st.stop()
        if "sentence" not in df.columns:
            st.error("CSV must have a 'sentence' column")
            st.stop()
        csv_sentences = df["sentence"].dropna().astype(str).tolist()
        sentences.extend(csv_sentences)
        sources.extend(["CSV"] * len(csv_sentences))

    # Nothing typed and nothing usable in the CSV: say so rather than silently
    # doing nothing or handing the model an empty batch.
    if not sentences:
        st.warning("Enter a sentence or upload a CSV with at least one 'sentence' value.")
        st.stop()

    # --------- Generate embeddings ---------
    # Every entry of the model picker is either a BERT variant or "ELMo".
    embed_batch = _embed_with_bert if "BERT" in choice else _embed_with_elmo
    embeddings = np.concatenate(
        [
            embed_batch(sentences[start:start + _EMBED_BATCH_SIZE])
            for start in range(0, len(sentences), _EMBED_BATCH_SIZE)
        ],
        axis=0,
    )
    st.success(f"βœ… Generated embeddings for {len(sentences)} sentence(s) with {choice}")

    # --------- Show summary ---------
    # dict.fromkeys de-duplicates while keeping first-seen order, so the source
    # list is displayed deterministically (a set has no defined order).
    st.write(f"**Source:** {', '.join(dict.fromkeys(sources))}")
    st.write(f"**Model:** {choice}")
    st.write(f"**Embeddings shape:** {embeddings.shape}")

    # One table serves both the on-screen preview and the download.
    df_emb = pd.DataFrame(embeddings)
    df_emb.insert(0, "sentence", sentences)
    df_emb.insert(1, "source", sources)

    # --------- Preview embeddings (first 5 rows) ---------
    st.markdown("### πŸ”Ή Preview of Embeddings (first 5 rows)")
    st.dataframe(df_emb.head())

    # --------- Prepare CSV for download (UTF-8) ---------
    # to_csv returns a str when no path is given, so encode it explicitly to
    # hand the download button UTF-8 bytes.
    csv_data = df_emb.to_csv(index=False).encode("utf-8")
    st.download_button(
        label="πŸ’Ύ Download Sentences + Embeddings",
        data=csv_data,
        file_name="sentences_embeddings.csv",
        mime="text/csv",
    )