Spaces:

Daksh0505
/

bert-elmo-embeddings-generator

Runtime error

App Files Files Community

bert-elmo-embeddings-generator / app.py

Daksh0505

Update app.py

e10d8b0 verified 7 months ago

raw

history blame contribute delete

3.85 kB

	import streamlit as st
	from transformers import BertTokenizer, BertModel
	import tensorflow_hub as hub
	import torch
	import tensorflow as tf
	import pandas as pd
	import numpy as np
	import io

	# ---------------- Model options ----------------
	# Display name shown in the UI -> identifier handed to the matching loader.
	# The BERT entries are checkpoint names; the ELMo entry is a TF Hub URL.
	models = {
	    "BERT Base Uncased": "bert-base-uncased",
	    "BERT Base Cased": "bert-base-cased",
	    "BERT Large Uncased": "bert-large-uncased",
	    "BERT Large Cased": "bert-large-cased",
	    "ELMo": "https://tfhub.dev/google/elmo/3",
	}

	# Display names in declaration order (iterating a dict yields its keys).
	keys = list(models)

	# ---------------- Streamlit UI ----------------
	st.title("📝 Generate Sentence Embeddings (BERT + ELMo)")

	# Model picker; the first entry of `keys` is pre-selected.
	choice = st.selectbox("Choose Model:", options=keys, index=0)

	st.markdown("### Enter a sentence (or upload CSV below):")
	# The markdown heading above is the visible prompt. The widget still gets a
	# real (hidden) label because Streamlit discourages empty labels for
	# accessibility reasons and may reject them in a future release.
	text = st.text_area("Sentence", height=150, label_visibility="collapsed")

	st.markdown("---")
	# Optional bulk input; only .csv files are accepted by the uploader.
	file = st.file_uploader("📂 Upload a CSV with a 'sentence' column for bulk analysis", type=["csv"])

	# ---------------- Cached loaders ----------------
	@st.cache_resource
	def load_bert(model_name: str):
	    """Load the tokenizer and model for the given BERT checkpoint name.

	    The model is switched to evaluation mode before being returned.
	    Wrapped in ``st.cache_resource`` so repeated calls with the same
	    ``model_name`` reuse the loaded objects.

	    Returns:
	        A ``(tokenizer, model)`` tuple.
	    """
	    bert_tokenizer = BertTokenizer.from_pretrained(model_name)
	    # eval() returns the module itself, so the call can be chained.
	    bert_model = BertModel.from_pretrained(model_name).eval()
	    return bert_tokenizer, bert_model

	@st.cache_resource
	def load_elmo():
	    """Build a non-trainable TF Hub Keras layer from the ELMo URL in ``models``.

	    Wrapped in ``st.cache_resource`` so the layer is created once and reused.
	    """
	    elmo_url = models["ELMo"]
	    elmo_layer = hub.KerasLayer(elmo_url, trainable=False)
	    return elmo_layer

	# ---------------- Load selected model ----------------
	# Only the model picked in the UI is loaded. The two conditions are mutually
	# exclusive, so exactly one of (tokenizer, model) or elmo gets bound per run.
	if choice == "ELMo":
	    elmo = load_elmo()
	    st.write("✅ Loaded ELMo")
	elif "BERT" in choice:
	    # models[choice] is the checkpoint name for the selected display name.
	    tokenizer, model = load_bert(models[choice])
	    st.write(f"✅ Loaded {choice}")

	# ---------------- Analyze ----------------
	def _collect_inputs(raw_text, uploaded_file):
	    """Gather the sentences to embed and a parallel list of origin labels.

	    Args:
	        raw_text: Contents of the text box; used only if non-blank after strip.
	        uploaded_file: The uploaded CSV, or a falsy value when nothing was uploaded.

	    Returns:
	        ``(sentences, sources)``: two lists of equal length. Both are empty
	        when there is no text and no usable CSV row.

	    If a CSV is supplied without a 'sentence' column, an error is shown and
	    the current script run is halted via ``st.stop()``.
	    """
	    sentences = []
	    sources = []

	    # Single sentence input
	    single = raw_text.strip()
	    if single:
	        sentences.append(single)
	        sources.append("Single sentence")

	    # CSV input
	    if uploaded_file:
	        df = pd.read_csv(uploaded_file)
	        if "sentence" not in df.columns:
	            st.error("CSV must have a 'sentence' column")
	            st.stop()
	        # Missing values are dropped; everything else is coerced to str.
	        csv_sentences = df["sentence"].dropna().astype(str).tolist()
	        sentences.extend(csv_sentences)
	        sources.extend(["CSV"] * len(csv_sentences))

	    return sentences, sources


	def _masked_mean_pool(hidden_states, attention_mask):
	    """Average token vectors per sentence, ignoring padded positions.

	    A plain ``mean(dim=1)`` also averages over padding tokens, which makes a
	    sentence's embedding depend on the longest sentence in its batch.

	    Args:
	        hidden_states: Model output indexed as (sentence, token, feature).
	        attention_mask: Tokenizer mask indexed as (sentence, token); non-zero
	            marks a real token.

	    Returns:
	        One pooled vector per sentence.
	    """
	    mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
	    # clamp guards against division by zero should a row have no real tokens.
	    token_counts = mask.sum(dim=1).clamp(min=1)
	    return (hidden_states * mask).sum(dim=1) / token_counts


	# NOTE(review): st.button is True only for the run triggered by the click, so
	# the results below disappear on the next rerun (for example after another
	# widget is used). Persisting them in st.session_state would avoid that.
	if st.button("🔍 Generate Embeddings"):
	    sentences, sources = _collect_inputs(text, file)

	    if not sentences:
	        # Covers both "nothing entered" and "CSV had no usable rows"; the
	        # tokenizer / ELMo layer must not be called with an empty batch.
	        st.warning("Nothing to embed: enter a sentence or upload a CSV with at least one 'sentence' value.")
	    else:
	        embeddings = []

	        # --------- Generate embeddings ---------
	        if "BERT" in choice:
	            inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)
	            with torch.no_grad():
	                outputs = model(**inputs)
	            pooled = _masked_mean_pool(outputs.last_hidden_state, inputs["attention_mask"])
	            embeddings = pooled.numpy()
	            st.success(f"✅ Generated embeddings for {len(sentences)} sentence(s) with {choice}")

	        elif choice == "ELMo":
	            inputs_tf = tf.convert_to_tensor(sentences, dtype=tf.string)
	            emb_tensor = elmo(inputs_tf)
	            # The layer may return a dict of outputs; use its "default" entry.
	            if isinstance(emb_tensor, dict):
	                emb_tensor = emb_tensor["default"]
	            embeddings = emb_tensor.numpy()
	            st.success(f"✅ Generated embeddings for {len(sentences)} sentence(s) with ELMo")

	        embeddings = np.asarray(embeddings)

	        # --------- Show summary ---------
	        # dict.fromkeys de-duplicates while keeping first-seen order, so the
	        # summary line reads the same on every run (a set has no fixed order).
	        unique_sources = list(dict.fromkeys(sources))
	        st.write(f"Source: {', '.join(unique_sources)}")
	        st.write(f"Model: {choice}")
	        st.write(f"Embeddings shape: {embeddings.shape}")

	        # --------- Build the result table once (preview + download) ---------
	        df_emb = pd.DataFrame(embeddings)
	        df_emb.insert(0, "sentence", sentences)
	        df_emb.insert(1, "source", sources)

	        # --------- Preview embeddings (first 5 rows) ---------
	        st.markdown("### 🔹 Preview of Embeddings (first 5 rows)")
	        st.dataframe(df_emb.head())

	        # --------- Prepare CSV for download (UTF-8) ---------
	        # to_csv() without a path returns a str and ignores `encoding`, so the
	        # bytes are produced explicitly here.
	        csv_data = df_emb.to_csv(index=False).encode("utf-8")

	        st.download_button(
	            label="💾 Download Sentences + Embeddings",
	            data=csv_data,
	            file_name="sentences_embeddings.csv",
	            mime="text/csv",
	        )