# app.py from a Hugging Face Space by shubham142000.
# Commit 9721b50 (verified): "Update app.py"
import streamlit as st
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
# Load a pre-trained model and tokenizer.
model_name = "sentence-transformers/all-MiniLM-L6-v2"


@st.cache_resource
def _load_tokenizer_and_model(name):
    """Return the ``(tokenizer, model)`` pair for the checkpoint *name*.

    Streamlit re-executes this script from the top on every widget
    interaction. Wrapping the load in ``st.cache_resource`` keeps a single
    tokenizer/model pair alive across reruns instead of reloading it each
    time the user clicks a button.
    """
    return AutoTokenizer.from_pretrained(name), AutoModel.from_pretrained(name)


# Module-level names kept so the rest of the script can use them unchanged.
tokenizer, model = _load_tokenizer_and_model(model_name)
# Function to get embedding
def get_embedding(text):
    """Embed *text* using the module-level ``tokenizer`` and ``model``.

    The last hidden layer's token vectors are averaged with the attention
    mask as weights, so padding positions never contribute to the result.
    For a single string (no padding) this equals the plain mean over
    tokens; for several strings of different lengths it keeps each row's
    average restricted to its own real tokens.

    Args:
        text: The text to embed. Inputs are truncated to 512 tokens.

    Returns:
        The pooled output after ``squeeze()`` as a NumPy array. For one
        input string this is a 1-D vector (the caller in this file checks
        that its length is 384).
    """
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding=True)
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
    token_vectors = outputs.last_hidden_state
    # Broadcast the mask over the hidden dimension: 1 for real tokens, 0 for padding.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_vectors.dtype)
    # clamp guards against a division by zero should a row have no tokens.
    token_counts = mask.sum(dim=1).clamp(min=1)
    pooled = (token_vectors * mask).sum(dim=1) / token_counts
    return pooled.squeeze().numpy()
# Function to classify text
def classify_text(embedding, embeddings_df, threshold=0.5):
    """Label *embedding* by its most similar row in *embeddings_df*.

    Args:
        embedding: 1-D vector for the text being classified.
        embeddings_df: Reference table. Every column except the first and
            the last is treated as an embedding component (the file's
            comments describe the first column as ``recipe_id`` and the
            last as ``label``); the ``label`` column supplies the answer.
        threshold: Minimum cosine similarity the best match must reach.

    Returns:
        The ``label`` of the most similar reference row, or ``"neither"``
        when the best similarity is below *threshold* or the reference
        table has no rows.
    """
    reference_vectors = embeddings_df.iloc[:, 1:-1].values  # Exclude 'recipe_id' and 'label'
    # Nothing to compare against: treat it like "no match above threshold"
    # instead of failing inside the similarity computation.
    if reference_vectors.shape[0] == 0:
        return "neither"
    similarities = cosine_similarity([embedding], reference_vectors)[0]
    # Locate the best match once and reuse its position for both checks.
    best_row = int(np.argmax(similarities))
    if similarities[best_row] < threshold:
        return "neither"
    return embeddings_df['label'].iloc[best_row]
# --- Streamlit page ---------------------------------------------------
st.title('Biryani, Pizza, or Neither Classifier')

# Reference table of precomputed embeddings with their labels, read from
# a CSV file that sits next to the app.
embeddings_csv_path = "embeddings_receipes_final.csv"
df = pd.read_csv(embeddings_csv_path)
# Check that the DataFrame has the layout the slicing below relies on:
# one leading id column, EMBEDDING_DIM embedding columns, one trailing
# label column. `iloc[:, 1:-1]` must yield exactly EMBEDDING_DIM columns,
# so both too few and too many columns are rejected.
EMBEDDING_DIM = 384
EXPECTED_COLUMNS = EMBEDDING_DIM + 2  # recipe_id + embeddings + label
if df.shape[1] != EXPECTED_COLUMNS:
    st.error(
        f"Expected DataFrame with {EXPECTED_COLUMNS} columns, but got "
        f"{df.shape[1]}. Please check your CSV file."
    )
else:
    # Input text
    input_text = st.text_area("Enter text to classify")
    if st.button("Classify"):
        if input_text:
            # Get the embedding for the input text
            embedding = get_embedding(input_text)
            # Ensure the embedding is of the correct dimension
            if embedding.shape[0] != EMBEDDING_DIM:
                st.error(f"Expected embedding of dimension {EMBEDDING_DIM}, but got {embedding.shape[0]}.")
            else:
                # Classify the input text using existing embeddings DataFrame `df`
                predicted_label = classify_text(embedding, df)
                # Display the result
                st.write(f"The predicted label is: **{predicted_label}**")

                # Visualization using t-SNE
                embeddings = df.iloc[:, 1:-1].values  # Exclude 'recipe_id' and 'label'
                labels = df['label'].values

                # t-SNE cannot project new points into an existing layout,
                # and two separate fits produce unrelated coordinate
                # systems. Fit once on reference rows + the input (last
                # row) so every plotted point shares the same 2-D space.
                stacked = np.vstack([embeddings, embedding])
                # Perplexity must stay below the number of samples; this
                # only differs from the default (30) for tiny tables.
                perplexity = min(30.0, float(stacked.shape[0] - 1))
                tsne = TSNE(n_components=2, init='pca', random_state=42, perplexity=perplexity)
                coords = tsne.fit_transform(stacked)
                reference_coords = coords[:-1]
                input_coords = coords[-1]

                # Plotting t-SNE on an explicit figure so it can be handed
                # to Streamlit and released afterwards.
                fig, ax = plt.subplots(figsize=(10, 6))
                for label in np.unique(labels):
                    rows = labels == label
                    ax.scatter(reference_coords[rows, 0], reference_coords[rows, 1], label=label)
                ax.scatter(input_coords[0], input_coords[1], label='input text', c='red', marker='x', s=100)
                ax.legend()
                ax.set_title('2D t-SNE Visualization of Embeddings')
                ax.set_xlabel('t-SNE Dimension 1')
                ax.set_ylabel('t-SNE Dimension 2')
                st.pyplot(fig)
                # Each rerun creates a new figure; close it so they do not
                # accumulate in the long-lived server process.
                plt.close(fig)
        else:
            st.write("Please enter text to classify.")