"""Streamlit app that labels free text as biryani, pizza, or neither.

The input text is embedded with a MiniLM sentence-transformers checkpoint and
compared by cosine similarity against pre-computed recipe embeddings read from
a CSV file. The label of the most similar row is reported unless the best
similarity is below a threshold, in which case the answer is "neither".
"""

import streamlit as st
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

# Width of one embedding vector; the app validates both the model output and
# the CSV against this value.
EMBEDDING_DIM = 384
# CSV layout assumed by the slicing below: recipe_id, the embedding values,
# then label as the last column.
EXPECTED_COLUMNS = EMBEDDING_DIM + 2

# Load a pre-trained model and tokenizer
model_name = "sentence-transformers/all-MiniLM-L6-v2"


@st.cache_resource
def _load_model(name):
    """Return ``(tokenizer, model)`` for *name*.

    Streamlit re-executes the whole script on every widget interaction;
    caching keeps the weights from being reloaded on each rerun.
    """
    return AutoTokenizer.from_pretrained(name), AutoModel.from_pretrained(name)


tokenizer, model = _load_model(model_name)


def get_embedding(text):
    """Embed *text* by mean-pooling the model's last hidden state.

    Args:
        text: The string to embed. Input longer than 512 tokens is truncated.

    Returns:
        A NumPy array holding the token-averaged hidden state. For a single
        string this is one-dimensional; the caller checks its length against
        ``EMBEDDING_DIM``.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding=True,
    )
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
    # Average over the token axis, then drop the batch axis.
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()


def classify_text(embedding, embeddings_df, threshold=0.5):
    """Return the label of the reference row most similar to *embedding*.

    Args:
        embedding: 1-D vector for the text being classified.
        embeddings_df: Reference table. The first and last columns are
            skipped (treated as 'recipe_id' and 'label'); everything between
            is used as the embedding. Must contain a 'label' column.
        threshold: Minimum cosine similarity required to accept a match.

    Returns:
        The 'label' value of the best-matching row, or the string "neither"
        when the best cosine similarity is below *threshold*.
    """
    reference = embeddings_df.iloc[:, 1:-1].values  # Exclude 'recipe_id' and 'label'
    similarities = cosine_similarity([embedding], reference)[0]
    best = int(np.argmax(similarities))
    if similarities[best] < threshold:
        return "neither"
    return embeddings_df['label'].iloc[best]


# Streamlit app
st.title('Biryani, Pizza, or Neither Classifier')

# Load the embeddings and labels DataFrame
df = pd.read_csv("embeddings_receipes_final.csv")

# The slicing used for classification needs exactly EMBEDDING_DIM feature
# columns between the first and last column; any other width would make the
# cosine-similarity call fail on mismatched dimensions.
if df.shape[1] != EXPECTED_COLUMNS:
    st.error(
        f"Expected DataFrame with {EXPECTED_COLUMNS} columns "
        f"(recipe_id + {EMBEDDING_DIM} embeddings + label), "
        f"but got {df.shape[1]}. Please check your CSV file."
    )
else:
    # Input text
    input_text = st.text_area("Enter text to classify")

    if st.button("Classify"):
        # Treat whitespace-only input the same as no input.
        if input_text and input_text.strip():
            # Get the embedding for the input text
            embedding = get_embedding(input_text)

            # Ensure the embedding is of the correct dimension
            if embedding.shape[0] != EMBEDDING_DIM:
                st.error(
                    f"Expected embedding of dimension {EMBEDDING_DIM}, "
                    f"but got {embedding.shape[0]}."
                )
            else:
                # Classify the input text using existing embeddings DataFrame `df`
                predicted_label = classify_text(embedding, df)

                # Display the result
                st.write(f"The predicted label is: **{predicted_label}**")

                # Visualization using t-SNE
                embeddings = df.iloc[:, 1:-1].values  # Exclude 'recipe_id' and 'label'
                labels = df['label']

                # t-SNE cannot place a new point into an already fitted
                # layout, so the reference rows and the input are embedded
                # together in a single fit; that way every plotted point
                # shares one coordinate system.
                stacked = np.vstack([embeddings, embedding])
                tsne = TSNE(
                    n_components=2,
                    init='pca',
                    random_state=42,
                    # Perplexity must stay below the sample count; 30 is the
                    # library default and is kept whenever the data allows.
                    perplexity=min(30.0, len(stacked) - 1),
                )
                coords = tsne.fit_transform(stacked)
                reference_coords = coords[:-1]
                input_coords = coords[-1]

                # Plotting t-SNE on an explicit figure so it can be handed to
                # Streamlit and then released.
                fig, ax = plt.subplots(figsize=(10, 6))
                for label in np.unique(labels):
                    mask = (labels == label).to_numpy()
                    ax.scatter(
                        reference_coords[mask, 0],
                        reference_coords[mask, 1],
                        label=label,
                    )
                ax.scatter(
                    input_coords[0],
                    input_coords[1],
                    label='input text',
                    c='red',
                    marker='x',
                    s=100,
                )
                ax.legend()
                ax.set_title('2D t-SNE Visualization of Embeddings')
                ax.set_xlabel('t-SNE Dimension 1')
                ax.set_ylabel('t-SNE Dimension 2')
                st.pyplot(fig)
                # Close the figure so reruns do not accumulate open figures.
                plt.close(fig)
        else:
            st.write("Please enter text to classify.")