Update app.py
Browse files
app.py
CHANGED
|
@@ -6,8 +6,9 @@ import torch
|
|
| 6 |
from sklearn.manifold import TSNE
|
| 7 |
import matplotlib.pyplot as plt
|
| 8 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 9 |
-
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
|
| 10 |
from scipy.spatial.distance import cosine
|
|
|
|
| 11 |
|
| 12 |
# Load a pre-trained model and tokenizer
|
| 13 |
model_name = "sentence-transformers/all-MiniLM-L6-v2"
|
|
@@ -21,8 +22,8 @@ def get_embedding(text):
|
|
| 21 |
outputs = model(**inputs)
|
| 22 |
return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
|
| 23 |
|
| 24 |
-
# Function to classify text
|
| 25 |
-
def
|
| 26 |
distances = {label: cosine(embedding, mean_embedding) for label, mean_embedding in mean_embeddings.items()}
|
| 27 |
min_distance = min(distances.values())
|
| 28 |
if min_distance > threshold:
|
|
@@ -30,6 +31,11 @@ def classify_text(embedding, mean_embeddings, threshold=0.5):
|
|
| 30 |
predicted_label = min(distances, key=distances.get)
|
| 31 |
return predicted_label
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
# Streamlit app
|
| 34 |
st.title('Biryani, Pizza, Milk, Pasta, Potatos, Tomato, or Neither Classifier')
|
| 35 |
|
|
@@ -45,10 +51,16 @@ embeddings = df.iloc[:, 1:-2]
|
|
| 45 |
labels = df['label']
|
| 46 |
mean_embeddings = {label: embeddings[labels == label].mean(axis=0) for label in label_mapping.keys() if label != 'neither'}
|
| 47 |
|
|
|
|
|
|
|
|
|
|
| 48 |
# Check if the DataFrame is loaded correctly
|
| 49 |
if df.shape[1] < 386: # 384 embeddings + 1 label + 1 recipe_id + 1 label_int
|
| 50 |
st.error(f"Expected DataFrame with 386 columns, but got less than that. Please check your CSV file.")
|
| 51 |
else:
|
|
|
|
|
|
|
|
|
|
| 52 |
# Input text
|
| 53 |
input_text = st.text_area("Enter text to classify")
|
| 54 |
|
|
@@ -61,8 +73,11 @@ else:
|
|
| 61 |
if embedding.shape[0] != 384:
|
| 62 |
st.error(f"Expected embedding of dimension 384, but got {embedding.shape[0]}.")
|
| 63 |
else:
|
| 64 |
-
# Classify the input text using
|
| 65 |
-
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
# Display the result
|
| 68 |
st.write(f"The predicted label is: **{predicted_label}**")
|
|
@@ -93,15 +108,18 @@ else:
|
|
| 93 |
st.pyplot(plt)
|
| 94 |
|
| 95 |
# Generate the confusion matrix
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
conf_matrix = confusion_matrix(df['label_int'], predictions)
|
| 107 |
|
|
|
|
| 6 |
from sklearn.manifold import TSNE
|
| 7 |
import matplotlib.pyplot as plt
|
| 8 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 9 |
+
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
|
| 10 |
from scipy.spatial.distance import cosine
|
| 11 |
+
import joblib
|
| 12 |
|
| 13 |
# Load a pre-trained model and tokenizer
|
| 14 |
model_name = "sentence-transformers/all-MiniLM-L6-v2"
|
|
|
|
| 22 |
outputs = model(**inputs)
|
| 23 |
return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
|
| 24 |
|
| 25 |
+
# Function to classify text using cosine similarity
|
| 26 |
+
def classify_text_cosine(embedding, mean_embeddings, threshold=0.5):
|
| 27 |
distances = {label: cosine(embedding, mean_embedding) for label, mean_embedding in mean_embeddings.items()}
|
| 28 |
min_distance = min(distances.values())
|
| 29 |
if min_distance > threshold:
|
|
|
|
| 31 |
predicted_label = min(distances, key=distances.get)
|
| 32 |
return predicted_label
|
| 33 |
|
| 34 |
+
# Function to classify text using MLP model
|
| 35 |
+
def classify_text_mlp(embedding, mlp_model):
    """Classify a single embedding with the MLP model and return its label name.

    Args:
        embedding: Feature vector for one text; it is wrapped in a list so the
            model receives a batch of one sample.
        mlp_model: Fitted estimator exposing ``predict``.

    Returns:
        The key of the module-level ``label_mapping`` whose value equals the
        model's prediction.

    Raises:
        KeyError: If the prediction is not a value of ``label_mapping``.
    """
    prediction = mlp_model.predict([embedding])[0]
    # Look the label up by its id instead of by position in the dict: indexing
    # list(label_mapping.keys()) is only right when insertion order happens to
    # match the ids 0..n-1.
    # NOTE(review): assumes the model predicts label_mapping's values (the
    # same ids stored in df['label_int']) -- confirm against training code.
    id_to_label = {label_id: label for label, label_id in label_mapping.items()}
    return id_to_label[prediction]
|
| 38 |
+
|
| 39 |
# Streamlit app
|
| 40 |
st.title('Biryani, Pizza, Milk, Pasta, Potatos, Tomato, or Neither Classifier')
|
| 41 |
|
|
|
|
| 51 |
labels = df['label']
|
| 52 |
mean_embeddings = {label: embeddings[labels == label].mean(axis=0) for label in label_mapping.keys() if label != 'neither'}
|
| 53 |
|
| 54 |
+
# Load the MLP model
|
| 55 |
+
mlp_model = joblib.load("mlp_model2.joblib")
|
| 56 |
+
|
| 57 |
# Check if the DataFrame is loaded correctly
|
| 58 |
# Expected layout: 1 recipe_id + 384 embedding columns + 1 label + 1 label_int
# = 387 columns. `df.iloc[:, 1:-2]` drops the first and the last two columns,
# so anything narrower leaves fewer than 384 embedding columns.
if df.shape[1] < 387:
    st.error(f"Expected DataFrame with 387 columns, but got {df.shape[1]}. Please check your CSV file.")
|
| 60 |
else:
|
| 61 |
+
# Select classification method
|
| 62 |
+
classification_method = st.selectbox("Select classification method", ["Cosine Similarity", "MLP Model"])
|
| 63 |
+
|
| 64 |
# Input text
|
| 65 |
input_text = st.text_area("Enter text to classify")
|
| 66 |
|
|
|
|
| 73 |
if embedding.shape[0] != 384:
|
| 74 |
st.error(f"Expected embedding of dimension 384, but got {embedding.shape[0]}.")
|
| 75 |
else:
|
| 76 |
+
# Classify the input text using the selected method
|
| 77 |
+
# Route the embedding to the classifier chosen via `classification_method`;
# any value other than "Cosine Similarity" falls through to the MLP model.
predicted_label = (
    classify_text_cosine(embedding, mean_embeddings)
    if classification_method == "Cosine Similarity"
    else classify_text_mlp(embedding, mlp_model)
)
|
| 81 |
|
| 82 |
# Display the result
|
| 83 |
st.write(f"The predicted label is: **{predicted_label}**")
|
|
|
|
| 108 |
st.pyplot(plt)
|
| 109 |
|
| 110 |
# Generate the confusion matrix
|
| 111 |
+
# Build one integer prediction per dataset row with the selected method so the
# result can be compared against df['label_int'].
if classification_method == "Cosine Similarity":
    # Rows whose nearest class mean is farther than this cosine distance are
    # assigned to "neither".
    neither_threshold = 0.5
    predictions = []
    for _, embedding_row in embeddings.iterrows():
        row_distances = {
            label: cosine(embedding_row, mean_embedding)
            for label, mean_embedding in mean_embeddings.items()
        }
        # Loop-private name: the original reused `predicted_label` here, which
        # overwrote the prediction computed for the user's input text.
        nearest_label = min(row_distances, key=row_distances.get)
        if row_distances[nearest_label] > neither_threshold:
            nearest_label = 'neither'
        predictions.append(label_mapping[nearest_label])
else:
    # The MLP scores the whole embedding table in a single call.
    predictions = mlp_model.predict(embeddings)
|
| 123 |
|
| 124 |
conf_matrix = confusion_matrix(df['label_int'], predictions)
|
| 125 |
|