# NOTE(review): the three lines below were web-page residue from the Hugging Face
# Space listing (author "hacp-pt", commit c60bfc4 "Documenting") and were not
# valid Python; preserved here as a comment so the script parses.
import streamlit as st
import pandas as pd
import numpy as np
from transformers import pipeline
pipe = pipeline(model="facebook/bart-large-mnli")
# Page header and explanatory copy for the demo.
# (Fixed in review: unclosed parenthesis, grammar, and missing articles in the
# displayed text.)
st.markdown('# Demo: Review Cause prediction with Zero-shot Classification')
st.markdown('In this demo we show a proof-of-concept of a zero-shot classifier for customer reviews based on possible causes/drivers (price, customer service)')
st.markdown('In this first pass we use a zero-shot classifier: a general model that has been trained with a broad set of instances and is used to predict without having explicitly seen our target labels')
st.markdown('We use a set of manually classified reviews as our test set and measure performance with log-loss: it decreases as performance improves')
st.markdown('The target labels are: positive price, negative price, positive customer service, negative customer service')
st.markdown('## Classifying one review: Notice how the performance improves as we add more details in the description')
# The review to classify, pre-filled with an example covering both drivers.
review = st.text_input('Review to classify', 'Price is great and staff is very helpful')

# Label titles are fixed; the free-text descriptions are supplied by the user
# and later appended to the titles to enrich the zero-shot hypotheses.
label0_pos_title, label0_neg_title = 'Positive Price', 'Negative Price'
label1_pos_title, label1_neg_title = 'Positive Customer Service', 'Negative Customer Service'

# Widget creation order is unchanged: price descriptions first, then customer service.
label0_pos_description = st.text_input('Description: Positive Price')
label0_neg_description = st.text_input('Description: Negative Price')
label1_pos_description = st.text_input('Description: Positive Customer Service')
label1_neg_description = st.text_input('Description: Negative Customer Service')
# Pair every title with its user-provided description, in the same order the
# candidate labels were originally passed to the pipeline.
titles_and_descriptions = [
    (label0_neg_title, label0_neg_description),
    (label0_pos_title, label0_pos_description),
    (label1_neg_title, label1_neg_description),
    (label1_pos_title, label1_pos_description),
]

# Run 1: classify against the bare titles only.
results_zs = pipe(review, candidate_labels=[title for title, _ in titles_and_descriptions])

# Run 2: classify against "<title> <> <description>" to give the model more context.
results_zs_description = pipe(
    review,
    candidate_labels=[title + ' <> ' + description for title, description in titles_and_descriptions],
)
# Collect each run's ranked labels and scores into a DataFrame.
# Build from a dict of columns rather than np.transpose([labels, scores]):
# transposing a mixed str/float list makes NumPy cast everything to strings,
# silently turning the score columns into text downstream (e.g. highlight_max
# would then compare scores lexicographically instead of numerically).
df_results_zs = pd.DataFrame({'labels': results_zs['labels'], 'scores ZS': results_zs['scores']})
df_results_zs_description = pd.DataFrame({'labelsD': results_zs_description['labels'], 'scores ZS+D': results_zs_description['scores']})
def get_labels(labelD):
    """Return the bare label title, stripping any ' <> ' description suffix."""
    title, _sep, _description = labelD.partition(' <> ')
    return title
# Recover the bare title from the described labels so both result sets can be
# joined on a common 'labels' column.
df_results_zs_description['labels'] = df_results_zs_description['labelsD'].apply(get_labels)
df_results = pd.merge(df_results_zs, df_results_zs_description, on='labels')

# Side-by-side comparison of both runs, highlighting the top score per column.
# (Fixed in review: "predicions" typo in the displayed text; removed
# commented-out debug st.dataframe calls.)
st.markdown('The results of the classification are listed below: notice how when we improve the definition of our target labels, the predictions improve as well')
st.dataframe(df_results[['labels', 'scores ZS', 'scores ZS+D']].style.highlight_max(subset=['scores ZS', 'scores ZS+D'], axis=0))
st.markdown('## Testing against the validation set: Notice how the log loss decreases as we add more details in the target definitions')

# Manually labelled reviews used as the validation set.
# NOTE(review): "Validatation" is the actual filename on disk — do not "fix" it
# here without renaming the file.
data_val = pd.read_csv('data/NPS NLP Zero Shot Validatation - HuggingFaceValidation.csv')

# The described candidate labels are identical for every review, so build the
# list once outside the loop instead of on each iteration.
described_labels = [label0_neg_title + ' <> ' + label0_neg_description,
                    label0_pos_title + ' <> ' + label0_pos_description,
                    label1_neg_title + ' <> ' + label1_neg_description,
                    label1_pos_title + ' <> ' + label1_pos_description]

l_scores = []
for review_val in data_val['review']:
    results_val = pipe(review_val, candidate_labels=described_labels)
    # Build from a dict to keep scores as floats; np.transpose over a mixed
    # [str, float] list would cast the scores to strings.
    df_review = pd.DataFrame({'labels': results_val['labels'], 'scores': results_val['scores']})
    # Sort alphabetically by label so every row of l_scores shares one column order.
    l_scores.append(df_review.sort_values('labels')['scores'].to_numpy())

# Column headers in the same alphabetical order used to sort each score row.
l_labels = df_results.sort_values('labels')['labels']
df_scores = pd.DataFrame(l_scores, columns=l_labels)
data_val_scores = pd.concat([data_val, df_scores], axis=1)
st.dataframe(data_val_scores)
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import log_loss
# Ground truth: each validation row may carry up to two true labels
# (columns 'label0' and 'label1'); dropna() lets single-label rows yield a
# one-item list.
y_true = [np.array(row.dropna()).tolist() for index, row in data_val[['label0','label1']].iterrows()]
# Predicted scores: one row per review, columns sorted alphabetically by label
# (each per-review frame was sorted by 'labels' in the loop above).
y_pred = l_scores
# Binarize the true label sets into an indicator matrix whose columns follow
# the same alphabetical order as l_labels, matching the y_pred column order.
mlb = MultiLabelBinarizer()
mlb.fit([l_labels])
y_true_mlb = mlb.transform(y_true)
# NOTE(review): y_true_mlb is a multilabel indicator (a row can contain two 1s),
# while sklearn's log_loss is defined for single-label multiclass targets —
# confirm this computes the intended metric for two-label rows.
# NOTE(review): the original l_scores hold string values (np.transpose cast);
# log_loss relies on sklearn's implicit numeric coercion here — verify.
log_loss_score = log_loss(y_true_mlb,y_pred)
st.metric('log_loss',log_loss_score)
#st.dataframe(data_val)
# Raw pipeline outputs for the single-review demo, shown for inspection.
st.write(results_zs)
st.write(results_zs_description)