# Hugging Face Spaces demo — review-cause prediction with zero-shot classification.
# (The "Spaces: Sleeping" lines were page-status residue from the Spaces UI, not code.)
import streamlit as st
import pandas as pd
import numpy as np
from transformers import pipeline

# Zero-shot NLI classifier; the pipeline task is inferred from the model card.
# Loading the model is the slow step and happens once per app start.
pipe = pipeline(model="facebook/bart-large-mnli")
# --- Page header: what the demo does and how it is evaluated ---
_INTRO_LINES = (
    '# Demo: Review Cause prediction with Zero-shot Classification',
    'In this demo we show a proof-of-concept of a zero-shot classifier for customer reviews based on possible causes/drivers (price, customer service',
    'In this first pass we use a zero-shot classifier: a general model that has been trained with a broad set of instances and it is used to predict without explicitly seen our target labels',
    'We use set of manually classified reviews as our test set and measure performance with log-loss: decreases as performance improves',
    'The target labels are: positive price, negative price, positive customer service, negative customer service',
    '## Classifying one review: Notice how the performance improves as we add more details in the description',
)
for _intro_line in _INTRO_LINES:
    st.markdown(_intro_line)

# Free-text review to classify; pre-filled with an example touching both causes.
review = st.text_input('Review to classify', 'Price is great and staff is very helpful')
# Fixed label titles; label0 = price, label1 = customer service.
label0_pos_title, label0_neg_title = 'Positive Price', 'Negative Price'
label1_pos_title, label1_neg_title = 'Positive Customer Service', 'Negative Customer Service'

# User-editable free-text descriptions used to enrich each label.
# (Widget order on the page matches the original: price pair first.)
label0_pos_description = st.text_input('Description: Positive Price')
label0_neg_description = st.text_input('Description: Negative Price')
label1_pos_description = st.text_input('Description: Positive Customer Service')
label1_neg_description = st.text_input('Description: Negative Customer Service')
# Classify the same review twice: once with bare titles, once with titles
# enriched by the user-supplied descriptions ('title <> description').
_titles = [label0_neg_title, label0_pos_title, label1_neg_title, label1_pos_title]
_descriptions = [label0_neg_description, label0_pos_description,
                 label1_neg_description, label1_pos_description]

results_zs = pipe(review, candidate_labels=_titles)
results_zs_description = pipe(
    review,
    candidate_labels=[t + ' <> ' + d for t, d in zip(_titles, _descriptions)],
)
# Collect (label, score) pairs into DataFrames — one per classification run.
# Built from dicts rather than np.transpose([labels, scores]): transposing a
# mixed [str, float] list creates a string array, silently turning the scores
# into strings and breaking numeric operations (highlight_max, log_loss) later.
df_results_zs = pd.DataFrame(
    {'labels': results_zs['labels'], 'scores ZS': results_zs['scores']}
)
df_results_zs_description = pd.DataFrame(
    {'labelsD': results_zs_description['labels'],
     'scores ZS+D': results_zs_description['scores']}
)
def get_labels(labelD):
    """Return the title part of a composite 'title <> description' label.

    Enriched candidate labels are built as 'title <> description'; this
    strips the description so results can be merged on the bare title.
    If the separator is absent, the input is returned unchanged.
    """
    # str.partition returns the whole string in slot 0 when the separator
    # is missing, matching split(' <> ')[0] exactly.
    return labelD.partition(' <> ')[0]
# Recover the bare title on the description-based results so both result
# sets can be joined on a common 'labels' column.
df_results_zs_description['labels'] = [
    get_labels(composite) for composite in df_results_zs_description['labelsD']
]
df_results = df_results_zs.merge(df_results_zs_description, on='labels')
# Side-by-side scores: title-only (ZS) vs title+description (ZS+D), with the
# top score in each column highlighted.
st.markdown('The results of the classification are listed below: notice how when we improve the definition of our target labels, the predicions improve as well')
_display_cols = ['labels', 'scores ZS', 'scores ZS+D']
_styled = df_results[_display_cols].style.highlight_max(
    subset=['scores ZS', 'scores ZS+D'], axis=0
)
st.dataframe(_styled)
st.markdown('## Testing against the validation set: Notice how the log loss decreases as we add more details in the target definitions')

# Manually-labelled reviews used as the test set.
data_val = pd.read_csv('data/NPS NLP Zero Shot Validatation - HuggingFaceValidation.csv')

# Hoisted out of the loop: the enriched candidate labels are the same for
# every validation review.
_val_labels = [label0_neg_title + ' <> ' + label0_neg_description,
               label0_pos_title + ' <> ' + label0_pos_description,
               label1_neg_title + ' <> ' + label1_neg_description,
               label1_pos_title + ' <> ' + label1_pos_description]

# Score every validation review; keep each score vector sorted by label so
# rows line up column-wise across reviews.
l_scores = []
for review_val in data_val['review']:
    results_val = pipe(review_val, candidate_labels=_val_labels)
    # Built from a dict rather than np.transpose([labels, scores]): the
    # transpose of a mixed [str, float] list coerces the scores to strings,
    # and string score arrays break log_loss downstream.
    df_review = pd.DataFrame({'labels': results_val['labels'],
                              'scores': results_val['scores']})
    df_review_sorted = df_review.sort_values('labels')
    l_scores.append(df_review_sorted['scores'].to_numpy())

# Bare titles in the same sorted order as the per-review score vectors.
# (Sorting composite 'title <> desc' labels orders them the same as their
# titles here because the four titles share no prefix relationship.)
l_labels = df_results.sort_values('labels')['labels']
df_scores = pd.DataFrame(l_scores, columns=l_labels)
data_val_scores = pd.concat([data_val, df_scores], axis=1)
st.dataframe(data_val_scores)
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import log_loss

# Ground truth: each review carries up to two cause labels (label0, label1);
# dropna() discards the missing slot for single-cause reviews.
y_true = [row.dropna().tolist() for _, row in data_val[['label0', 'label1']].iterrows()]
y_pred = l_scores

# Binarize the true labels into the same (sorted) label space as the scores.
mlb = MultiLabelBinarizer()
mlb.fit([l_labels])
y_true_mlb = mlb.transform(y_true)

# Lower is better: log-loss against the zero-shot probability vectors.
log_loss_score = log_loss(y_true_mlb, y_pred)
st.metric('log_loss', log_loss_score)

# Raw pipeline outputs for the single-review demo (debugging aid).
st.write(results_zs)
st.write(results_zs_description)