import streamlit as st
# UTILITY
import pickle
from joblib import load
# NLP
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk import SnowballStemmer
from gensim.models.doc2vec import Doc2Vec
import numpy as np
| comment = "" | |
| tresh1 = 0.500 | |
| tresh2 = 0.937 | |
| tresh3 = 0.999 | |
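# Three fixed classification thresholds compared side by side in display():
# the higher the threshold, the more confident the model must be before a
# comment is labelled toxic.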
# set page setting
st.set_page_config(page_title='Toxic Comments')

# set history var
if 'history' not in st.session_state:
    st.session_state.history = []
# model loading helpers (cached so the heavy artifacts are read only once per session)
# NOTE: st.cache_resource requires Streamlit >= 1.18; on older versions
# @st.cache(allow_output_mutation=True) plays the same role.
@st.cache_resource
def importModel(filename):
    model = load(filename)
    return model

@st.cache_resource
def importD2V(filename):
    model = Doc2Vec.load(filename)
    return model

@st.cache_resource
def loadPickle(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)

def loadPunkt():
    nltk.download('punkt')
with st.spinner('Loading the models, this could take some time...'):
    loadPunkt()
    normalizer = importModel("normalizerD2V.joblib")
    classifier = importModel("toxicCommModel.joblib")
    model_d2v = importD2V("d2v_comments.model")
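# The .joblib / .model / .pkl artifacts above are resolved relative to the
# working directory, so they are expected to sit next to this script in the
# Space repository.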
# REGEX
def apply_regex(corpus):
    corpus = re.sub(r"\S*\d\S*", " ", corpus)       # tokens containing digits
    corpus = re.sub(r"\S*@\S*\s?", " ", corpus)     # e-mail addresses / mentions
    corpus = re.sub(r"\S*#\S*\s?", " ", corpus)     # hashtags
    corpus = re.sub(r'http\S+', ' ', corpus)        # URLs
    corpus = re.sub(r'[^a-zA-Z0-9 ]', ' ', corpus)  # any remaining non-alphanumeric char
    corpus = corpus.replace(u'\ufffd', '8')         # mis-encoded replacement character
    corpus = re.sub(' +', ' ', corpus)              # collapse repeated spaces
    return corpus
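# Illustrative example (hypothetical input):
#   apply_regex("Visit http://example.com or email me@mail.com #spam")
#   -> "Visit or email "   (URL, mention and hashtag stripped, spaces collapsed)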
# TOKENIZE TEXT - we use the spaCy library stopwords, shipped as a pickled list
stop_words = loadPickle("stop_words.pkl")

# TOKENIZE TEXT and STOP WORDS REMOVAL - also drops digits and words
# shorter than 2 or longer than 14 chars; the stop-word check is done on the
# lowercased token so that capitalized stop words are filtered too
def tokenize(doc):
    tokens = word_tokenize(str(doc))
    return [word.lower() for word in tokens
            if 1 < len(word) < 15
            and word.lower() not in stop_words
            and not word.isdigit()]
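# Illustrative example (assuming the usual English stop words are in the pickle):
#   tokenize("This is a very toxic comment") -> ['toxic', 'comment']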
# STEMMING
stemmer = SnowballStemmer(language="english")

def applyStemming(listOfTokens):
    return [stemmer.stem(token) for token in listOfTokens]
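# e.g. applyStemming(['running', 'comments']) -> ['run', 'comment']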
# PROBS TO CLASS
def probs_to_prediction(probs, threshold):
    # probs has shape (1, 2): column 0 = non-toxic, column 1 = toxic;
    # the comment is flagged toxic when its toxicity probability reaches the threshold
    return int(probs[0, 1] >= threshold)
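# e.g. probs_to_prediction(np.array([[0.1, 0.9]]), 0.500) -> 1 (toxic)
#      probs_to_prediction(np.array([[0.1, 0.9]]), 0.937) -> 0 (non toxic)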
# PROCESSING
def compute(comment, tresh):
    # full NLP pipeline: regex cleaning -> tokenization -> stemming
    comment = apply_regex(comment)
    comment = tokenize(comment)
    stems = applyStemming(comment)
    # encode the stems with Doc2Vec, normalize, then score with the classifier
    vectorizedComment = model_d2v.infer_vector(stems, epochs=70)
    normComment = normalizer.transform([vectorizedComment])
    probs = classifier.predict_proba(normComment)
    # one binary prediction per threshold
    preds = [probs_to_prediction(probs, t) for t in tresh]
    with st.container():
        col1, col2, col3 = st.columns(3)
        col1.metric("Toxic", round(probs[0][1], 4))
        col2.metric("", "")
        col3.metric("Non Toxic", round(probs[0][0], 4))
        st.markdown("""---""")
    display(preds, stems)
    return None
def display(preds, stems):
    with st.container():
        st.write("#### Different classification outputs at different threshold values:")
        # the metric's delta slot is used to show the threshold behind each verdict
        col3, col4, col5 = st.columns(3)
        col3.metric("", "TOXIC" if preds[0] == 1 else "NON TOXIC", delta=tresh1)
        col4.metric("", "TOXIC" if preds[1] == 1 else "NON TOXIC", delta=tresh2)
        col5.metric("", "TOXIC" if preds[2] == 1 else "NON TOXIC", delta=tresh3)
        st.markdown("""---""")
    with st.container():
        st.write("#### Result of the NLP Pipeline:")
        st.write(stems)
    return None
# TITLE
st.write("# ☢️ Toxic Comments Classification ☢️")
st.write("#### Drop a comment and wait for toxicity.")

# INPUT TEXTBOX
comment = st.text_area('', "Drop your comment here! 😎")

# INPUT THRESHOLD (three fixed sample values; a user-set slider is kept as an alternative)
#tresh = st.slider('Set the Threshold, default 0.5', 0.000, 1.000, step=0.0001, value=0.500)
compute(comment, [tresh1, tresh2, tresh3])
# sidebar
st.sidebar.write("""
This is a Toxic Comment Classifier built from tokenization, stemming, Doc2Vec encoding and a tuned logistic regression model.
It has been trained on a large corpus of comments.
A threshold converts the predicted probability of toxicity into a categorical class [toxic, non toxic];
its value can be chosen according to the final application of the classifier.
Three sample thresholds are shown here to illustrate the differences in the final output.
""")