# Gladiator2 / src/streamlit_app.py
# Hugging Face Space file header (scrape residue, kept as comments):
# jeaneaprill — "Add Models and Tokenizers, Revise requirements.txt
# alongside modifying streamlit code" — commit d302747
import streamlit as st
import numpy as np
import re, emoji, string, os
# Force tf-keras (legacy Keras 2) behavior; set before `transformers`
# imports TensorFlow so it takes effect — TODO confirm this is why it
# sits between the import groups.
os.environ['TF_USE_LEGACY_KERAS'] = '1'
from transformers import TFAutoModel, AutoTokenizer
import tensorflow as tf
def clear_emoji(text):
    """Replace every emoji in *text* with a single space."""
    return emoji.replace_emoji(text, replace=' ')
def casefold(text):
    """Lowercase *text* (uses str.lower, not str.casefold, despite the name)."""
    lowered = text.lower()
    return lowered
def replace_punctuations(text):
    """Replace every ASCII punctuation character in *text* with a space.

    Improvement: the original looped over the text and called
    str.replace once per punctuation character it met — a full extra
    pass over the string per hit.  A translation table does the same
    substitution in a single C-level pass.
    """
    # Map each character of string.punctuation to ' '.
    table = str.maketrans({ch: ' ' for ch in string.punctuation})
    return text.translate(table)
def tear_Is(text):
    """Expand "<word>'s" contractions into "<word> is" for known subjects.

    Fix: the original used plain substring replacement, so a pattern
    could match inside a longer word.  The regex below anchors each
    word with \\b boundaries so only whole words are expanded.
    Assumes apostrophes were already normalized to "'" (first_clean
    does this before second_clean calls us).
    """
    words = ['that', 'this', 'there', 'he', 'she', 'it', 'what', 'who',
             'when', 'where', 'how', 'everyone']
    pattern = r"\b(" + "|".join(words) + r")'s\b"
    return re.sub(pattern, r"\1 is", text)
def first_clean(text):
    """First cleaning pass: drop emoji, lowercase, and remove noise tokens.

    Strips numbers, links, hashtags and mentions, normalizes apostrophe
    variants to the ASCII "'", and flattens newlines to spaces.

    Fixes vs. the original:
    - r"www.+" had an unescaped '.' and a greedy '+', deleting everything
      from "www" to the end of the line; now only the URL token itself
      is removed (r"www\\.\\S+").
    - removed the no-op replace("'", "'").
    """
    text = clear_emoji(text)
    text = casefold(text)
    # "12K"-style shorthand -> the word "thousand" (digits are removed next).
    text = re.sub(r'\b\d+K\b', ' thousand ', text, flags=re.IGNORECASE)
    text = re.sub(r'[0-9]+', ' ', text)         # remove numbers
    text = re.sub(r"http\S+", ' ', text)        # remove links with http
    text = re.sub(r"www\.\S+", ' ', text)       # remove links with www (fixed)
    text = re.sub(r'#[a-zA-Z0-9]+', ' ', text)  # remove hashtags
    text = re.sub(r'@[a-zA-Z0-9]+', ' ', text)  # remove mentions
    # Normalize curly / acute apostrophes to the plain ASCII one.
    text = text.replace("’", "'")
    text = text.replace("´", "'")
    # NOTE(review): "'d" can mean "would" as well as "had" — unchanged here.
    text = text.replace("'d", " had")
    text = text.replace("-", " ")
    text = text.replace('\n', ' ')              # newlines -> spaces
    return text
# Ordered literal substitutions applied by second_clean().  ORDER MATTERS:
# e.g. "can't" must expand before the generic "n't" -> " not" rule.
# Space-wrapped patterns only match space-separated words; bare patterns
# (e.g. "lol", "ffs") also match inside longer words — unchanged from the
# original behavior.
_SLANG_RULES = (
    (" the f ", ' the fuck '),
    (" *s* ", ' ass '),
    ("f*ed", "fucked"),
    ("f**ed", "fucked"),
    ("f*ck", 'fuck'),
    ("f*ck* ", 'fuck'),
    ("f*n", 'fucking'),
    ("f*ckn*", 'fucking'),
    ("f**in*", 'fucking'),
    ("sh*t", "shit"),
    ("sh*te", "shit"),
    ("s**t", "shit"),
    ("lol", "laugh out loud"),
    ("wuz", "was"),
    (" wanna ", " want to "),
    (" won't ", " will not "),
    (" wont ", " will not "),
    # Fix: trailing space restored so the next word is not glued on.
    (" isn't ", ' is not '),
    (" ii ", ' two '),
    ("yall", 'you all'),
    ("y'all", 'you all'),
    ("let's", 'let us'),
    ("thats", "that is"),
    ("lets", 'let us'),
    ("ain't", 'not'),
    ("aint", 'not'),
    ("can't", 'can not'),
    ("n't", ' not'),
    ("i'm", 'i am'),
    # Fix: surrounding spaces restored (original produced "ido notknow").
    (" dont ", ' do not '),
    ("didnt", 'did not'),
    ("doesnt", 'does not'),
    (" isnt ", ' is not '),
    (" cant ", ' can not '),
    # Fix: surrounding spaces restored (original produced "soi amhere").
    (" im ", ' i am '),
    ("'re", ' are'),
    ("'ll", ' will'),
    ("'ve", ' have'),
    (" da ", " the "),
    (" imo ", ' in my opinion '),
    (" og ", ' original '),
    (" ya ", ' you '),
    (" ppl ", " people "),
    (" nota ", " not a "),
    (" cuz ", " cause "),
    (" wth ", " what the heck "),
    ("f*k", "fuck"),
    ("f k", "fuck"),
    ("d*k", "dick"),
    (" i m ", " i am "),
    (" gg ", " glory glory "),
    (" btw ", " by the way "),
    (" ill ", " i will "),
    (" af ", " as fuck "),
    (" idk ", " i do not know "),
    ("ffs", "for fuck sake"),
    (" tho ", " though "),
    (" tf ", " the fuck "),
    (" bs ", " bullshit "),
    (" smh ", " shaking my head "),
    (" dei ", " diversity, equity, and inclusion "),
    ("f*cked", 'fucked'),
    ("f*ked", 'fucked'),
    ("tha f", 'the fuck'),
)

def second_clean(text):
    """Second cleaning pass: expand contractions, slang and censored swears.

    Fixes vs. the original: several replacements dropped spaces around
    the expansion (" isn't " -> ' is not', " dont " -> 'do not',
    " im " -> 'i am'), gluing the replacement onto neighbouring words;
    also removed the dead " i'm" rule, unreachable after the "i'm" rule
    has already replaced every occurrence.
    """
    text = tear_Is(text)
    text = text.strip()
    for old, new in _SLANG_RULES:
        text = text.replace(old, new)
    return text
def third_clean(text):
    """Third cleaning pass: strip punctuation, patch space-split slang,
    drop any remaining non-alphanumeric characters, and collapse
    whitespace to single spaces."""
    fixups = (
        (" rip ", ' rest in peace '),
        (" im ", " i am "),
        (" don t ", " do not "),
        (" iwill ", " i will "),
        (" st ", " first "),
        (" u ", " you "),
        (" the f ", ' the fuck '),
        (" f ck ", ' fuck '),
        (" f ck ", ' fuck '),  # duplicate kept from the original (no-op)
        (" f k ", ' fuck '),
        (" f king", ' fucking '),
        (" f it ", ' fuck it '),
    )
    cleaned = replace_punctuations(text)  # all punctuation -> spaces
    for old, new in fixups:
        cleaned = cleaned.replace(old, new)
    cleaned = cleaned.strip()
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', ' ', cleaned)  # remove special characters
    return ' '.join(cleaned.split())
def complete_clean(text):
    """Run the full three-stage text-cleaning pipeline on one sentence."""
    for stage in (first_clean, second_clean, third_clean):
        text = stage(text)
    return text.strip()
def tokenize(texts, tokenizer, max_length=512):
    """Encode *texts* with a HuggingFace tokenizer.

    Pads the batch, truncates each sequence to at most *max_length*
    tokens, and asks for TensorFlow tensors back.  Returns whatever the
    tokenizer returns (input_ids, attention_mask, ...).
    """
    return tokenizer(
        list(texts),
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="tf",
    )
def predict(sentence, model, tokenizer, distilbert=False):
    """Classify the sentiment of one raw *sentence*.

    Cleans the text, tokenizes it, and feeds it to the saved TF model.
    DistilBERT models take no token_type_ids input, hence the flag.
    Returns (label, confidence) where confidence is a percentage.
    """
    labels = ['Negative', 'Neutral', 'Positive']
    cleaned = complete_clean(sentence)
    encoded = tokenize(texts=np.array([cleaned]), tokenizer=tokenizer)
    # Build the model inputs in the order the saved model expects.
    inputs = [encoded['input_ids']]
    if not distilbert:
        inputs.append(encoded['token_type_ids'])
    inputs.append(encoded['attention_mask'])
    scores = np.array(model(inputs)[0])
    label = labels[int(np.argmax(scores))]
    confidence = np.max(scores) * 100
    return label, confidence
# --- Streamlit UI -----------------------------------------------------------
st.title("Sentiment Analysis with HuggingFace Spaces")

# Model label -> [tokenizer directory, saved-model directory] (local files).
directory_dict = {
    "ALBERT-Base": ["src/models/albert_base/albert_tokenizer", "src/models/albert_base/albert_sentiment_model"],
    "DistilBERT-Base": ["src/models/distilbert/distilbert_tokenizer", "src/models/distilbert/distilbert_sentiment_model"],
    "BERT-Base": ["src/models/bert_base/bert_base_tokenizer", "src/models/bert_base/bert_base_sentiment_model"],
}

option = st.selectbox(
    'Choose a Model:',
    ['ALBERT-Base', 'DistilBERT-Base', 'BERT-Base']
)
chosen_model = directory_dict[option]
# DistilBERT takes no token_type_ids; predict() branches on this flag.
distilbert = (option == 'DistilBERT-Base')

loaded_tokenizer = AutoTokenizer.from_pretrained(chosen_model[0], local_files_only=True)
loaded_model = tf.saved_model.load(chosen_model[1])

st.write("Enter a sentence to analyze its sentiment:")
user_input = st.text_input("")
if user_input:
    result, confidence = predict(user_input, model=loaded_model, tokenizer=loaded_tokenizer, distilbert=distilbert)
    st.write(f"Model Chosen : {option}")
    st.write(f"Sentiment : {result}")
    st.write(f"Confidence : {confidence:.2f}%")