# Gladiator2 / src/streamlit_app.py
# Hugging Face Space file header (scrape residue, kept as comments):
# jeaneaprill — "Add Models and Tokenizers, Revise requirements.txt
# alongside modifying streamlit code" — commit d302747
import streamlit as st
import numpy as np
import re, emoji, string, os
# Force tf-keras (legacy Keras 2) behavior; set before `transformers`
# imports TensorFlow so it takes effect — TODO confirm this is why it
# sits between the import groups.
os.environ['TF_USE_LEGACY_KERAS'] = '1'
from transformers import TFAutoModel, AutoTokenizer
import tensorflow as tf
def clear_emoji(text):
    """Replace every emoji in *text* with a single space."""
    return emoji.replace_emoji(text, replace=' ')
def casefold(text):
    """Lowercase *text* (uses str.lower, not str.casefold, despite the name)."""
    lowered = text.lower()
    return lowered
def replace_punctuations(text):
    """Replace every ASCII punctuation character in *text* with a space.

    Improvement: the original looped over the text and called
    str.replace once per punctuation character it met — a full extra
    pass over the string per hit.  A translation table does the same
    substitution in a single C-level pass.
    """
    # Map each character of string.punctuation to ' '.
    table = str.maketrans({ch: ' ' for ch in string.punctuation})
    return text.translate(table)
def tear_Is(text):
    """Expand "<word>'s" contractions into "<word> is" for known subjects.

    Fix: the original used plain substring replacement, so a pattern
    could match inside a longer word.  The regex below anchors each
    word with \\b boundaries so only whole words are expanded.
    Assumes apostrophes were already normalized to "'" (first_clean
    does this before second_clean calls us).
    """
    words = ['that', 'this', 'there', 'he', 'she', 'it', 'what', 'who',
             'when', 'where', 'how', 'everyone']
    pattern = r"\b(" + "|".join(words) + r")'s\b"
    return re.sub(pattern, r"\1 is", text)
def first_clean(text):
    """First cleaning pass: drop emoji, lowercase, and remove noise tokens.

    Strips numbers, links, hashtags and mentions, normalizes apostrophe
    variants to the ASCII "'", and flattens newlines to spaces.

    Fixes vs. the original:
    - r"www.+" had an unescaped '.' and a greedy '+', deleting everything
      from "www" to the end of the line; now only the URL token itself
      is removed (r"www\\.\\S+").
    - removed the no-op replace("'", "'").
    """
    text = clear_emoji(text)
    text = casefold(text)
    # "12K"-style shorthand -> the word "thousand" (digits are removed next).
    text = re.sub(r'\b\d+K\b', ' thousand ', text, flags=re.IGNORECASE)
    text = re.sub(r'[0-9]+', ' ', text)         # remove numbers
    text = re.sub(r"http\S+", ' ', text)        # remove links with http
    text = re.sub(r"www\.\S+", ' ', text)       # remove links with www (fixed)
    text = re.sub(r'#[a-zA-Z0-9]+', ' ', text)  # remove hashtags
    text = re.sub(r'@[a-zA-Z0-9]+', ' ', text)  # remove mentions
    # Normalize curly / acute apostrophes to the plain ASCII one.
    text = text.replace("’", "'")
    text = text.replace("´", "'")
    # NOTE(review): "'d" can mean "would" as well as "had" — unchanged here.
    text = text.replace("'d", " had")
    text = text.replace("-", " ")
    text = text.replace('\n', ' ')              # newlines -> spaces
    return text
# Ordered literal substitutions applied by second_clean().  ORDER MATTERS:
# e.g. "can't" must expand before the generic "n't" -> " not" rule.
# Space-wrapped patterns only match space-separated words; bare patterns
# (e.g. "lol", "ffs") also match inside longer words — unchanged from the
# original behavior.
_SLANG_RULES = (
    (" the f ", ' the fuck '),
    (" *s* ", ' ass '),
    ("f*ed", "fucked"),
    ("f**ed", "fucked"),
    ("f*ck", 'fuck'),
    ("f*ck* ", 'fuck'),
    ("f*n", 'fucking'),
    ("f*ckn*", 'fucking'),
    ("f**in*", 'fucking'),
    ("sh*t", "shit"),
    ("sh*te", "shit"),
    ("s**t", "shit"),
    ("lol", "laugh out loud"),
    ("wuz", "was"),
    (" wanna ", " want to "),
    (" won't ", " will not "),
    (" wont ", " will not "),
    # Fix: trailing space restored so the next word is not glued on.
    (" isn't ", ' is not '),
    (" ii ", ' two '),
    ("yall", 'you all'),
    ("y'all", 'you all'),
    ("let's", 'let us'),
    ("thats", "that is"),
    ("lets", 'let us'),
    ("ain't", 'not'),
    ("aint", 'not'),
    ("can't", 'can not'),
    ("n't", ' not'),
    ("i'm", 'i am'),
    # Fix: surrounding spaces restored (original produced "ido notknow").
    (" dont ", ' do not '),
    ("didnt", 'did not'),
    ("doesnt", 'does not'),
    (" isnt ", ' is not '),
    (" cant ", ' can not '),
    # Fix: surrounding spaces restored (original produced "soi amhere").
    (" im ", ' i am '),
    ("'re", ' are'),
    ("'ll", ' will'),
    ("'ve", ' have'),
    (" da ", " the "),
    (" imo ", ' in my opinion '),
    (" og ", ' original '),
    (" ya ", ' you '),
    (" ppl ", " people "),
    (" nota ", " not a "),
    (" cuz ", " cause "),
    (" wth ", " what the heck "),
    ("f*k", "fuck"),
    ("f k", "fuck"),
    ("d*k", "dick"),
    (" i m ", " i am "),
    (" gg ", " glory glory "),
    (" btw ", " by the way "),
    (" ill ", " i will "),
    (" af ", " as fuck "),
    (" idk ", " i do not know "),
    ("ffs", "for fuck sake"),
    (" tho ", " though "),
    (" tf ", " the fuck "),
    (" bs ", " bullshit "),
    (" smh ", " shaking my head "),
    (" dei ", " diversity, equity, and inclusion "),
    ("f*cked", 'fucked'),
    ("f*ked", 'fucked'),
    ("tha f", 'the fuck'),
)

def second_clean(text):
    """Second cleaning pass: expand contractions, slang and censored swears.

    Fixes vs. the original: several replacements dropped spaces around
    the expansion (" isn't " -> ' is not', " dont " -> 'do not',
    " im " -> 'i am'), gluing the replacement onto neighbouring words;
    also removed the dead " i'm" rule, unreachable after the "i'm" rule
    has already replaced every occurrence.
    """
    text = tear_Is(text)
    text = text.strip()
    for old, new in _SLANG_RULES:
        text = text.replace(old, new)
    return text
def third_clean(text):
    """Third cleaning pass: strip punctuation, patch space-split slang,
    drop any remaining non-alphanumeric characters, and collapse
    whitespace to single spaces."""
    fixups = (
        (" rip ", ' rest in peace '),
        (" im ", " i am "),
        (" don t ", " do not "),
        (" iwill ", " i will "),
        (" st ", " first "),
        (" u ", " you "),
        (" the f ", ' the fuck '),
        (" f ck ", ' fuck '),
        (" f ck ", ' fuck '),  # duplicate kept from the original (no-op)
        (" f k ", ' fuck '),
        (" f king", ' fucking '),
        (" f it ", ' fuck it '),
    )
    cleaned = replace_punctuations(text)  # all punctuation -> spaces
    for old, new in fixups:
        cleaned = cleaned.replace(old, new)
    cleaned = cleaned.strip()
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', ' ', cleaned)  # remove special characters
    return ' '.join(cleaned.split())
def complete_clean(text):
    """Run the full three-stage text-cleaning pipeline on one sentence."""
    for stage in (first_clean, second_clean, third_clean):
        text = stage(text)
    return text.strip()
def tokenize(texts, tokenizer, max_length=512):
    """Encode *texts* with a HuggingFace tokenizer.

    Pads the batch, truncates each sequence to at most *max_length*
    tokens, and asks for TensorFlow tensors back.  Returns whatever the
    tokenizer returns (input_ids, attention_mask, ...).
    """
    return tokenizer(
        list(texts),
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="tf",
    )
def predict(sentence, model, tokenizer, distilbert=False):
    """Classify the sentiment of one raw *sentence*.

    Cleans the text, tokenizes it, and feeds it to the saved TF model.
    DistilBERT models take no token_type_ids input, hence the flag.
    Returns (label, confidence) where confidence is a percentage.
    """
    labels = ['Negative', 'Neutral', 'Positive']
    cleaned = complete_clean(sentence)
    encoded = tokenize(texts=np.array([cleaned]), tokenizer=tokenizer)
    # Build the model inputs in the order the saved model expects.
    inputs = [encoded['input_ids']]
    if not distilbert:
        inputs.append(encoded['token_type_ids'])
    inputs.append(encoded['attention_mask'])
    scores = np.array(model(inputs)[0])
    label = labels[int(np.argmax(scores))]
    confidence = np.max(scores) * 100
    return label, confidence
# --- Streamlit UI -----------------------------------------------------------
st.title("Sentiment Analysis with HuggingFace Spaces")

# Model label -> [tokenizer directory, saved-model directory] (local files).
directory_dict = {
    "ALBERT-Base": ["src/models/albert_base/albert_tokenizer", "src/models/albert_base/albert_sentiment_model"],
    "DistilBERT-Base": ["src/models/distilbert/distilbert_tokenizer", "src/models/distilbert/distilbert_sentiment_model"],
    "BERT-Base": ["src/models/bert_base/bert_base_tokenizer", "src/models/bert_base/bert_base_sentiment_model"],
}

option = st.selectbox(
    'Choose a Model:',
    ['ALBERT-Base', 'DistilBERT-Base', 'BERT-Base']
)
chosen_model = directory_dict[option]
# DistilBERT takes no token_type_ids; predict() branches on this flag.
distilbert = (option == 'DistilBERT-Base')

loaded_tokenizer = AutoTokenizer.from_pretrained(chosen_model[0], local_files_only=True)
loaded_model = tf.saved_model.load(chosen_model[1])

st.write("Enter a sentence to analyze its sentiment:")
user_input = st.text_input("")
if user_input:
    result, confidence = predict(user_input, model=loaded_model, tokenizer=loaded_tokenizer, distilbert=distilbert)
    st.write(f"Model Chosen : {option}")
    st.write(f"Sentiment : {result}")
    st.write(f"Confidence : {confidence:.2f}%")