import streamlit as st
import numpy as np
import re
import emoji
import string
import os

# Must be set BEFORE transformers/TensorFlow are imported so TF falls back
# to legacy Keras (required by the saved models loaded below).
os.environ['TF_USE_LEGACY_KERAS'] = '1'

from transformers import TFAutoModel, AutoTokenizer
import tensorflow as tf


def clear_emoji(text):
    """Replace every emoji in *text* with a single space."""
    return emoji.replace_emoji(text, ' ')


def casefold(text):
    """Lower-case *text*."""
    return text.lower()


# Built once at import time: maps every ASCII punctuation char to a space.
_PUNCT_TO_SPACE = str.maketrans({c: ' ' for c in string.punctuation})


def replace_punctuations(text):
    """Replace each punctuation character with a space.

    Uses str.translate (one C-level pass) instead of chained .replace calls.
    """
    return text.translate(_PUNCT_TO_SPACE)


def tear_Is(text):
    """Expand "<word>'s" into "<word> is" for common pronoun/determiner words.

    NOTE: for these words the trailing 's is deliberately read as a
    contraction of "is", never as a possessive.
    """
    words = ['that', 'this', 'there', 'he', 'she', 'it', 'what', 'who',
             'when', 'where', 'how', 'everyone']
    for word in words:
        # str.replace is a no-op when the substring is absent, so the
        # original `if ... in text` pre-check was redundant.
        text = text.replace(word + "'s", word + ' is')
    return text


def first_clean(text):
    """First cleaning pass: emojis, case, apostrophes, links, tags, numbers."""
    text = clear_emoji(text)
    text = casefold(text)
    # Normalize apostrophe look-alikes before any contraction handling.
    text = text.replace("’", "'")
    text = text.replace("´", "'")
    # Strip links/hashtags/mentions BEFORE digit removal so URLs containing
    # digits are removed whole.  (Bug fix: the original removed digits first,
    # leaving URL fragments behind, and used the greedy pattern r"www.+",
    # which deleted everything from "www" to the end of the text.)
    text = re.sub(r"http\S+", ' ', text)        # remove links with http(s)
    text = re.sub(r"www\.\S+", ' ', text)       # remove links with www.
    text = re.sub(r'#[a-zA-Z0-9]+', ' ', text)  # remove hashtags
    text = re.sub(r'@[a-zA-Z0-9]+', ' ', text)  # remove mentions
    # "12K" -> "thousand" must run before plain digit removal.
    text = re.sub(r'\b\d+K\b', ' thousand ', text, flags=re.IGNORECASE)
    text = re.sub(r'[0-9]+', ' ', text)         # remove remaining numbers
    text = text.replace("'d", " had")
    text = text.replace("-", " ")
    text = text.replace('\n', ' ')              # replace newlines with spaces
    return text


# Ordered (old, new) pairs for slang / contraction / obfuscation expansion.
# ORDER MATTERS: longer obfuscated forms (e.g. "f*ckn*", "sh*te") must come
# before their shorter substrings ("f*ck", "sh*t"), and "can't" before the
# generic "n't".  The original file had several rules placed after their
# substrings, making them unreachable; they are reordered (or dropped when
# provably dead) here.  Replacements that merged adjacent words by omitting
# surrounding spaces (" im " -> "i am", " dont " -> "do not", etc.) are fixed.
_SECOND_CLEAN_RULES = [
    (" the f ", ' the fuck '),
    (" *s* ", ' ass '),
    ("f*ed", "fucked"),
    ("f**ed", "fucked"),
    ("f*ckn*", 'fucking'),      # moved before "f*ck" (was unreachable)
    ("f*ck* ", 'fuck '),        # moved before "f*ck"; trailing space fixed
    ("f*ck", 'fuck'),
    ("f*n", 'fucking'),
    ("f**in*", 'fucking'),
    ("sh*te", "shit"),          # moved before "sh*t" (was unreachable)
    ("sh*t", "shit"),
    ("s**t", "shit"),
    ("lol", "laugh out loud"),
    ("wuz", "was"),
    (" wanna ", " want to "),
    (" won't ", " will not "),
    (" wont ", " will not "),
    (" isn't ", ' is not '),    # trailing space fixed (word-merge bug)
    (" ii ", ' two '),
    ("yall", 'you all'),
    ("y'all", 'you all'),
    ("let's", 'let us'),
    ("thats", "that is"),
    ("lets", 'let us'),
    ("ain't", 'not'),
    ("aint", 'not'),
    ("can't", 'can not'),       # must precede the generic "n't" rule
    ("n't", ' not'),
    ("i'm", 'i am'),
    (" dont ", ' do not '),     # surrounding spaces fixed (word-merge bug)
    ("didnt", 'did not'),
    ("doesnt", 'does not'),
    (" isnt ", ' is not '),
    (" cant ", ' can not '),
    (" im ", ' i am '),         # surrounding spaces fixed (word-merge bug)
    ("'re", ' are'),
    ("'ll", ' will'),
    ("'ve", ' have'),
    (" da ", " the "),
    (" imo ", ' in my opinion '),
    (" og ", ' original '),
    (" ya ", ' you '),
    (" ppl ", " people "),
    (" nota ", " not a "),
    (" cuz ", " cause "),
    (" wth ", " what the heck "),
    ("f*k", "fuck"),
    ("f k", "fuck"),
    ("d*k", "dick"),
    (" i m ", " i am "),
    (" gg ", " glory glory "),
    (" btw ", " by the way "),
    (" ill ", " i will "),
    (" af ", " as fuck "),
    (" idk ", " i do not know "),
    ("ffs", "for fuck sake"),
    (" tho ", " though "),
    (" tf ", " the fuck "),
    (" bs ", " bullshit "),
    (" smh ", " shaking my head "),
    (" dei ", " diversity, equity, and inclusion "),
    ("tha f", 'the fuck'),
]
# Dropped as provably dead (earlier rules always rewrite their input first):
# (" i'm", ...) after ("i'm", ...); ("f*cked"/"f*ked", ...) after
# ("f*ck"/"f*k", ...).


def second_clean(text):
    """Second cleaning pass: expand contractions, slang and obfuscated swearing."""
    text = tear_Is(text)
    text = text.strip()
    for old, new in _SECOND_CLEAN_RULES:
        text = text.replace(old, new)
    return text


def third_clean(text):
    """Final cleaning pass: strip punctuation, fix leftovers, squeeze spaces."""
    text = replace_punctuations(text)  # remove all punctuation
    text = text.replace(" rip ", ' rest in peace ')
    text = text.replace(" im ", " i am ")
    text = text.replace(" don t ", " do not ")
    text = text.replace(" iwill ", " i will ")
    text = text.replace(" st ", " first ")
    text = text.replace(" u ", " you ")
    text = text.replace(" the f ", ' the fuck ')
    text = text.replace(" f ck ", ' fuck ')   # duplicate rule removed
    text = text.replace(" f k ", ' fuck ')
    text = text.replace(" f king", ' fucking ')
    text = text.replace(" f it ", ' fuck it ')
    text = text.strip()                       # trim leading/trailing space
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)  # remove special characters
    text = ' '.join(text.split())             # collapse runs of whitespace
    return text


def complete_clean(text):
    """Run all three cleaning passes over *text* and return the result."""
    text = first_clean(text)
    text = second_clean(text)
    text = third_clean(text)
    return text.strip()


def tokenize(texts, tokenizer, max_length=512):
    """Tokenize an iterable of strings into padded/truncated TF tensors."""
    return tokenizer(
        list(texts),
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="tf",
    )


def predict(sentence, model, tokenizer, distilbert=False):
    """Classify *sentence* and return (label, confidence-in-percent).

    DistilBERT tokenizers produce no token_type_ids, so the DistilBERT model
    is called with only input_ids and attention_mask.
    """
    labels = ['Negative', 'Neutral', 'Positive']
    cleaned = complete_clean(sentence)
    tokenized = tokenize(texts=np.array([cleaned]), tokenizer=tokenizer)
    if distilbert:
        predictions = model([
            tokenized['input_ids'],
            tokenized['attention_mask'],
        ])
    else:
        predictions = model([
            tokenized['input_ids'],
            tokenized['token_type_ids'],
            tokenized['attention_mask'],
        ])
    scores = np.array(predictions[0])  # scores for the single input sentence
    label = labels[np.argmax(scores)]
    confidence = np.max(scores) * 100
    return label, confidence


# ---------------------------------------------------------------------------
# Streamlit UI
# ---------------------------------------------------------------------------
st.title("Sentiment Analysis with HuggingFace Spaces")

option = st.selectbox(
    'Choose a Model:',
    ['ALBERT-Base', 'DistilBERT-Base', 'BERT-Base']
)

# Per-model [tokenizer_dir, saved_model_dir] paths.
directory_dict = {
    "ALBERT-Base": ["src/models/albert_base/albert_tokenizer",
                    "src/models/albert_base/albert_sentiment_model"],
    "DistilBERT-Base": ["src/models/distilbert/distilbert_tokenizer",
                        "src/models/distilbert/distilbert_sentiment_model"],
    "BERT-Base": ["src/models/bert_base/bert_base_tokenizer",
                  "src/models/bert_base/bert_base_sentiment_model"],
}

chosen_model = directory_dict[option]
distilbert = option == 'DistilBERT-Base'


@st.cache_resource
def _load_model_and_tokenizer(tokenizer_dir, model_dir):
    """Load tokenizer and saved model once per path pair.

    Cached so Streamlit does not reload the model from disk on every rerun.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, local_files_only=True)
    model = tf.saved_model.load(model_dir)
    return tokenizer, model


loaded_tokenizer, loaded_model = _load_model_and_tokenizer(
    chosen_model[0], chosen_model[1]
)

st.write("Enter a sentence to analyze its sentiment:")
user_input = st.text_input("")

if user_input:
    result, confidence = predict(
        user_input,
        model=loaded_model,
        tokenizer=loaded_tokenizer,
        distilbert=distilbert,
    )
    st.write(f"Model Chosen : {option}")
    st.write(f"Sentiment : {result}")
    st.write(f"Confidence : {confidence:.2f}%")