Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import numpy as np | |
| import re, emoji, string, os | |
| os.environ['TF_USE_LEGACY_KERAS'] = '1' | |
| from transformers import TFAutoModel, AutoTokenizer | |
| import tensorflow as tf | |
def clear_emoji(text):
    """Return *text* with every emoji substituted by a single space."""
    cleaned = emoji.replace_emoji(text, ' ')
    return cleaned
def casefold(text):
    """Lower-case *text*.

    Uses str.lower() (not str.casefold()) to preserve the exact mapping
    the rest of the cleaning pipeline was built against.
    """
    lowered = text.lower()
    return lowered
# Translation table mapping every ASCII punctuation character to a space,
# built once at import time instead of rescanning per call.
_PUNCTUATION_TABLE = str.maketrans({ch: ' ' for ch in string.punctuation})

def replace_punctuations(text):
    """Return *text* with each ASCII punctuation character replaced by a space.

    The original implementation looped over every character and called
    ``str.replace`` repeatedly (O(n*k) rescans); ``str.translate`` does the
    same substitution in a single C-level pass with identical results.
    """
    return text.translate(_PUNCTUATION_TABLE)
def tear_Is(text):
    """Expand "<subject>'s" contractions into "<subject> is".

    Applies the expansions in a fixed order; ``str.replace`` is a no-op
    when the pattern is absent, so no membership check is needed.
    """
    subjects = ('that', 'this', 'there', 'he', 'she', 'it', 'what',
                'who', 'when', 'where', 'how', 'everyone')
    for subject in subjects:
        text = text.replace(f"{subject}'s", f"{subject} is")
    return text
def first_clean(text):
    """First normalisation pass.

    Strips emoji, lower-cases, rewrites "<digits>K" as " thousand ",
    removes numbers, links, hashtags and mentions, normalises curly/acute
    apostrophes to straight quotes, expands "'d", and flattens dashes and
    newlines into spaces.
    """
    text = clear_emoji(text)
    text = casefold(text)
    # "12k" -> " thousand " (the digits themselves are dropped just below).
    # NOTE: the original pattern used the redundant quantifier K{1}.
    text = re.sub(r'\b\d+K\b', ' thousand ', text, flags=re.IGNORECASE)
    text = re.sub(r'[0-9]+', ' ', text)           # remove numbers
    text = re.sub(r"http\S+", ' ', text)          # remove links with http
    # BUG FIX: the original r"www.+" used an unescaped '.' with a greedy '+',
    # which deleted EVERYTHING after the first "www" in the text, not just
    # the link. Only remove the www link token itself.
    text = re.sub(r"www\.\S+", ' ', text)         # remove links with www
    text = re.sub(r'#[a-zA-Z0-9]+', ' ', text)    # remove hashtags
    text = re.sub(r'@[a-zA-Z0-9]+', ' ', text)    # remove mentions
    # Normalise curly / acute apostrophes to a straight quote.
    # (The original also "replaced" a straight quote with itself — a no-op,
    # removed here.)
    text = text.replace("’", "'")
    text = text.replace("´", "'")
    text = text.replace("'d", " had")
    text = text.replace("-", " ")
    text = text.replace('\n', ' ')  # replace new line with space
    return text
# Ordered slang / contraction / censored-profanity expansions for the
# second pass. ORDER MATTERS: specific patterns (e.g. " won't ") must run
# before generic ones (e.g. "n't"), so the pairs are applied strictly in
# sequence. Some later entries are shadowed by earlier ones and kept only
# to mirror the original chain exactly.
_SECOND_PASS_REPLACEMENTS = (
    (" the f ", ' the fuck '),
    (" *s* ", ' ass '),
    ("f*ed", "fucked"),
    ("f**ed", "fucked"),
    ("f*ck", 'fuck'),
    ("f*ck* ", 'fuck'),
    ("f*n", 'fucking'),
    ("f*ckn*", 'fucking'),
    ("f**in*", 'fucking'),
    ("sh*t", "shit"),
    ("sh*te", "shit"),
    ("s**t", "shit"),
    ("lol", "laugh out loud"),
    ("wuz", "was"),
    (" wanna ", " want to "),
    (" won't ", " will not "),
    (" wont ", " will not "),
    (" isn't ", ' is not'),
    (" ii ", ' two '),
    ("yall", 'you all'),
    ("y'all", 'you all'),
    ("let's", 'let us'),
    ("thats", "that is"),
    ("lets", 'let us'),
    ("ain't", 'not'),
    ("aint", 'not'),
    ("can't", 'can not'),
    ("n't", ' not'),
    ("i'm", 'i am'),
    (" i'm", 'i am'),
    (" dont ", 'do not'),
    ("didnt", 'did not'),
    ("doesnt", 'does not'),
    (" isnt ", ' is not '),
    (" cant ", ' can not '),
    (" im ", 'i am'),
    ("'re", ' are'),
    ("'ll", ' will'),
    ("'ve", ' have'),
    (" da ", " the "),
    (" imo ", ' in my opinion '),
    (" og ", ' original '),
    (" ya ", ' you '),
    (" ppl ", " people "),
    (" nota ", " not a "),
    (" cuz ", " cause "),
    (" wth ", " what the heck "),
    ("f*k", "fuck"),
    ("f k", "fuck"),
    ("d*k", "dick"),
    (" i m ", " i am "),
    (" gg ", " glory glory "),
    (" btw ", " by the way "),
    (" ill ", " i will "),
    (" af ", " as fuck "),
    (" idk ", " i do not know "),
    ("ffs", "for fuck sake"),
    (" tho ", " though "),
    (" tf ", " the fuck "),
    (" bs ", " bullshit "),
    (" smh ", " shaking my head "),
    (" dei ", " diversity, equity, and inclusion "),
    ("f*cked", 'fucked'),
    ("f*ked", 'fucked'),
    ("tha f", 'the fuck'),
)

def second_clean(text):
    """Second normalisation pass.

    Expands "<subject>'s" contractions, trims surrounding whitespace, then
    applies the ordered slang/contraction replacement table above.
    """
    text = tear_Is(text)
    text = text.strip()
    for old, new in _SECOND_PASS_REPLACEMENTS:
        text = text.replace(old, new)
    return text
# Ordered cleanup pairs applied after punctuation removal. The duplicate
# " f ck " entry from the original chain is kept for fidelity (the second
# occurrence is a no-op).
_THIRD_PASS_REPLACEMENTS = (
    (" rip ", ' rest in peace '),
    (" im ", " i am "),
    (" don t ", " do not "),
    (" iwill ", " i will "),
    (" st ", " first "),
    (" u ", " you "),
    (" the f ", ' the fuck '),
    (" f ck ", ' fuck '),
    (" f ck ", ' fuck '),
    (" f k ", ' fuck '),
    (" f king", ' fucking '),
    (" f it ", ' fuck it '),
)

def third_clean(text):
    """Final normalisation pass.

    Replaces punctuation with spaces, expands the leftover spaced-out
    slang above, strips special characters, and collapses runs of
    whitespace to single spaces.
    """
    text = replace_punctuations(text)  # remove all punctuations
    for old, new in _THIRD_PASS_REPLACEMENTS:
        text = text.replace(old, new)
    text = text.strip()  # delete space at the start and end of string
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)  # remove special characters
    return ' '.join(text.split())
def complete_clean(text):
    """Run the full three-stage cleaning pipeline over *text*."""
    for stage in (first_clean, second_clean, third_clean):
        text = stage(text)
    return text.strip()
def tokenize(texts, tokenizer, max_length=512):
    """Encode *texts* with *tokenizer* as padded, truncated TF tensors.

    *texts* may be any iterable of strings; it is materialised into a list
    before being handed to the tokenizer.
    """
    return tokenizer(
        list(texts),
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="tf",
    )
def predict(sentence, model, tokenizer, distilbert=False):
    """Classify the sentiment of *sentence*.

    Cleans and tokenizes the input, feeds it to *model* (DistilBERT models
    take no token_type_ids, hence the *distilbert* flag), and returns the
    predicted label together with its confidence as a percentage.
    """
    labels = ['Negative', 'Neutral', 'Positive']
    cleaned = complete_clean(sentence)
    encoded = tokenize(texts=np.array([cleaned]), tokenizer=tokenizer)
    if distilbert:
        inputs = [encoded['input_ids'], encoded['attention_mask']]
    else:
        inputs = [
            encoded['input_ids'],
            encoded['token_type_ids'],
            encoded['attention_mask'],
        ]
    scores = np.array(model(inputs)[0])
    label = labels[np.argmax(scores)]
    confidence = np.max(scores) * 100
    return label, confidence
st.title("Sentiment Analysis with HuggingFace Spaces")

option = st.selectbox(
    'Choose a Model:',
    ['ALBERT-Base', 'DistilBERT-Base', 'BERT-Base']
)

# Per-model [tokenizer_dir, saved_model_dir] lookup.
directory_dict = {
    "ALBERT-Base": ["src/models/albert_base/albert_tokenizer", "src/models/albert_base/albert_sentiment_model"],
    "DistilBERT-Base": ["src/models/distilbert/distilbert_tokenizer", "src/models/distilbert/distilbert_sentiment_model"],
    "BERT-Base": ["src/models/bert_base/bert_base_tokenizer", "src/models/bert_base/bert_base_sentiment_model"],
}
tokenizer_dir, model_dir = directory_dict[option]

# DistilBERT models take no token_type_ids input; predict() needs to know.
distilbert = (option == 'DistilBERT-Base')

loaded_tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, local_files_only=True)
loaded_model = tf.saved_model.load(model_dir)

st.write("Enter a sentence to analyze its sentiment:")
user_input = st.text_input("")
if user_input:
    result, confidence = predict(
        user_input,
        model=loaded_model,
        tokenizer=loaded_tokenizer,
        distilbert=distilbert,
    )
    st.write(f"Model Chosen : {option}")
    st.write(f"Sentiment : {result}")
    st.write(f"Confidence : {confidence:.2f}%")