# importing Libraries
import re
import string

import nltk
import numpy as np
import pandas as pd
import PIL
import streamlit as st
import tensorflow as tf
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from PIL import Image

# Page configuration is issued before anything else touches Streamlit so that
# it stays the first Streamlit command of the script run.
st.set_page_config(layout="wide")

try:
    # Check if wordnet is installed
    nltk.data.find("corpora/wordnet.zip")
except LookupError:
    nltk.download('wordnet')

# ----------------------------------------------------------------------------------
# read files


@st.cache_data
def _load_lookup_tables():
    """Read the acronym, contraction and stop-word files from disk.

    Cached by Streamlit so the files are parsed once rather than on every
    script rerun.

    Returns:
        A tuple ``(acronyms, contractions, stopwords)`` where the first two
        are pandas Series read from JSON and the last is a flat list of the
        values found in ``stopwords.csv``.
    """
    acronyms = pd.read_json("acronym.json", typ="series")
    contractions = pd.read_json("contractions.json", typ="series")
    stopwords = list(pd.read_csv('stopwords.csv').values.flatten())
    return acronyms, contractions, stopwords


acronyms_dict, contractions_dict, stops = _load_lookup_tables()

# ----------------------------------------------------------------------------------
# Defining tokenizer
regexp = RegexpTokenizer(r"[\w']+")

# Pre-built helpers used by preprocess(); created once instead of per call.
_STOPWORDS = frozenset(stops)
_LEMMATIZER = WordNetLemmatizer()
_HTML_TAG_RE = re.compile(r'<.*?>')
_EMOJI_RE = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)
_URL_RE = re.compile(r"(https?://\S+|www\.\S+)")
_USERNAME_RE = re.compile(r'@[\w_]+')
_REPEATED_CHAR_WORD_RE = re.compile(r'\b\w*?(.)\1{2,}\w*\b')
_SHORT_WORD_RE = re.compile(r'\b\w{1,2}\b')
# First pass keeps "'" (needed to match contractions) and "-" (turned into a
# space afterwards); the second pass strips every punctuation character.
_PUNCT_DIGITS_TABLE = str.maketrans(
    '', '', (string.punctuation + string.digits).replace("'", "").replace("-", "")
)
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
_ALLOWED_CHARS = frozenset(string.ascii_letters + " ")


def _expand_tokens(text, mapping):
    """Replace every token of *text* found in *mapping*'s index by its expansion."""
    words = []
    for word in regexp.tokenize(text):
        if word in mapping.index:
            words.extend(mapping[word].split())
        else:
            words.append(word)
    return " ".join(words)


# preprocess Function
def preprocess(text):
    """Clean a raw tweet into the plain-text form passed to the model.

    Steps, in order: lowercase and strip; drop HTML tags, emoji, URLs and
    @usernames; drop punctuation and digits; expand acronyms and
    contractions; lemmatize; drop stop words; keep only ASCII letters and
    spaces; drop words containing a character repeated three or more times
    in a row; drop words of one or two characters.

    Args:
        text: The raw tweet string.

    Returns:
        The cleaned string. Removed words leave their surrounding spaces
        behind, so the result may contain consecutive or trailing spaces.
    """
    text = text.lower().strip()  # lowercase, outer whitespace

    text = _HTML_TAG_RE.sub('', text)  # html tags
    text = _EMOJI_RE.sub('', text)  # emoji / pictographs
    text = _URL_RE.sub('', text)  # urls
    text = _USERNAME_RE.sub('', text)  # @twitter usernames

    text = text.translate(_PUNCT_DIGITS_TABLE)  # punctuation and numbers
    text = text.replace("-", " ")  # "-" becomes a word separator

    text = _expand_tokens(text, acronyms_dict)  # acronyms
    text = _expand_tokens(text, contractions_dict)  # contractions

    # punctuation again to remove "'"
    text = text.translate(_PUNCT_TABLE)

    # lemmatization
    text = " ".join(_LEMMATIZER.lemmatize(word) for word in regexp.tokenize(text))

    # Stopwords Removal
    text = ' '.join(word for word in regexp.tokenize(text) if word not in _STOPWORDS)

    # Removing all characters except alphabets and " " (space)
    text = "".join(char for char in text if char in _ALLOWED_CHARS)

    # Removing words in which one character occurs three or more times in a row.
    # NOTE: "." also matches a space, so a run of three or more spaces left by
    # the previous step is matched together with the words touching it.
    text = _REPEATED_CHAR_WORD_RE.sub("", text).strip()

    # Removing words with less than 3 characters
    text = _SHORT_WORD_RE.sub("", text)

    # return final output
    return text


# ===============================================================================================================
# STREAMLIT
# App Development Starts
st.write("# Disaster Tweet Predictor")
img = Image.open("dis_image.png")
st.image(img)
tweet = st.text_input(label="Enter or paste your tweet here", value="")


# Defining a function to store the model in streamlit cache memory
@st.cache_resource
def cache_model(model_name):
    """Load the Keras model saved at *model_name*; cached by Streamlit."""
    return tf.keras.models.load_model(model_name)


model = cache_model("transfer_tweet")

# if user gives any input
if len(tweet) > 0:
    clean_tweet = preprocess(tweet)  # cleans tweet
    y_pred = model.predict([clean_tweet])  # gives probability of class = 1
    y_pred_num = int(np.round(y_pred)[0][0])  # get final prediction of output class
    label = "Non-Disaster" if y_pred_num == 0 else "Disaster"
    st.write(f"#### {label} tweet with disaster probability {round(y_pred[0][0]*100, 4)}%")
# ==============================================================================================================