# prithush's picture
# Upload 11 files
# 59adbc8
# importing Libraries
import streamlit as st
import PIL
from PIL import Image
import tensorflow as tf
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import re
import string
import numpy as np
import pandas as pd
import nltk
# The WordNet corpus is required by the WordNetLemmatizer used in preprocess();
# download it only when it is not already present in an NLTK data directory.
try:
    # Use the documented nltk.data.find entry point rather than the top-level
    # alias; it raises LookupError when the resource cannot be located.
    nltk.data.find("corpora/wordnet.zip")
except LookupError:
    nltk.download('wordnet')
# ----------------------------------------------------------------------------------
# read files
# Streamlit re-executes this script from the top on every interaction, so a
# "skip if the names already exist" guard never fires here. Cache the parsed
# tables instead, the same way the model is cached further down.
# show_spinner=False keeps the cached call from emitting any UI element before
# st.set_page_config() runs later in the script.
@st.cache_data(show_spinner=False)
def _load_lookup_tables():
    """Read the acronym, contraction and stop-word tables from disk.

    Returns:
        A 3-tuple ``(acronyms, contractions, stop_words)``: two pandas Series
        read from ``acronym.json`` and ``contractions.json`` (``typ="series"``)
        and a flat list of every cell in ``stopwords.csv``.
    """
    acronyms = pd.read_json("acronym.json", typ="series")
    contractions = pd.read_json("contractions.json", typ="series")
    stop_words = list(pd.read_csv("stopwords.csv").values.flatten())
    return acronyms, contractions, stop_words


acronyms_dict, contractions_dict, stops = _load_lookup_tables()
# ----------------------------------------------------------------------------------
# Defining tokenizer
# A token is a run of word characters and apostrophes; apostrophes are kept so
# that contractions stay in one piece until they are expanded in preprocess().
# Raw string: "\w" is not a valid escape in an ordinary string literal.
regexp = RegexpTokenizer(r"[\w']+")
# preprocess Function
# Patterns and translation tables are built once at import time instead of on
# every call to preprocess().
_HTML_TAG_RE = re.compile(r'<.*?>')
_EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)
# Raw string: "\S" and "\." are not valid escapes in an ordinary literal.
_URL_RE = re.compile(r"https?://\S+|www\.\S+")
_USERNAME_RE = re.compile(r'@[\w_]+')
# A whole word containing the same character three or more times in a row.
_REPEATED_CHAR_WORD_RE = re.compile(r'\b\w*?(.)\1{2,}\w*\b')
# A whole word of one or two characters.
_SHORT_WORD_RE = re.compile(r'\b\w{1,2}\b')

# First pass drops punctuation and digits but keeps "'" (still needed to match
# contractions) and "-" (replaced by a space afterwards, not deleted).
_PUNCT_DIGITS_TABLE = str.maketrans(
    '', '',
    (string.punctuation + string.digits).replace("'", "").replace("-", ""),
)
# Second pass, after contractions are expanded, drops all punctuation.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
_ALLOWED_CHARS = frozenset(string.ascii_letters + " ")


def _expand_tokens(text, mapping):
    """Replace every token of *text* that is a label of *mapping* by its expansion.

    *mapping* is indexed like the pandas Series loaded for acronyms and
    contractions: ``token in mapping.index`` tests membership and
    ``mapping[token]`` is the replacement string, which may hold several words.
    Tokens are re-joined with single spaces.
    """
    expanded = []
    for token in regexp.tokenize(text):
        if token in mapping.index:
            expanded.extend(mapping[token].split())
        else:
            expanded.extend(token.split())
    return " ".join(expanded)


def preprocess(text: str) -> str:
    """Clean a raw tweet into the text form passed to the model.

    Steps, in order:
      1. lowercase and strip surrounding whitespace;
      2. remove HTML tags, emoji, URLs and @usernames;
      3. remove punctuation and digits except "'" and "-", then turn "-" into
         a space;
      4. expand acronyms, then contractions, using ``acronyms_dict`` and
         ``contractions_dict``;
      5. remove the remaining punctuation (including "'");
      6. lemmatize each token with WordNet and drop tokens found in ``stops``;
      7. keep only ASCII letters and spaces;
      8. remove words containing a character repeated three or more times in
         a row, then remove words of one or two characters.

    Args:
        text: The raw tweet.

    Returns:
        The cleaned text. Because short words are removed after the last
        ``strip()``, the result can contain consecutive, leading or trailing
        spaces.

    NOTE(review): the output is deliberately kept identical to the previous
    implementation on the assumption that the saved model was trained on text
    cleaned this way -- confirm before changing any step.
    """
    text = text.lower().strip()

    text = _HTML_TAG_RE.sub("", text)
    text = _EMOJI_RE.sub("", text)
    text = _URL_RE.sub("", text)
    text = _USERNAME_RE.sub("", text)

    text = text.translate(_PUNCT_DIGITS_TABLE)
    text = text.replace("-", " ")

    text = _expand_tokens(text, acronyms_dict)
    text = _expand_tokens(text, contractions_dict)
    text = text.translate(_PUNCT_TABLE)

    lemmatizer = WordNetLemmatizer()
    text = " ".join(lemmatizer.lemmatize(word) for word in regexp.tokenize(text))
    text = " ".join(word for word in regexp.tokenize(text) if word not in stops)

    text = "".join(char for char in text if char in _ALLOWED_CHARS)

    text = _REPEATED_CHAR_WORD_RE.sub("", text).strip()
    text = _SHORT_WORD_RE.sub("", text)
    return text
# ===============================================================================================================
# STREAMLIT
# App Development Starts
# Wide layout and page heading.
st.set_page_config(layout="wide")
st.write("# Disaster Tweet Predictor")

# Banner image rendered directly under the heading.
img = Image.open("dis_image.png")
st.image(img)

# Text box for the tweet; it starts out as the empty string.
tweet = st.text_input("Enter or paste your tweet here", value="")
# Defining a function to store the model in streamlit cache memory
@st.cache_resource
def cache_model(model_name):
    """Load the saved Keras model at *model_name* and return it.

    The ``st.cache_resource`` decorator keeps the loaded model in Streamlit's
    cache so it is not loaded again for the same ``model_name``.
    """
    return tf.keras.models.load_model(model_name)
model = cache_model("transfer_tweet")

# Score the tweet only once the user has entered something.
if tweet:
    clean_tweet = preprocess(tweet)  # same cleaning as applied in preprocess()
    y_pred = model.predict([clean_tweet])  # probability of class = 1
    y_pred_num = int(np.round(y_pred)[0][0])  # rounded to the final class

    # Both outcomes show the same disaster probability; only the label differs.
    disaster_pct = round(y_pred[0][0]*100, 4)
    verdict = "Non-Disaster" if y_pred_num == 0 else "Disaster"
    st.write(f"#### {verdict} tweet with disaster probability {disaster_pct}%")
# ==============================================================================================================