|
|
import streamlit as st
|
|
|
import pandas as pd
|
|
|
import numpy as np
|
|
|
import matplotlib.pyplot as plt
|
|
|
import seaborn as sns
|
|
|
import tensorflow as tf
|
|
|
import regex as re
|
|
|
import joblib
|
|
|
from tensorflow.keras.utils import pad_sequences
|
|
|
import base64
|
|
|
from gensim.models import Word2Vec
|
|
|
from sklearn.decomposition import PCA
|
|
|
|
|
|
# Page title: large, white, centered heading rendered as raw HTML.
_TITLE_HTML = (
    '<p style="color:white; font-size:40px; text-align: center;">'
    'Harry Potter text generation app</p>'
)
st.markdown(_TITLE_HTML, unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def set_background_image(image_path):
    """
    Use an image file as the full-page background of the Streamlit app.

    The file is read as raw bytes, base64-encoded and embedded in a
    ``data:image/jpeg`` URI inside a ``<style>`` rule targeting ``.stApp``,
    which is then injected with ``st.markdown``.

    Parameters:
    - image_path: str, path to the image file (e.g., 'background.jpg')
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes).decode()

    # Assemble the stylesheet line by line; the leading and trailing empty
    # entries reproduce the newlines that wrap the <style> element.
    css_lines = [
        "",
        "<style>",
        ".stApp {",
        f'background-image: url("data:image/jpeg;base64,{encoded}");',
        "background-size: cover;",
        "background-position: center;",
        "background-attachment: fixed;",
        "}",
        "</style>",
        "",
    ]
    st.markdown("\n".join(css_lines), unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
# Apply the page background and register the app logo.
set_background_image("hp_background.jpg")
st.logo("logo.png", size="large")

# Short description shown under the title (small, white, centered).
des = (
    "This app takes sample input from user and\n"
    "generate number of words from harry potter books\n"
    "as given by user"
)
_description_html = (
    f'<p style="color:white; font-size:15px; text-align: center;">{des}</p>'
)
st.markdown(_description_html, unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
@st.cache_resource
def cache_model(tf_model_add, tk_add, w2v_add):
    """Load the three artifacts the app needs and return them together.

    Parameters:
    - tf_model_add: path passed to ``tf.keras.models.load_model``
    - tk_add: path passed to ``joblib.load`` (the tokenizer)
    - w2v_add: path passed to ``Word2Vec.load``

    Returns:
    - tuple ``(model, tokenizer, word2vec_model)`` in that order

    The ``st.cache_resource`` decorator lets Streamlit reuse the loaded
    objects instead of reloading them on every script run.
    """
    return (
        tf.keras.models.load_model(tf_model_add),
        joblib.load(tk_add),
        Word2Vec.load(w2v_add),
    )
|
|
|
|
|
|
# Locations of the saved Keras model, tokenizer and Word2Vec model.
tf_model_add, tk_add, w2v_add = (
    "hp_model.keras",
    "tokenizer.joblib",
    "word2vec_model.model",
)

# Load (or fetch from Streamlit's resource cache) all three artifacts.
model, tk, wv_model = cache_model(tf_model_add, tk_add, w2v_add)
|
|
|
|
|
|
# Sidebar: Word2Vec helpers (similar-word lookup and vector plot selection).
# NOTE(review): the original indentation was lost; the multiselect and the
# "Draw vectors" button are assumed to belong to the sidebar — confirm.
with st.sidebar:
    chr_name = st.text_input("Enter a character name to get top 5 similar characters")

    if chr_name:
        try:
            # Only the vocabulary lookup is guarded: an unknown word raises
            # KeyError. Catching everything (bare ``except``) would also
            # swallow unrelated errors and Streamlit's own control-flow
            # exceptions.
            similar = wv_model.wv.most_similar(chr_name.strip().lower(), topn=5)
        except KeyError:
            st.write("Please enter a valid character name")
        else:
            # Each entry is a (word, similarity) pair; only the word is shown.
            for name, _score in similar:
                st.markdown("- " + name)

    vocabulary = wv_model.wv.key_to_index

    # Keep only defaults that exist in the vocabulary, since multiselect
    # defaults must be among the offered options.
    default_names = [
        name
        for name in ["harry", "ron", "voldemort", "dobby", "elf"]
        if name in vocabulary
    ]

    chrs = st.multiselect(
        "Select names to draw there vectors",
        sorted(vocabulary.keys(), reverse=True),
        default_names,
    )

    draw_vector_pressed = st.button("Draw vectors")
|
|
|
|
|
|
|
|
|
|
|
|
# Main-area inputs: the seed text and how many words to append to it.
text = st.text_input("Enter Sample text to generate data")

num_words = st.number_input(
    "Enter number of words to generate by model: ",
    min_value=1,
    max_value=50,
    value=5,
    step=1,
)
|
|
|
|
|
|
|
|
|
|
|
|
def clean_text(book):
    """Normalize raw text before it is tokenized.

    Steps, in order:
    1. lowercase everything;
    2. replace page-header runs of the form
       ``page | <number> harry potter ... rowling`` with a single space;
    3. delete every character that is not a letter, digit, space or period
       (this also removes newlines and tabs, joining the text around them);
    4. collapse runs of two or more whitespace characters into one space.

    Parameters:
    - book: str, the text to clean

    Returns:
    - str, the cleaned text (leading/trailing single spaces are kept)
    """
    book = book.lower()

    # Page headers; ``.*?`` is non-greedy and does not cross newlines.
    header_regex = r"page\s*\|\s*\d+\s*harry potter.*?rowling"
    book = re.sub(header_regex, " ", book)

    # The text is already lowercase, so only a-z needs to be allowed.
    allowed_regex = r"[^a-z0-9 .]+"
    book = re.sub(allowed_regex, "", book)

    # Raw string: "\s" in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    space_regex = r"\s\s+"
    book = re.sub(space_regex, " ", book)

    return book
|
|
|
|
|
|
# Reverse lookup for the tokenizer: token id -> word.
index_word = {token_id: word for word, token_id in tk.word_index.items()}
|
|
|
|
|
|
def next_word(test, maxlen=192):
    """Predict the most likely next word for a piece of text.

    The text is cleaned with ``clean_text``, converted to token ids with the
    tokenizer ``tk``, left-padded to ``maxlen`` and passed to ``model``; the
    id with the highest predicted score is mapped back to a word through
    ``index_word``.

    Parameters:
    - test: str, the text to continue
    - maxlen: int, padded sequence length fed to the model. Defaults to 192,
      the value previously hard-coded here (presumably the length the model
      was trained with — confirm before changing).

    Returns:
    - str, the predicted word, or an empty string when the predicted id has
      no entry in ``index_word`` (for example id 0, which padding uses and
      the tokenizer vocabulary does not contain).
    """
    cleaned = clean_text(test)
    token_ids = tk.texts_to_sequences([cleaned])
    padded = pad_sequences(token_ids, maxlen=maxlen, padding="pre")

    # verbose=0 keeps Keras from printing a progress bar on every call; this
    # function is invoked once per generated word.
    probabilities = model.predict(padded, verbose=0)
    best_index = int(np.argmax(probabilities, axis=-1)[0])

    # .get avoids a KeyError if the model picks an id outside the vocabulary.
    return index_word.get(best_index, "")
|
|
|
|
|
|
# Generate text when the user presses "Submit".
# NOTE(review): the original indentation was lost; the final st.write is
# assumed to run once after the loop, not once per generated word — confirm.
if st.button("Submit"):
    # A prompt made only of whitespace carries no words, so treat it the
    # same as an empty prompt.
    if not text.strip():
        st.write("#### Please enter text to generate words")
    else:
        # Feed the growing text back in so each prediction sees the words
        # generated so far.
        for _ in range(num_words):
            text = text + " " + next_word(text)

        st.write(text)
|
|
|
|
|
|
|
|
|
# Plot the selected words' Word2Vec vectors projected onto two principal
# components when the "Draw vectors" button was pressed.
if draw_vector_pressed:
    if len(chrs) >= 2:
        # One row per selected word, one column per embedding dimension.
        chr_df = pd.DataFrame(data=wv_model.wv[chrs], index=chrs)

        pca = PCA(n_components=2)
        pca_array = pca.fit_transform(chr_df)

        # reset_index() turns the word labels into an "index" column, which
        # the chart uses to colour the points.
        df_pca = pd.DataFrame(
            pca_array, index=chr_df.index, columns=["pc1", "pc2"]
        ).reset_index()

        st.write("### Vector diagram for characters")
        st.scatter_chart(df_pca, x="pc1", y="pc2", color="index")
    elif chrs:
        # A 2-component PCA cannot be fitted on a single sample; it raises
        # ValueError, so ask for more input instead of crashing.
        st.write("Please select at least two characters to draw vectors")
    else:
        st.write("Please select characters to draw vectors")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|