"""Streamlit app: Harry Potter next-word generation and word-vector explorer."""

import base64
import mimetypes

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import regex as re
import seaborn as sns
import streamlit as st
import tensorflow as tf
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from tensorflow.keras.utils import pad_sequences

# Length every tokenised prompt is left-padded/truncated to before prediction.
MAX_SEQUENCE_LEN = 192

# Patterns used by clean_text, compiled once at import time.
_PAGE_HEADER_RE = re.compile(r"page\s*\|\s*\d+\s*harry potter.*?rowling")
_DISALLOWED_CHARS_RE = re.compile(r"[^a-zA-Z0-9 .]+")
_MULTI_SPACE_RE = re.compile(r"\s\s+")

# NOTE(review): unsafe_allow_html is set but this string carries no markup;
# confirm whether an HTML wrapper (e.g. a heading tag) was intended.
st.markdown(
    "\nHarry Potter text generation app\n",
    unsafe_allow_html=True,
)


# Function to set the background image
def set_background_image(image_path):
    """
    Set a background image in the Streamlit app using base64 encoding.

    Parameters:
    - image_path: str, path to the image file (e.g., 'background.jpg')
    """
    # Read and encode the image
    with open(image_path, "rb") as image_file:
        base64_image = base64.b64encode(image_file.read()).decode()

    # Fall back to JPEG when the extension is unknown.
    mime_type = mimetypes.guess_type(image_path)[0] or "image/jpeg"

    # Create the CSS for the background. The encoded image is embedded as a
    # data URI so no static file serving is required.
    # NOTE(review): the style rules below are a minimal full-page background;
    # adjust selector/properties if a different look was intended.
    background_css = f"""
    <style>
    .stApp {{
        background-image: url("data:{mime_type};base64,{base64_image}");
        background-size: cover;
        background-position: center;
        background-repeat: no-repeat;
        background-attachment: fixed;
    }}
    </style>
    """

    # Inject the CSS into the Streamlit app
    st.markdown(background_css, unsafe_allow_html=True)


# Set the background image
set_background_image("hp_background.jpg")
st.logo("logo.png", size="large")

des = '''This app takes sample input from user and generate number of words from harry potter books as given by user'''
st.markdown(
    f"{des}\n",
    unsafe_allow_html=True,
)


# load model
@st.cache_resource
def cache_model(tf_model_add, tk_add, w2v_add):
    """
    Load the Keras model, the tokenizer and the Word2Vec model.

    Decorated with st.cache_resource, so the loaded objects are reused
    instead of being reloaded on every call with the same arguments.

    Parameters:
    - tf_model_add: str, path to the saved Keras model
    - tk_add: str, path to the joblib-serialised tokenizer
    - w2v_add: str, path to the saved gensim Word2Vec model

    Returns:
    - tuple (model, tk, wv_model)
    """
    model = tf.keras.models.load_model(tf_model_add)
    tk = joblib.load(tk_add)
    wv_model = Word2Vec.load(w2v_add)
    return model, tk, wv_model


tf_model_add = "hp_model.keras"
tk_add = "tokenizer.joblib"
w2v_add = "word2vec_model.model"
model, tk, wv_model = cache_model(tf_model_add, tk_add, w2v_add)

with st.sidebar:
    chr_name = st.text_input("Enter a character name to get top 5 similar characters")
    query = chr_name.strip().lower()
    if query:
        try:
            result = [word for word, _ in wv_model.wv.most_similar(query, topn=5)]
        except KeyError:
            # Raised by gensim when the word is not in the vocabulary.
            st.write("Please enter a valid character name")
        else:
            for j in result:
                st.markdown("- " + j)

    vocabulary = wv_model.wv.key_to_index
    # Streamlit rejects defaults that are not among the options, so keep only
    # the suggested names that actually exist in the vocabulary.
    default_chrs = [
        name
        for name in ["harry", "ron", "voldemort", "dobby", "elf"]
        if name in vocabulary
    ]
    chrs = st.multiselect(
        "Select names to draw there vectors",
        sorted(vocabulary, reverse=True),
        default_chrs,
    )
    draw_vector_pressed = st.button("Draw vectors")

text = st.text_input("Enter Sample text to generate data")
num_words = st.number_input(
    "Enter number of words to generate by model: ",
    min_value=1,
    max_value=50,
    step=1,
    value=5,
)


def clean_text(book):
    """
    Normalise raw text before tokenisation.

    Lower-cases the text, replaces page-header runs matching
    "page | <n> harry potter ... rowling" with a space, drops every character
    that is not a letter, digit, space or full stop, and collapses runs of
    whitespace into a single space.

    Parameters:
    - book: str, raw text

    Returns:
    - str, cleaned text
    """
    book = book.lower()
    book = _PAGE_HEADER_RE.sub(" ", book)
    book = _DISALLOWED_CHARS_RE.sub("", book)
    book = _MULTI_SPACE_RE.sub(" ", book)
    return book


# Reverse lookup: token id -> word.
index_word = {v: k for k, v in tk.word_index.items()}


def next_word(test):
    """
    Predict the word that follows the given text.

    Parameters:
    - test: str, prompt text

    Returns:
    - str, the word whose token id has the highest predicted score, or an
      empty string when that id has no entry in the tokenizer vocabulary
    """
    test_clean = clean_text(test)
    test_token = tk.texts_to_sequences([test_clean])
    pad_test = pad_sequences(test_token, maxlen=MAX_SEQUENCE_LEN, padding="pre")
    # verbose=0 keeps Keras from printing a progress bar for every word.
    y_pred_prob = model.predict(pad_test, verbose=0)
    y_pred_ind = np.argmax(y_pred_prob, axis=-1)
    # An id missing from index_word would otherwise raise KeyError.
    return index_word.get(int(y_pred_ind[0]), "")


if st.button("Submit"):
    if not text.strip():
        st.write("#### Please enter text to generate words")
    else:
        # Feed each predicted word back in as part of the next prompt.
        for _ in range(num_words):
            word = next_word(text)
            text = text + " " + word
        st.write(text)

if draw_vector_pressed:
    if not chrs:
        st.write("Please select characters to draw vectors")
    elif len(chrs) < 2:
        # PCA cannot produce two components from a single sample.
        st.write("Please select at least two characters to draw vectors")
    else:
        chr_df = pd.DataFrame(data=wv_model.wv[chrs], index=chrs)
        pca = PCA(n_components=2)
        pca_array = pca.fit_transform(chr_df)
        df_pca = pd.DataFrame(
            pca_array, index=chr_df.index, columns=["pc1", "pc2"]
        ).reset_index()
        st.write("### Vector diagram for characters")
        st.scatter_chart(df_pca, x="pc1", y="pc2", color="index")