GS123 committed on
Commit
29b7f2d
·
verified ·
1 Parent(s): 276bb84

Upload 7 files

Browse files
Files changed (8) hide show
  1. .gitattributes +1 -0
  2. app.py +161 -0
  3. hp_background.jpg +0 -0
  4. hp_model.keras +3 -0
  5. logo.png +0 -0
  6. requirements.txt +10 -0
  7. tokenizer.joblib +3 -0
  8. word2vec_model.model +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ hp_model.keras filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ import tensorflow as tf
7
+ import regex as re
8
+ import joblib
9
+ from tensorflow.keras.utils import pad_sequences
10
+ import base64
11
+ from gensim.models import Word2Vec
12
+ from sklearn.decomposition import PCA
13
+
14
+ st.markdown(
15
+ '<p style="color:white; font-size:40px; text-align: center;">Harry Potter text generation app</p>',
16
+ unsafe_allow_html=True
17
+ )
18
+
19
+
20
+
21
+ # Function to set the background image
22
def set_background_image(image_path):
    """
    Set a background image in the Streamlit app using base64 encoding.

    The image file is read, base64-encoded and embedded as a ``data:`` URL
    inside a CSS rule for the ``.stApp`` selector; the rule is injected into
    the page with ``st.markdown(..., unsafe_allow_html=True)``.

    Parameters:
    - image_path: str, path to the image file (e.g., 'background.jpg')

    Raises:
    - OSError: if the image file cannot be opened or read.
    """
    # Local import keeps this helper self-contained.
    import mimetypes

    # Derive the MIME type from the file extension so non-JPEG images are
    # labelled correctly; fall back to JPEG (the previous hard-coded value)
    # when the extension is unknown or not an image type.
    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    # Read and encode the image
    with open(image_path, "rb") as image_file:
        base64_image = base64.b64encode(image_file.read()).decode("ascii")

    # Create the CSS for the background. The markup is assembled without
    # leading indentation so it cannot be mistaken for an indented Markdown
    # code block.
    background_css = (
        "<style>\n"
        ".stApp {\n"
        f'    background-image: url("data:{mime_type};base64,{base64_image}");\n'
        "    background-size: cover;\n"
        "    background-position: center;\n"
        "    background-attachment: fixed;\n"
        "}\n"
        "</style>\n"
    )

    # Inject the CSS into the Streamlit app
    st.markdown(background_css, unsafe_allow_html=True)
46
+
47
+ # Set the background image
48
+ set_background_image("hp_background.jpg")
49
+
50
+
51
+ st.logo("logo.png", size = "large")
52
+
53
+ des = '''This app takes sample input from user and
54
+ generate number of words from harry potter books
55
+ as given by user'''
56
+ st.markdown(
57
+ f'<p style="color:white; font-size:15px; text-align: center;">{des}</p>',
58
+ unsafe_allow_html=True
59
+ )
60
+
61
+ # load model
62
@st.cache_resource
def cache_model(tf_model_add, tk_add, w2v_add):
    """
    Load the three artefacts the app needs and return them together.

    Decorated with ``st.cache_resource`` so the loaded objects can be reused
    instead of being reloaded from disk each time the function is called
    with the same arguments.

    Parameters:
    - tf_model_add: path passed to ``tf.keras.models.load_model``
    - tk_add: path passed to ``joblib.load`` (the tokenizer)
    - w2v_add: path passed to ``Word2Vec.load``

    Returns:
    - tuple ``(model, tokenizer, word2vec_model)`` in that order
    """
    # Tuple elements are evaluated left to right, so the load order is
    # Keras model, then tokenizer, then Word2Vec.
    return (
        tf.keras.models.load_model(tf_model_add),
        joblib.load(tk_add),
        Word2Vec.load(w2v_add),
    )
68
+
69
+ tf_model_add = "hp_model.keras"
70
+ tk_add = "tokenizer.joblib"
71
+ w2v_add = "word2vec_model.model"
72
+ model, tk, wv_model = cache_model(tf_model_add, tk_add, w2v_add)
73
+
74
with st.sidebar:
    chr_name = st.text_input("Enter a character name to get top 5 similar characters")
    if chr_name:
        try:
            # Only the vocabulary lookup is guarded: a missing key raises
            # KeyError. Any other failure should surface instead of being
            # reported as an invalid name.
            similar = wv_model.wv.most_similar(chr_name.lower(), topn=5)
        except KeyError:
            st.write("Please enter a valid character name")
        else:
            # Each entry is a (word, similarity) pair; only the word is shown.
            for similar_name, _score in similar:
                st.markdown("- " + similar_name)
85
+
86
+ chrs = st.multiselect(
87
+ "Select names to draw there vectors",
88
+ sorted(wv_model.wv.key_to_index.keys(), reverse = True),
89
+ ["harry", "ron", "voldemort", "dobby", "elf"]
90
+ )
91
+
92
+
93
+ draw_vector_pressed = st.button("Draw vectors")
94
+
95
+
96
+
97
+ text = st.text_input("Enter Sample text to generate data")
98
+ num_words = st.number_input("Enter number of words to generate by model: ",
99
+ min_value= 1, max_value= 50, step = 1,
100
+ value = 5)
101
+
102
+
103
+
104
def clean_text(book):
    """
    Normalise raw text before it is tokenised.

    Steps, in order:
    1. Lowercase the text.
    2. Replace each "page | <number> harry potter ... rowling" span with a
       single space.
    3. Delete every character that is not an ASCII letter, a digit, a space
       or a period. Tabs and newlines are deleted here rather than turned
       into spaces.
    4. Collapse runs of two or more whitespace characters into one space.

    Parameters:
    - book: str, the text to clean

    Returns:
    - str, the cleaned text (leading/trailing single spaces are kept)
    """
    book = book.lower()

    # All patterns are raw strings so that backslash sequences such as ``\s``
    # reach the regex engine untouched instead of being parsed as (invalid)
    # string escapes.
    header_regex = r"page\s*\|\s*\d+\s*harry potter.*?rowling"
    book = re.sub(header_regex, " ", book)

    # NOTE(review): this step presumably has to match the cleaning used when
    # the tokenizer/model were built, so it is left unchanged - confirm
    # before altering what gets stripped.
    alphabet_regex = r"[^a-zA-Z0-9 .]+"
    book = re.sub(alphabet_regex, "", book)

    space_regex = r"\s\s+"
    book = re.sub(space_regex, " ", book)
    return book
115
+
116
+ index_word = {v:k for k,v in tk.word_index.items()}
117
+
118
def next_word(test, maxlen=192):
    """
    Predict the single most likely next word for the given text.

    The text is cleaned with ``clean_text``, converted to a token sequence
    with ``tk``, left-padded to ``maxlen`` and passed to ``model``; the index
    with the highest predicted score is mapped back to a word through
    ``index_word``.

    Parameters:
    - test: str, the text to continue
    - maxlen: int, length the token sequence is padded to (default 192, the
      value previously hard-coded here; presumably the model's input
      length - confirm before changing)

    Returns:
    - str, the predicted word, or an empty string when the predicted index
      has no entry in ``index_word``
    """
    test_clean = clean_text(test)
    test_token = tk.texts_to_sequences([test_clean])
    pad_test = pad_sequences(test_token, maxlen=maxlen, padding="pre")

    # verbose=0 keeps Keras from printing a progress bar for every word.
    y_pred_prob = model.predict(pad_test, verbose=0)
    y_pred_ind = int(np.argmax(y_pred_prob, axis=-1)[0])

    # index_word is built from tk.word_index, so an index without a word
    # (presumably 0, the padding value) would raise KeyError on a plain
    # lookup; return an empty string instead.
    return index_word.get(y_pred_ind, "")
127
+
128
+ if st.button("Submit"):
129
+ if len(text) < 1:
130
+ st.write("#### Please enter text to generate words")
131
+ else:
132
+ for i in range(num_words):
133
+
134
+ word = next_word(text)
135
+ # print(test + " " + word)
136
+ text = text + " " + word
137
+
138
+ st.write(text)
139
+
140
+
141
# Project the selected word vectors onto two principal components and plot
# them when the "Draw vectors" button was pressed.
if draw_vector_pressed:
    if len(chrs) >= 2:
        # One row per selected name, one column per embedding dimension.
        chr_df = pd.DataFrame(data=wv_model.wv[chrs], index=chrs)

        # PCA with n_components=2 needs at least two samples, hence the
        # guard above; a single selection would raise ValueError.
        pca = PCA(n_components=2)
        pca_array = pca.fit_transform(chr_df)

        # reset_index() exposes the names as an "index" column so the chart
        # can colour points by name.
        df_pca = pd.DataFrame(
            pca_array, index=chr_df.index, columns=["pc1", "pc2"]
        ).reset_index()
        st.write("### Vector diagram for characters")
        st.scatter_chart(df_pca, x="pc1", y="pc2", color="index")
    elif len(chrs) == 1:
        st.write("Please select at least two characters to draw vectors")
    else:
        st.write("Please select characters to draw vectors")
157
+
158
+
159
+
160
+
161
+
hp_background.jpg ADDED
hp_model.keras ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49c35e22cc4c65f44e2eb1477eacd26c90d532c49700c3a43f983d00998eb514
3
+ size 17527806
logo.png ADDED
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ tensorflow==2.17.1
2
+ gensim==4.3.3
3
+ joblib==1.4.2
4
+ numpy==1.26.4
5
+ pandas==2.2.2
6
+ matplotlib==3.10.0
7
+ regex==2024.11.6
8
+ scikit-learn==1.6.0
9
+ seaborn==0.13.2
10
+ streamlit==1.41.1
tokenizer.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5098539c7180fbcd53ee121c55da7aa7f2fbd56d19b1a3bbd519aa00be296f4e
3
+ size 234653
word2vec_model.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b33706aa2f2b2b322194457cf95341ad9ea61bbe473903d9f870d309ddfea01c
3
+ size 14578634