Blessmore committed on
Commit
f851013
·
verified ·
1 Parent(s): f28807d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +120 -3
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import streamlit as st
2
- from gensim.models import FastText
3
  import re
4
  from gensim.utils import simple_preprocess
5
  import time
@@ -9,6 +9,8 @@ import io
9
  import tempfile
10
  import numpy as np
11
  from concurrent.futures import ThreadPoolExecutor
 
 
12
 
13
  # Function to preprocess text
14
  def preprocess_text(text):
@@ -65,14 +67,61 @@ def clean_text_multithreaded(text):
65
  cleaned_chunks = list(executor.map(clean_text_chunk, chunks))
66
  return '\n'.join(cleaned_chunks)
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  # Streamlit app
69
  def main():
70
  st.title("Text Processing and FastText Word Embedding Trainer")
71
 
72
  # Sidebar options
73
  st.sidebar.title("Options")
74
- option = st.sidebar.radio("Select an option", ("Clean Dataset", "Train Word Embedding"))
75
-
76
  if option == "Clean Dataset":
77
  st.header("Clean Text Dataset")
78
 
@@ -153,6 +202,74 @@ def main():
153
  except Exception as e:
154
  st.error(f"An error occurred: {str(e)}")
155
  st.error("Check the server logs for more details.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
  if __name__ == "__main__":
158
  main()
 
1
  import streamlit as st
2
+ from gensim.models import FastText, KeyedVectors
3
  import re
4
  from gensim.utils import simple_preprocess
5
  import time
 
9
  import tempfile
10
  import numpy as np
11
  from concurrent.futures import ThreadPoolExecutor
12
+ from huggingface_hub import hf_hub_download
13
+ from sklearn.metrics.pairwise import cosine_similarity
14
 
15
  # Function to preprocess text
16
  def preprocess_text(text):
 
67
  cleaned_chunks = list(executor.map(clean_text_chunk, chunks))
68
  return '\n'.join(cleaned_chunks)
69
 
70
+ # Function to load the FastText model from Hugging Face
71
+ @st.cache_resource
72
+ def load_fasttext_model(model_dir):
73
+ model_path = os.path.join(model_dir, "fasttext_model.model")
74
+ vectors_path = os.path.join(model_dir, "fasttext_model_vectors.kv")
75
+ vectors_ngrams_path = os.path.join(model_dir, "fasttext_model.model.wv.vectors_ngrams.npy")
76
+
77
+ model = FastText.load(model_path)
78
+ model.wv = KeyedVectors.load(vectors_path, mmap='r')
79
+ model.wv.vectors_ngrams = np.load(vectors_ngrams_path, mmap_mode='r')
80
+
81
+ return model
82
+
83
+
84
# Function to generate embeddings for a given word
def generate_word_embedding(word, model):
    """Return the L2-normalised embedding for *word*, or None if it is OOV."""
    if word not in model.wv:
        return None
    return model.wv.get_vector(word, norm=True)
87
+
88
# Function to find similar words
def find_similar_words(word, model, topn=5):
    """Return up to *topn* (word, similarity) pairs for *word*; [] if OOV."""
    if word in model.wv:
        return model.wv.most_similar(word, topn=topn)
    return []
91
+
92
# Function to tokenize a sentence using the given pattern
def tokenize_sentence(sentence, pattern):
    """Return the non-empty, whitespace-stripped matches of *pattern* in *sentence*."""
    result = []
    for match in re.findall(pattern, sentence):
        cleaned = match.strip()
        if cleaned:
            result.append(cleaned)
    return result
96
+
97
# Function to generate embeddings for words in a sentence
def generate_embeddings_for_sentence(sentence, model, pattern):
    """Return the embedding of every in-vocabulary token of *sentence*.

    Tokens are produced by tokenize_sentence; out-of-vocabulary tokens are
    silently skipped, so the result may be shorter than the token list.
    """
    return [model.wv[token]
            for token in tokenize_sentence(sentence, pattern)
            if token in model.wv]
105
+
106
# Function to generate embedding for a sentence
def generate_sentence_embedding(sentence, model, pattern):
    """Mean-pool the word embeddings of *sentence*.

    Returns None when no token of the sentence is in the model vocabulary.
    """
    word_embeddings = generate_embeddings_for_sentence(sentence, model, pattern)
    return np.mean(word_embeddings, axis=0) if word_embeddings else None
112
+
113
# Function to generate embeddings for sentences
def generate_sentence_embeddings(sentences, model, pattern):
    """Embed each sentence in *sentences*; entries are None where no token was in-vocab."""
    embeddings = []
    for sentence in sentences:
        embeddings.append(generate_sentence_embedding(sentence, model, pattern))
    return embeddings
116
+
117
  # Streamlit app
118
  def main():
119
  st.title("Text Processing and FastText Word Embedding Trainer")
120
 
121
  # Sidebar options
122
  st.sidebar.title("Options")
123
+ option = st.sidebar.radio("Select an option", ("Clean Dataset", "Train Word Embedding", "Generate Embeddings"))
124
+
125
  if option == "Clean Dataset":
126
  st.header("Clean Text Dataset")
127
 
 
202
  except Exception as e:
203
  st.error(f"An error occurred: {str(e)}")
204
  st.error("Check the server logs for more details.")
205
+
206
+ elif option == "Generate Embeddings":
207
+ st.header("Generate Embeddings with Pretrained FastText Model")
208
+
209
+ repo_id = "Blessmore/Fasttext_embeddings/Fast_text_50_dim"
210
+ model_file = "fasttext_model.model"
211
+ vectors_file = "fasttext_model_vectors.kv"
212
+ vectors_ngrams_file = "fasttext_model.model.wv.vectors_ngrams.npy"
213
+
214
+ model = load_fasttext_model(repo_id, model_file, vectors_file, vectors_ngrams_file)
215
+
216
+ st.subheader("Generate Word Embedding")
217
+ word = st.text_input("Enter a word:")
218
+ if word:
219
+ embedding = generate_word_embedding(word, model)
220
+ if embedding is not None:
221
+ st.write(f"Embedding for '{word}':", embedding)
222
+ else:
223
+ st.write(f"'{word}' not in vocabulary")
224
+
225
+ st.subheader("Find Similar Words")
226
+ word_for_similar = st.text_input("Enter a word to find similar words:")
227
+ if word_for_similar:
228
+ similar_words = find_similar_words(word_for_similar, model)
229
+ if similar_words:
230
+ st.write("Similar words:")
231
+ for word, similarity in similar_words:
232
+ st.write(f"{word}: {similarity}")
233
+ else:
234
+ st.write(f"No similar words found for '{word_for_similar}'")
235
+
236
+ st.subheader("Generate Embeddings for Words in a Sentence")
237
+ sentence = st.text_input("Enter a sentence:")
238
+ if sentence:
239
+ word_embeddings = generate_embeddings_for_sentence(sentence, model, r'\b\w+\b')
240
+ if word_embeddings:
241
+ for idx, embedding in enumerate(word_embeddings):
242
+ st.write(f"Word {idx+1} embedding:", embedding)
243
+ else:
244
+ st.write("No embeddings could be generated for the words in the sentence.")
245
+
246
+ st.subheader("Generate Embedding for a Sentence")
247
+ sentence_for_embedding = st.text_input("Enter a sentence to generate its embedding:")
248
+ if sentence_for_embedding:
249
+ sentence_embedding = generate_sentence_embedding(sentence_for_embedding, model, r'\b\w+\b')
250
+ if sentence_embedding is not None:
251
+ st.write("Sentence embedding:", sentence_embedding)
252
+ else:
253
+ st.write("No embedding could be generated for the sentence.")
254
+
255
+ st.subheader("Find Most Similar Sentence Pairs")
256
+ uploaded_sentences_file = st.file_uploader("Upload a text file with sentences (one per line)", type=["txt"])
257
+ if uploaded_sentences_file:
258
+ sentences = uploaded_sentences_file.read().decode('utf-8').splitlines()
259
+ sentence_embeddings = generate_sentence_embeddings(sentences, model, r'\b\w+\b')
260
+ sentence_pairs = []
261
+ for i in range(len(sentences)):
262
+ for j in range(i + 1, len(sentences)):
263
+ if sentence_embeddings[i] is not None and sentence_embeddings[j] is not None:
264
+ similarity = cosine_similarity([sentence_embeddings[i]], [sentence_embeddings[j]])[0][0]
265
+ sentence_pairs.append((sentences[i], sentences[j], similarity))
266
+ sentence_pairs = sorted(sentence_pairs, key=lambda x: x[2], reverse=True)
267
+ st.write("Most similar sentence pairs:")
268
+ for sent1, sent2, sim in sentence_pairs[:5]:
269
+ st.write(f"Sentence 1: {sent1}")
270
+ st.write(f"Sentence 2: {sent2}")
271
+ st.write(f"Similarity: {sim}")
272
+ st.write("-----")
273
 
274
  if __name__ == "__main__":
275
  main()