Spaces: Build error
Update app.py
app.py CHANGED
@@ -1,5 +1,5 @@
 import streamlit as st
-from gensim.models import FastText
+from gensim.models import FastText, KeyedVectors
 import re
 from gensim.utils import simple_preprocess
 import time
@@ -9,6 +9,8 @@ import io
 import tempfile
 import numpy as np
 from concurrent.futures import ThreadPoolExecutor
+from huggingface_hub import hf_hub_download
+from sklearn.metrics.pairwise import cosine_similarity

 # Function to preprocess text
 def preprocess_text(text):
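The two imports added here are used later in the diff: hf_hub_download fetches the pretrained model files from the Hub, and cosine_similarity scores sentence pairs. As a minimal sketch of the download call (the repo id, subfolder, and filename mirror the values used further down; note that Hub repo ids take the namespace/repo form, with any extra path segment passed as subfolder):

import os
from huggingface_hub import hf_hub_download

# Downloads the file once, caches it under the local Hugging Face cache,
# and returns the path to the cached copy; repeated calls reuse the cache.
local_path = hf_hub_download(
    repo_id="Blessmore/Fasttext_embeddings",
    filename="fasttext_model.model",
    subfolder="Fast_text_50_dim",
)
print(local_path)
# Files from the same repo revision land in one snapshot directory, so the
# sidecar .npy file downloaded alongside the model sits next to it:
print(os.path.dirname(local_path))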
@@ -65,14 +67,61 @@ def clean_text_multithreaded(text):
         cleaned_chunks = list(executor.map(clean_text_chunk, chunks))
     return '\n'.join(cleaned_chunks)

+# Function to load the FastText model from the Hugging Face Hub
+@st.cache_resource
+def load_fasttext_model(repo_id, subfolder, model_file, vectors_file, vectors_ngrams_file):
+    model_path = hf_hub_download(repo_id=repo_id, filename=model_file, subfolder=subfolder)
+    vectors_path = hf_hub_download(repo_id=repo_id, filename=vectors_file, subfolder=subfolder)
+    vectors_ngrams_path = hf_hub_download(repo_id=repo_id, filename=vectors_ngrams_file, subfolder=subfolder)
+
+    model = FastText.load(model_path)
+    model.wv = KeyedVectors.load(vectors_path, mmap='r')
+    model.wv.vectors_ngrams = np.load(vectors_ngrams_path, mmap_mode='r')
+
+    return model
+
+
+# Function to generate the embedding for a given word
+def generate_word_embedding(word, model):
+    return model.wv.get_vector(word, norm=True) if word in model.wv else None
+
+# Function to find similar words
+def find_similar_words(word, model, topn=5):
+    return model.wv.most_similar(word, topn=topn) if word in model.wv else []
+
+# Function to tokenize a sentence using the given pattern
+def tokenize_sentence(sentence, pattern):
+    tokens = re.findall(pattern, sentence)
+    return [token.strip() for token in tokens if token.strip()]
+
+# Function to generate embeddings for the words in a sentence
+def generate_embeddings_for_sentence(sentence, model, pattern):
+    tokens = tokenize_sentence(sentence, pattern)
+    embeddings = []
+    for token in tokens:
+        if token in model.wv:
+            embeddings.append(model.wv[token])
+    return embeddings
+
+# Function to generate the embedding for a sentence (mean of its word vectors)
+def generate_sentence_embedding(sentence, model, pattern):
+    word_embeddings = generate_embeddings_for_sentence(sentence, model, pattern)
+    if not word_embeddings:
+        return None
+    return np.mean(word_embeddings, axis=0)
+
+# Function to generate embeddings for a list of sentences
+def generate_sentence_embeddings(sentences, model, pattern):
+    return [generate_sentence_embedding(sentence, model, pattern) for sentence in sentences]
+
 # Streamlit app
 def main():
     st.title("Text Processing and FastText Word Embedding Trainer")

     # Sidebar options
     st.sidebar.title("Options")
-    option = st.sidebar.radio("Select an option", ("Clean Dataset", "Train Word Embedding"))
-
+    option = st.sidebar.radio("Select an option", ("Clean Dataset", "Train Word Embedding", "Generate Embeddings"))
+
     if option == "Clean Dataset":
         st.header("Clean Text Dataset")

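For reference, a self-contained sketch of the pipeline these helpers implement: tokenize, keep in-vocabulary words, and mean-pool word vectors into a sentence vector. The toy corpus, vector_size, and bucket values are illustrative assumptions, not values taken from the app:

import re
import numpy as np
from gensim.models import FastText

# Tiny illustrative corpus; the app instead loads a pretrained 50-dim model.
corpus = [["the", "cat", "sat"], ["the", "dog", "ran"], ["cats", "and", "dogs"]]
model = FastText(corpus, vector_size=50, min_count=1, bucket=2000)

def sentence_embedding(sentence, model, pattern=r'\b\w+\b'):
    # Mirrors generate_sentence_embedding above: tokenize, filter to known
    # words, then average the word vectors.
    tokens = [t for t in re.findall(pattern, sentence) if t.strip()]
    vectors = [model.wv[t] for t in tokens if t in model.wv]
    return np.mean(vectors, axis=0) if vectors else None

print(model.wv.most_similar("cat", topn=3))       # nearest neighbours
print(sentence_embedding("the cat ran", model))   # one 50-dim vector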
@@ -153,6 +202,74 @@ def main():
         except Exception as e:
             st.error(f"An error occurred: {str(e)}")
             st.error("Check the server logs for more details.")
+
+    elif option == "Generate Embeddings":
+        st.header("Generate Embeddings with Pretrained FastText Model")
+
+        repo_id = "Blessmore/Fasttext_embeddings"
+        subfolder = "Fast_text_50_dim"
+        model_file = "fasttext_model.model"
+        vectors_file = "fasttext_model_vectors.kv"
+        vectors_ngrams_file = "fasttext_model.model.wv.vectors_ngrams.npy"
+        model = load_fasttext_model(repo_id, subfolder, model_file, vectors_file, vectors_ngrams_file)
+
+        st.subheader("Generate Word Embedding")
+        word = st.text_input("Enter a word:")
+        if word:
+            embedding = generate_word_embedding(word, model)
+            if embedding is not None:
+                st.write(f"Embedding for '{word}':", embedding)
+            else:
+                st.write(f"'{word}' not in vocabulary")
+
+        st.subheader("Find Similar Words")
+        word_for_similar = st.text_input("Enter a word to find similar words:")
+        if word_for_similar:
+            similar_words = find_similar_words(word_for_similar, model)
+            if similar_words:
+                st.write("Similar words:")
+                for word, similarity in similar_words:
+                    st.write(f"{word}: {similarity}")
+            else:
+                st.write(f"No similar words found for '{word_for_similar}'")
+
+        st.subheader("Generate Embeddings for Words in a Sentence")
+        sentence = st.text_input("Enter a sentence:")
+        if sentence:
+            word_embeddings = generate_embeddings_for_sentence(sentence, model, r'\b\w+\b')
+            if word_embeddings:
+                for idx, embedding in enumerate(word_embeddings):
+                    st.write(f"Word {idx+1} embedding:", embedding)
+            else:
+                st.write("No embeddings could be generated for the words in the sentence.")
+
+        st.subheader("Generate Embedding for a Sentence")
+        sentence_for_embedding = st.text_input("Enter a sentence to generate its embedding:")
+        if sentence_for_embedding:
+            sentence_embedding = generate_sentence_embedding(sentence_for_embedding, model, r'\b\w+\b')
+            if sentence_embedding is not None:
+                st.write("Sentence embedding:", sentence_embedding)
+            else:
+                st.write("No embedding could be generated for the sentence.")
+
+        st.subheader("Find Most Similar Sentence Pairs")
+        uploaded_sentences_file = st.file_uploader("Upload a text file with sentences (one per line)", type=["txt"])
+        if uploaded_sentences_file:
+            sentences = uploaded_sentences_file.read().decode('utf-8').splitlines()
+            sentence_embeddings = generate_sentence_embeddings(sentences, model, r'\b\w+\b')
+            sentence_pairs = []
+            for i in range(len(sentences)):
+                for j in range(i + 1, len(sentences)):
+                    if sentence_embeddings[i] is not None and sentence_embeddings[j] is not None:
+                        similarity = cosine_similarity([sentence_embeddings[i]], [sentence_embeddings[j]])[0][0]
+                        sentence_pairs.append((sentences[i], sentences[j], similarity))
+            sentence_pairs = sorted(sentence_pairs, key=lambda x: x[2], reverse=True)
+            st.write("Most similar sentence pairs:")
+            for sent1, sent2, sim in sentence_pairs[:5]:
+                st.write(f"Sentence 1: {sent1}")
+                st.write(f"Sentence 2: {sent2}")
+                st.write(f"Similarity: {sim}")
+                st.write("-----")

 if __name__ == "__main__":
     main()
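The nested loop in the "Find Most Similar Sentence Pairs" section above calls cosine_similarity once per sentence pair. A hedged alternative sketch that stacks all embeddings and scores every pair in a single call; the sentences and random vectors below are stand-ins for the app's real data:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Stand-ins for the app's `sentences` and `sentence_embeddings`.
sentences = ["the cat sat", "a cat sat down", "stock prices fell"]
embeddings = [np.random.rand(50) for _ in sentences]

# Keep only sentences that produced an embedding, then score all pairs at once.
valid = [(s, e) for s, e in zip(sentences, embeddings) if e is not None]
texts = [s for s, _ in valid]
matrix = cosine_similarity(np.vstack([e for _, e in valid]))  # n x n in one call

pairs = sorted(((texts[i], texts[j], matrix[i, j])
                for i in range(len(texts)) for j in range(i + 1, len(texts))),
               key=lambda p: p[2], reverse=True)
for sent1, sent2, sim in pairs[:5]:
    print(f"{sent1} <-> {sent2}: {sim:.3f}")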
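Finally, the loader assumes a three-file layout in the Hub repo. A speculative sketch of how that layout could be produced with gensim; the corpus is illustrative, and whether vectors_ngrams is written to a sidecar .npy depends on gensim's save threshold (the bucket size here is picked so that it is):

from gensim.models import FastText

# Illustrative training run; the real model was presumably trained elsewhere.
corpus = [["hello", "world"], ["fasttext", "embeddings", "demo"]]
model = FastText(corpus, vector_size=50, min_count=1, bucket=100_000)

# save() writes fasttext_model.model plus sidecar .npy files (such as
# fasttext_model.model.wv.vectors_ngrams.npy) for arrays above gensim's
# separate-storage size limit.
model.save("fasttext_model.model")

# Saving the KeyedVectors separately produces the fasttext_model_vectors.kv
# file that the app later reloads with mmap='r'.
model.wv.save("fasttext_model_vectors.kv")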