# app.py — Streamlit app for Shona text cleaning, FastText embedding training/inference,
# and Shona<->English translation (MarianMT).
import streamlit as st
from gensim.models import FastText, KeyedVectors
import re
from gensim.utils import simple_preprocess
import time
import os
import zipfile
import io
import tempfile
import numpy as np
from concurrent.futures import ThreadPoolExecutor
from sklearn.metrics.pairwise import cosine_similarity
from transformers import MarianTokenizer, MarianMTModel
def preprocess_text(text):
    """Lowercase *text*, strip punctuation, and tokenize with gensim's simple_preprocess."""
    lowered = text.lower()
    depunctuated = re.sub(r'[^\w\s]', '', lowered)  # keep word chars and whitespace only
    return simple_preprocess(depunctuated)
def read_corpus(file):
    """Yield one preprocessed token list per UTF-8 line of the uploaded *file*."""
    for raw_line in file:
        yield preprocess_text(raw_line.decode('utf-8'))
def zip_model(model):
    """Serialize a trained FastText *model* into an in-memory zip archive.

    Saves the full model, its KeyedVectors, and the n-gram vector arrays into
    a temporary directory, zips everything, and returns a BytesIO positioned
    at the start, ready for a Streamlit download button.
    """
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, 'w', zipfile.ZIP_DEFLATED) as archive:
        with tempfile.TemporaryDirectory() as workdir:
            model.save(os.path.join(workdir, "fasttext_model.model"))
            model.wv.save(os.path.join(workdir, "fasttext_model_vectors.kv"))
            # Persist the n-gram vectors alongside both saved artifacts.
            np.save(os.path.join(workdir, "fasttext_model.model.wv.vectors_ngrams.npy"), model.wv.vectors_ngrams)
            np.save(os.path.join(workdir, "fasttext_model_vectors.kv.vectors_ngrams.npy"), model.wv.vectors_ngrams)
            for root, _dirs, filenames in os.walk(workdir):
                for filename in filenames:
                    full_path = os.path.join(root, filename)
                    archive.write(full_path, arcname=os.path.relpath(full_path, start=workdir))
    buffer.seek(0)  # rewind so callers can read the finished archive
    return buffer
def clean_text_chunk(chunk):
    """Clean one chunk of raw text, line by line.

    A line is kept only if it contains at least 5 word tokens. Kept lines are
    lowercased, re-joined with single spaces, and stripped of any remaining
    characters that are not ASCII letters or whitespace (digits, underscores).

    Fix: the word-token regex was evaluated twice per kept line (once in the
    length test, once to build the tokens); tokenize once and reuse the result.

    Args:
        chunk: A block of text, possibly spanning multiple newline-separated lines.

    Returns:
        The cleaned lines joined with newlines; empty string if no line qualifies.
    """
    cleaned_lines = []
    for line in chunk.split('\n'):
        tokens = re.findall(r'\b\w+\b', line.lower())  # tokenize once
        if len(tokens) >= 5:  # drop very short lines
            cleaned_line = re.sub(r'[^a-zA-Z\s]', '', ' '.join(tokens))
            cleaned_lines.append(cleaned_line)
    return '\n'.join(cleaned_lines)
def clean_text_multithreaded(text):
    """Split *text* into blank-line-separated chunks and clean them concurrently.

    Note: chunks are re-joined with single newlines, so the original
    blank-line separation is intentionally collapsed.
    """
    paragraphs = text.split('\n\n')
    with ThreadPoolExecutor() as pool:
        cleaned = list(pool.map(clean_text_chunk, paragraphs))
    return '\n'.join(cleaned)
def load_fasttext_model(model_folder):
    """Load the 50-dimensional Shona FastText model from *model_folder*.

    The word vectors are reloaded from the separately saved KeyedVectors file
    and attached to the model, mirroring how zip_model saves them.
    """
    model = FastText.load(os.path.join(model_folder, "shona_fasttext_50d.model"))
    model.wv = KeyedVectors.load(os.path.join(model_folder, "shona_fasttext_vectors_50d.kv"))
    return model
def generate_word_embedding(word, model):
    """Return the L2-normalized vector for *word*, or None if it is out of vocabulary."""
    if word not in model.wv:
        return None
    return model.wv.get_vector(word, norm=True)
def find_similar_words(word, model, topn=5):
    """Return up to *topn* (word, similarity) neighbours, or [] if *word* is unknown."""
    if word in model.wv:
        return model.wv.most_similar(word, topn=topn)
    return []
def tokenize_sentence(sentence, pattern):
    """Return the non-empty, whitespace-stripped matches of regex *pattern* in *sentence*."""
    stripped = (match.strip() for match in re.findall(pattern, sentence))
    return [token for token in stripped if token]
def generate_embeddings_for_sentence(sentence, model, pattern):
    """Return (token, vector) pairs for each in-vocabulary token of *sentence*."""
    return [
        (token, model.wv[token])
        for token in tokenize_sentence(sentence, pattern)
        if token in model.wv
    ]
def generate_sentence_embedding(sentence, model, pattern):
    """Mean-pool the word vectors of *sentence*; None when no token is in vocabulary."""
    pairs = generate_embeddings_for_sentence(sentence, model, pattern)
    if not pairs:
        return None
    vectors = [vector for _, vector in pairs]
    return np.mean(vectors, axis=0)
def generate_sentence_embeddings(sentences, model, pattern):
    """Embed every sentence; the result aligns with *sentences* and may contain None."""
    embeddings = []
    for sent in sentences:
        embeddings.append(generate_sentence_embedding(sent, model, pattern))
    return embeddings
def load_translation_model(model_folder):
    """Load a MarianMT tokenizer/model pair from a local *model_folder*.

    Fix: ``os.path.join(model_folder)`` with a single argument was a no-op;
    the folder path is used directly.

    Returns:
        (tokenizer, model) tuple ready for translate_text.
    """
    tokenizer = MarianTokenizer.from_pretrained(model_folder)
    model = MarianMTModel.from_pretrained(model_folder)
    return tokenizer, model
def translate_text(text, tokenizer, model):
    """Translate *text* with a MarianMT *model* and return the decoded string."""
    input_ids = tokenizer.encode(text, return_tensors="pt", padding=True)
    output_tokens = model.generate(input_ids, max_length=512)  # cap generated length
    return tokenizer.decode(output_tokens[0], skip_special_tokens=True)
# Pages for Generate Embeddings
def generate_word_embedding_page(model):
    """Streamlit page: show the embedding of a single user-entered word."""
    st.subheader("Generate Word Embedding")
    word = st.text_input("Enter a word:")
    if not word:
        return
    embedding = generate_word_embedding(word, model)
    if embedding is None:
        st.write(f"'{word}' not in vocabulary")
    else:
        st.write(f"Embedding for '{word}':", embedding)
def find_similar_words_page(model):
    """Streamlit page: list the nearest neighbours of a user-entered word."""
    st.subheader("Find Similar Words")
    word_for_similar = st.text_input("Enter a word to find similar words:")
    if not word_for_similar:
        return
    neighbours = find_similar_words(word_for_similar, model)
    if not neighbours:
        st.write(f"No similar words found for '{word_for_similar}'")
        return
    st.write("Similar words:")
    for neighbour, score in neighbours:
        st.write(f"{neighbour}: {score}")
def generate_embeddings_for_sentence_page(model):
    """Streamlit page: show per-word embeddings for a user-entered sentence."""
    st.subheader("Generate Embeddings for Words in a Sentence")
    sentence = st.text_input("Enter a sentence:")
    if not sentence:
        return
    pairs = generate_embeddings_for_sentence(sentence, model, r'\b\w+\b')
    if not pairs:
        st.write("No embeddings could be generated for the words in the sentence.")
        return
    for token, vector in pairs:
        st.write(f"'{token}' embedding:", vector)
def generate_sentence_embedding_page(model):
    """Streamlit page: show the mean-pooled embedding of a user-entered sentence."""
    st.subheader("Generate Embedding for a Sentence")
    sentence_for_embedding = st.text_input("Enter a sentence to generate its embedding:")
    if not sentence_for_embedding:
        return
    sentence_embedding = generate_sentence_embedding(sentence_for_embedding, model, r'\b\w+\b')
    if sentence_embedding is None:
        st.write("No embedding could be generated for the sentence.")
    else:
        st.write("Sentence embedding:", sentence_embedding)
def find_most_similar_sentence_pairs_page(model):
    """Streamlit page: rank all sentence pairs in an uploaded file by similarity.

    Fix: the original called sklearn's ``cosine_similarity`` once per pair —
    O(n^2) separate calls, each allocating fresh arrays. The full similarity
    matrix is now computed in a single batched call over the sentences that
    actually have embeddings; pair values and ordering are unchanged.
    """
    st.subheader("Find Most Similar Sentence Pairs")
    uploaded_sentences_file = st.file_uploader("Upload a text file with sentences (one per line)", type=["txt"])
    if uploaded_sentences_file:
        sentences = uploaded_sentences_file.read().decode('utf-8').splitlines()
        sentence_embeddings = generate_sentence_embeddings(sentences, model, r'\b\w+\b')
        # Keep only sentences that produced an embedding, remembering their indices.
        valid = [(i, emb) for i, emb in enumerate(sentence_embeddings) if emb is not None]
        sentence_pairs = []
        if len(valid) >= 2:
            sim_matrix = cosine_similarity([emb for _, emb in valid])  # one batched call
            for a in range(len(valid)):
                for b in range(a + 1, len(valid)):
                    sentence_pairs.append(
                        (sentences[valid[a][0]], sentences[valid[b][0]], sim_matrix[a][b])
                    )
        sentence_pairs.sort(key=lambda x: x[2], reverse=True)
        st.write("Most similar sentence pairs:")
        for sent1, sent2, sim in sentence_pairs[:5]:
            st.write(f"Sentence 1: {sent1}")
            st.write(f"Sentence 2: {sent2}")
            st.write(f"Similarity: {sim}")
            st.write("-----")
# Page to search similar information from the document
def search_similar_information_page(model):
    """Streamlit page: rank non-blank document lines by similarity to a query."""
    st.subheader("Search Similar Information from Document")
    uploaded_file = st.file_uploader("Upload a text file", type=["txt"])
    if not uploaded_file:
        return
    document_text = uploaded_file.read().decode('utf-8').splitlines()
    document_sentences = [line for line in document_text if line.strip()]
    if not document_sentences:
        return
    search_query = st.text_input("Enter search query:")
    if not search_query:
        return
    query_embedding = generate_sentence_embedding(search_query, model, r'\b\w+\b')
    if query_embedding is None:
        st.write("No embedding could be generated for the search query.")
        return
    document_embeddings = generate_sentence_embeddings(document_sentences, model, r'\b\w+\b')
    # Score each embeddable line against the query.
    scored = [
        (sentence, cosine_similarity([query_embedding], [embedding])[0][0])
        for sentence, embedding in zip(document_sentences, document_embeddings)
        if embedding is not None
    ]
    scored.sort(key=lambda item: item[1], reverse=True)
    st.write("Most similar sentences in the document:")
    for sentence, sim in scored[:5]:
        st.write(f"Sentence: {sentence}")
        st.write(f"Similarity: {sim}")
        st.write("-----")
# Page for translation
def translate_text_page(tokenizer_shona_to_english, model_shona_to_english, tokenizer_english_to_shona, model_english_to_shona):
    """Streamlit page: translate text between Shona and English in either direction."""
    st.subheader("Translate Text")
    translation_direction = st.radio("Select translation direction", ("Shona to English", "English to Shona"))
    # Pick the prompt and model pair for the chosen direction.
    if translation_direction == "Shona to English":
        prompt = "Enter Shona text to translate:"
        active_tokenizer, active_model = tokenizer_shona_to_english, model_shona_to_english
    else:
        prompt = "Enter English text to translate:"
        active_tokenizer, active_model = tokenizer_english_to_shona, model_english_to_shona
    text_to_translate = st.text_area(prompt)
    if text_to_translate:
        translated_text = translate_text(text_to_translate, active_tokenizer, active_model)
        st.write("Translated text:")
        st.write(translated_text)
# Streamlit app
def main():
    """Entry point: loads models into session state and routes sidebar options to pages."""
    st.title("Text Processing and FastText Word Embedding Trainer")
    # Load models if not already loaded
    # Models are cached in st.session_state so Streamlit reruns don't reload them.
    if 'fasttext_model' not in st.session_state:
        st.session_state['fasttext_model'] = load_fasttext_model("Fast_text_50_dim")
    if 'translation_model_shona_to_english' not in st.session_state:
        st.session_state['tokenizer_shona_to_english'], st.session_state['translation_model_shona_to_english'] = load_translation_model("fine_tuned_shona_to_english_model")
    if 'translation_model_english_to_shona' not in st.session_state:
        st.session_state['tokenizer_english_to_shona'], st.session_state['translation_model_english_to_shona'] = load_translation_model("english_shona_model")
    fasttext_model = st.session_state['fasttext_model']
    tokenizer_shona_to_english = st.session_state['tokenizer_shona_to_english']
    translation_model_shona_to_english = st.session_state['translation_model_shona_to_english']
    tokenizer_english_to_shona = st.session_state['tokenizer_english_to_shona']
    translation_model_english_to_shona = st.session_state['translation_model_english_to_shona']
    # Sidebar navigation between the four top-level features.
    st.sidebar.title("Options")
    option = st.sidebar.radio("Select an option", ("Clean Dataset", "Train Word Embedding", "Generate Embeddings", "Translate Text"))
    if option == "Clean Dataset":
        # Upload raw text, clean it with the multithreaded cleaner, offer download.
        st.header("Clean Text Dataset")
        uploaded_file = st.file_uploader("Upload Raw Text File", type=["txt"])
        if uploaded_file is not None:
            raw_text = uploaded_file.read().decode('utf-8')
            if st.button("Clean Dataset"):
                try:
                    cleaned_text = clean_text_multithreaded(raw_text)
                    st.write("Dataset cleaned successfully!")
                    cleaned_file = io.BytesIO(cleaned_text.encode('utf-8'))
                    st.download_button(label="Download Cleaned Dataset", data=cleaned_file, file_name="cleaned_dataset.txt", mime="text/plain")
                except Exception as e:
                    st.error(f"An error occurred: {str(e)}")
                    st.error("Check the server logs for more details.")
    elif option == "Train Word Embedding":
        # Train a FastText model on an uploaded (pre-cleaned) corpus and offer
        # the serialized model as a zip download.
        st.header("Train FastText Word Embedding")
        uploaded_file = st.file_uploader("Upload Cleaned Text File", type=["txt"])
        if uploaded_file is not None:
            vector_size = st.number_input("Select Embedding Dimensions", min_value=10, max_value=500, value=50, step=10)
            if st.button("Train FastText Model"):
                try:
                    sentences = list(read_corpus(uploaded_file))
                    start_time = time.time()
                    # Skip-gram (sg=1), 100 epochs, char n-grams of length 3-6.
                    model = FastText(sentences, vector_size=vector_size, window=7, min_count=5, workers=4, sg=1, epochs=100, bucket=2000000, min_n=3, max_n=6)
                    end_time = time.time()
                    elapsed_time = end_time - start_time
                    st.write("Time taken: {:.2f} minutes".format(elapsed_time / 60))
                    st.write("Model trained successfully!")
                    zip_buffer = zip_model(model)
                    st.download_button(label="Download Model", data=zip_buffer, file_name="fasttext_model.zip", mime="application/zip")
                except Exception as e:
                    st.error(f"An error occurred: {str(e)}")
                    st.error("Check the server logs for more details.")
    elif option == "Generate Embeddings":
        # Secondary sidebar selector dispatches to one of the embedding pages.
        st.header("Generate Embeddings with Pretrained FastText Model")
        embedding_option = st.sidebar.selectbox("Select an embedding operation", ("Generate Word Embedding", "Find Similar Words", "Generate Embeddings for Words in a Sentence", "Generate Embedding for a Sentence", "Find Most Similar Sentence Pairs", "Search Similar Information"))
        if embedding_option == "Generate Word Embedding":
            generate_word_embedding_page(fasttext_model)
        elif embedding_option == "Find Similar Words":
            find_similar_words_page(fasttext_model)
        elif embedding_option == "Generate Embeddings for Words in a Sentence":
            generate_embeddings_for_sentence_page(fasttext_model)
        elif embedding_option == "Generate Embedding for a Sentence":
            generate_sentence_embedding_page(fasttext_model)
        elif embedding_option == "Find Most Similar Sentence Pairs":
            find_most_similar_sentence_pairs_page(fasttext_model)
        elif embedding_option == "Search Similar Information":
            search_similar_information_page(fasttext_model)
    elif option == "Translate Text":
        st.header("Translate Text")
        translate_text_page(tokenizer_shona_to_english, translation_model_shona_to_english, tokenizer_english_to_shona, translation_model_english_to_shona)
# Run the app when executed directly (Streamlit invokes this module as a script).
if __name__ == "__main__":
    main()