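"""
Streamlit app for working with Shona text: cleaning raw corpora, training FastText
word embeddings with gensim, exploring word and sentence embeddings, and translating
between Shona and English with fine-tuned MarianMT models.

Run with (assuming this file is saved as app.py):
    streamlit run app.py
"""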
import streamlit as st
from gensim.models import FastText, KeyedVectors
import re
from gensim.utils import simple_preprocess
import time
import os
import zipfile
import io
import tempfile
import numpy as np
from concurrent.futures import ThreadPoolExecutor
from sklearn.metrics.pairwise import cosine_similarity
from transformers import MarianTokenizer, MarianMTModel

# Function to preprocess text
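# Illustrative example: preprocess_text("Mhoro, shamwari!") -> ['mhoro', 'shamwari']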
def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return simple_preprocess(text)

# Function to read and preprocess the corpus from an uploaded file
def read_corpus(file):
    for line in file:
        yield preprocess_text(line.decode('utf-8'))

# Function to zip the model files in memory
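# Note: gensim's save() typically writes large arrays (such as the n-gram bucket
# matrix) to separate .npy sidecar files on its own; the explicit np.save calls
# below duplicate those files defensively so the archive is complete either way.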
def zip_model(model):
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zipf:
        with tempfile.TemporaryDirectory() as temp_dir:
            model.save(os.path.join(temp_dir, "fasttext_model.model"))
            model.wv.save(os.path.join(temp_dir, "fasttext_model_vectors.kv"))
            np.save(os.path.join(temp_dir, "fasttext_model.model.wv.vectors_ngrams.npy"), model.wv.vectors_ngrams)
            np.save(os.path.join(temp_dir, "fasttext_model_vectors.kv.vectors_ngrams.npy"), model.wv.vectors_ngrams)
            for root, dirs, files in os.walk(temp_dir):
                for file in files:
                    file_path = os.path.join(root, file)
                    arcname = os.path.relpath(file_path, start=temp_dir)
                    zipf.write(file_path, arcname=arcname)
    zip_buffer.seek(0)
    return zip_buffer

# Function to clean a chunk of text
def clean_text_chunk(chunk):
    lines = chunk.split('\n')
    cleaned_lines = []
    for line in lines:
        tokens = re.findall(r'\b\w+\b', line.lower())
        if len(tokens) >= 5:  # Keep only lines with at least five words
            cleaned_line = ' '.join(tokens)
            cleaned_line = re.sub(r'[^a-zA-Z\s]', '', cleaned_line)  # Drop digits and other non-letters
            cleaned_lines.append(cleaned_line)
    return '\n'.join(cleaned_lines)

# Function to clean text using multithreading
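# Note: the regex work here is CPU-bound and largely constrained by the GIL, so
# threads give at best a modest speedup; a ProcessPoolExecutor could be swapped in
# for genuinely parallel cleaning of very large files.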
def clean_text_multithreaded(text):
    chunks = text.split('\n\n')
    with ThreadPoolExecutor() as executor:
        cleaned_chunks = list(executor.map(clean_text_chunk, chunks))
    return '\n'.join(cleaned_chunks)

# Function to load the FastText model from the specified folder
def load_fasttext_model(model_folder):
    model_file = os.path.join(model_folder, "shona_fasttext_50d.model")
    vectors_file = os.path.join(model_folder, "shona_fasttext_vectors_50d.kv")
    model = FastText.load(model_file)
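    # FastText.load already restores model.wv from the .model file and its sidecars;
    # reloading the separately saved KeyedVectors below is redundant but harmless.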
    model.wv = KeyedVectors.load(vectors_file)
    return model

# Function to generate embeddings for a given word
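# Note: FastText can synthesize vectors for unseen words from character n-grams,
# so the membership check below may succeed even for words absent from training.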
def generate_word_embedding(word, model):
    return model.wv.get_vector(word, norm=True) if word in model.wv else None

# Function to find similar words
def find_similar_words(word, model, topn=5):
    return model.wv.most_similar(word, topn=topn) if word in model.wv else []

# Function to tokenize a sentence using the given pattern
def tokenize_sentence(sentence, pattern):
    tokens = re.findall(pattern, sentence)
    return [token.strip() for token in tokens if token.strip()]

# Function to generate embeddings for words in a sentence
def generate_embeddings_for_sentence(sentence, model, pattern):
    tokens = tokenize_sentence(sentence, pattern)
    embeddings = []
    for token in tokens:
        if token in model.wv:
            embeddings.append((token, model.wv[token]))
    return embeddings

# Function to generate embedding for a sentence
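# The sentence vector is the unweighted mean of its word vectors: a simple
# bag-of-words representation that ignores word order.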
def generate_sentence_embedding(sentence, model, pattern):
    word_embeddings = generate_embeddings_for_sentence(sentence, model, pattern)
    if not word_embeddings:
        return None
    return np.mean([embedding for _, embedding in word_embeddings], axis=0)

# Function to generate embeddings for sentences
def generate_sentence_embeddings(sentences, model, pattern):
    return [generate_sentence_embedding(sentence, model, pattern) for sentence in sentences]

# Function to load the translation model and tokenizer
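# The folder is expected to contain a Hugging Face checkpoint saved with
# save_pretrained() (config, weights, and tokenizer files).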
def load_translation_model(model_folder):
    tokenizer = MarianTokenizer.from_pretrained(model_folder)
    model = MarianMTModel.from_pretrained(model_folder)
    return tokenizer, model

# Function to perform translation
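# Note: output is capped at max_length=512 tokens, and MarianMT models have a
# similar input limit, so very long texts may need chunking (not handled here).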
def translate_text(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", padding=True)
    translated_tokens = model.generate(**inputs, max_length=512)
    translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
    return translated_text

# Pages for Generate Embeddings
def generate_word_embedding_page(model):
    st.subheader("Generate Word Embedding")
    word = st.text_input("Enter a word:")
    if word:
        embedding = generate_word_embedding(word, model)
        if embedding is not None:
            st.write(f"Embedding for '{word}':", embedding)
        else:
            st.write(f"'{word}' not in vocabulary")

def find_similar_words_page(model):
    st.subheader("Find Similar Words")
    word_for_similar = st.text_input("Enter a word to find similar words:")
    if word_for_similar:
        similar_words = find_similar_words(word_for_similar, model)
        if similar_words:
            st.write("Similar words:")
            for word, similarity in similar_words:
                st.write(f"{word}: {similarity}")
        else:
            st.write(f"No similar words found for '{word_for_similar}'")

def generate_embeddings_for_sentence_page(model):
    st.subheader("Generate Embeddings for Words in a Sentence")
    sentence = st.text_input("Enter a sentence:")
    if sentence:
        word_embeddings = generate_embeddings_for_sentence(sentence, model, r'\b\w+\b')
        if word_embeddings:
            for word, embedding in word_embeddings:
                st.write(f"'{word}' embedding:", embedding)
        else:
            st.write("No embeddings could be generated for the words in the sentence.")

def generate_sentence_embedding_page(model):
    st.subheader("Generate Embedding for a Sentence")
    sentence_for_embedding = st.text_input("Enter a sentence to generate its embedding:")
    if sentence_for_embedding:
        sentence_embedding = generate_sentence_embedding(sentence_for_embedding, model, r'\b\w+\b')
        if sentence_embedding is not None:
            st.write("Sentence embedding:", sentence_embedding)
        else:
            st.write("No embedding could be generated for the sentence.")

def find_most_similar_sentence_pairs_page(model):
    st.subheader("Find Most Similar Sentence Pairs")
    uploaded_sentences_file = st.file_uploader("Upload a text file with sentences (one per line)", type=["txt"])
    if uploaded_sentences_file:
        sentences = uploaded_sentences_file.read().decode('utf-8').splitlines()
        sentence_embeddings = generate_sentence_embeddings(sentences, model, r'\b\w+\b')
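        # The pairwise comparison below is O(n^2) in the number of sentences,
        # which is fine for small files but slow for large ones.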
        sentence_pairs = []
        for i in range(len(sentences)):
            for j in range(i + 1, len(sentences)):
                if sentence_embeddings[i] is not None and sentence_embeddings[j] is not None:
                    similarity = cosine_similarity([sentence_embeddings[i]], [sentence_embeddings[j]])[0][0]
                    sentence_pairs.append((sentences[i], sentences[j], similarity))
        sentence_pairs = sorted(sentence_pairs, key=lambda x: x[2], reverse=True)
        st.write("Most similar sentence pairs:")
        for sent1, sent2, sim in sentence_pairs[:5]:
            st.write(f"Sentence 1: {sent1}")
            st.write(f"Sentence 2: {sent2}")
            st.write(f"Similarity: {sim}")
            st.write("-----")

# Page to search similar information from the document
def search_similar_information_page(model):
    st.subheader("Search Similar Information from Document")
    uploaded_file = st.file_uploader("Upload a text file", type=["txt"])
    if uploaded_file:
        document_text = uploaded_file.read().decode('utf-8').splitlines()
        document_sentences = [line for line in document_text if line.strip()]
        if document_sentences:
            search_query = st.text_input("Enter search query:")
            if search_query:
                query_embedding = generate_sentence_embedding(search_query, model, r'\b\w+\b')
                if query_embedding is not None:
                    document_embeddings = generate_sentence_embeddings(document_sentences, model, r'\b\w+\b')
                    similarities = [
                        (sentence, cosine_similarity([query_embedding], [embedding])[0][0])
                        for sentence, embedding in zip(document_sentences, document_embeddings)
                        if embedding is not None
                    ]
                    similarities = sorted(similarities, key=lambda x: x[1], reverse=True)
                    st.write("Most similar sentences in the document:")
                    for sentence, sim in similarities[:5]:
                        st.write(f"Sentence: {sentence}")
                        st.write(f"Similarity: {sim}")
                        st.write("-----")
                else:
                    st.write("No embedding could be generated for the search query.")

# Page for translation
def translate_text_page(tokenizer_shona_to_english, model_shona_to_english, tokenizer_english_to_shona, model_english_to_shona):
    st.subheader("Translate Text")
    translation_direction = st.radio("Select translation direction", ("Shona to English", "English to Shona"))

    if translation_direction == "Shona to English":
        text_to_translate = st.text_area("Enter Shona text to translate:")
        if text_to_translate:
            translated_text = translate_text(text_to_translate, tokenizer_shona_to_english, model_shona_to_english)
            st.write("Translated text:")
            st.write(translated_text)
    else:
        text_to_translate = st.text_area("Enter English text to translate:")
        if text_to_translate:
            translated_text = translate_text(text_to_translate, tokenizer_english_to_shona, model_english_to_shona)
            st.write("Translated text:")
            st.write(translated_text)

# Streamlit app
def main():
    st.title("Text Processing and FastText Word Embedding Trainer")

    # Load models if not already loaded
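    # st.session_state persists across Streamlit reruns, so each model is loaded
    # once per session; st.cache_resource would be an alternative approach.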
    if 'fasttext_model' not in st.session_state:
        st.session_state['fasttext_model'] = load_fasttext_model("Fast_text_50_dim")
    
    if 'translation_model_shona_to_english' not in st.session_state:
        st.session_state['tokenizer_shona_to_english'], st.session_state['translation_model_shona_to_english'] = load_translation_model("fine_tuned_shona_to_english_model")
    
    if 'translation_model_english_to_shona' not in st.session_state:
        st.session_state['tokenizer_english_to_shona'], st.session_state['translation_model_english_to_shona'] = load_translation_model("english_shona_model")
    
    fasttext_model = st.session_state['fasttext_model']
    tokenizer_shona_to_english = st.session_state['tokenizer_shona_to_english']
    translation_model_shona_to_english = st.session_state['translation_model_shona_to_english']
    tokenizer_english_to_shona = st.session_state['tokenizer_english_to_shona']
    translation_model_english_to_shona = st.session_state['translation_model_english_to_shona']

    st.sidebar.title("Options")
    option = st.sidebar.radio("Select an option", ("Clean Dataset", "Train Word Embedding", "Generate Embeddings", "Translate Text"))

    if option == "Clean Dataset":
        st.header("Clean Text Dataset")
        uploaded_file = st.file_uploader("Upload Raw Text File", type=["txt"])
        if uploaded_file is not None:
            raw_text = uploaded_file.read().decode('utf-8')
            if st.button("Clean Dataset"):
                try:
                    cleaned_text = clean_text_multithreaded(raw_text)
                    st.write("Dataset cleaned successfully!")
                    cleaned_file = io.BytesIO(cleaned_text.encode('utf-8'))
                    st.download_button(label="Download Cleaned Dataset", data=cleaned_file, file_name="cleaned_dataset.txt", mime="text/plain")
                except Exception as e:
                    st.error(f"An error occurred: {str(e)}")
                    st.error("Check the server logs for more details.")

    elif option == "Train Word Embedding":
        st.header("Train FastText Word Embedding")
        uploaded_file = st.file_uploader("Upload Cleaned Text File", type=["txt"])
        if uploaded_file is not None:
            vector_size = st.number_input("Select Embedding Dimensions", min_value=10, max_value=500, value=50, step=10)
            if st.button("Train FastText Model"):
                try:
                    sentences = list(read_corpus(uploaded_file))
                    start_time = time.time()
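                    # sg=1 selects skip-gram; bucket sets the number of hash buckets
                    # for character n-grams; min_n/max_n are the n-gram length range.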
                    model = FastText(sentences, vector_size=vector_size, window=7, min_count=5, workers=4, sg=1, epochs=100, bucket=2000000, min_n=3, max_n=6)
                    end_time = time.time()
                    elapsed_time = end_time - start_time
                    st.write("Time taken: {:.2f} minutes".format(elapsed_time / 60))
                    st.write("Model trained successfully!")
                    zip_buffer = zip_model(model)
                    st.download_button(label="Download Model", data=zip_buffer, file_name="fasttext_model.zip", mime="application/zip")
                except Exception as e:
                    st.error(f"An error occurred: {str(e)}")
                    st.error("Check the server logs for more details.")

    elif option == "Generate Embeddings":
        st.header("Generate Embeddings with Pretrained FastText Model")
        embedding_option = st.sidebar.selectbox("Select an embedding operation", ("Generate Word Embedding", "Find Similar Words", "Generate Embeddings for Words in a Sentence", "Generate Embedding for a Sentence", "Find Most Similar Sentence Pairs", "Search Similar Information"))
        if embedding_option == "Generate Word Embedding":
            generate_word_embedding_page(fasttext_model)
        elif embedding_option == "Find Similar Words":
            find_similar_words_page(fasttext_model)
        elif embedding_option == "Generate Embeddings for Words in a Sentence":
            generate_embeddings_for_sentence_page(fasttext_model)
        elif embedding_option == "Generate Embedding for a Sentence":
            generate_sentence_embedding_page(fasttext_model)
        elif embedding_option == "Find Most Similar Sentence Pairs":
            find_most_similar_sentence_pairs_page(fasttext_model)
        elif embedding_option == "Search Similar Information":
            search_similar_information_page(fasttext_model)

    elif option == "Translate Text":
        st.header("Translate Text")
        translate_text_page(tokenizer_shona_to_english, translation_model_shona_to_english, tokenizer_english_to_shona, translation_model_english_to_shona)

if __name__ == "__main__":
    main()