Blessmore committed on
Commit
9d5856c
·
verified ·
1 Parent(s): afbb14e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -0
app.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from gensim.models import FastText
3
+ import re
4
+ from gensim.utils import simple_preprocess
5
+ import time
6
+ import os
7
+ import zipfile
8
+ import io
9
+ import tempfile
10
+ import numpy as np
11
+
12
+ # Function to preprocess text
13
def preprocess_text(text):
    """Normalize a raw string and tokenize it.

    The input is lowercased, every character that is neither a word
    character nor whitespace is deleted, and the cleaned string is handed
    to gensim's ``simple_preprocess``.

    Args:
        text: Raw input string.

    Returns:
        Whatever ``simple_preprocess`` returns for the cleaned string.
    """
    # Deleting (not replacing with a space) keeps e.g. "don't" as "dont".
    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    return simple_preprocess(without_punctuation)
17
+
18
+ # Function to read and preprocess the corpus from an uploaded file
19
def read_corpus(file):
    """Lazily convert an uploaded file into preprocessed sentences.

    Args:
        file: Iterable yielding one ``bytes`` object per line.

    Yields:
        The result of ``preprocess_text`` for each line, decoded as UTF-8,
        in file order.

    Raises:
        UnicodeDecodeError: While iterating, if a line is not valid UTF-8.
    """
    decoded_lines = (raw_line.decode('utf-8') for raw_line in file)
    yield from map(preprocess_text, decoded_lines)
22
+
23
+ # Function to zip the model files in memory
24
def zip_model(model):
    """Pack a trained model's saved files into an in-memory zip archive.

    The model and its word vectors are saved into a temporary directory,
    the n-gram matrix is additionally written as two ``.npy`` files, and
    every file found in that directory is added to a deflate-compressed
    archive under its path relative to the directory.

    Args:
        model: Trained model exposing ``save``, ``wv.save`` and
            ``wv.vectors_ngrams``.

    Returns:
        An ``io.BytesIO`` holding the finished archive, rewound to offset 0.
    """
    archive_bytes = io.BytesIO()

    # Exit order matters: the temp dir is removed first, then the archive
    # is finalized, so every file is read while it still exists.
    with zipfile.ZipFile(archive_bytes, 'w', zipfile.ZIP_DEFLATED) as archive, \
            tempfile.TemporaryDirectory() as workdir:
        model.save(os.path.join(workdir, "fasttext_model.model"))
        model.wv.save(os.path.join(workdir, "fasttext_model_vectors.kv"))

        # Write the n-gram matrix next to both saved artifacts.
        # NOTE(review): these names look like the side-car array files gensim
        # itself may emit for large arrays -- confirm whether they are needed.
        ngram_matrix_targets = (
            os.path.join(workdir, "fasttext_model.model.wv.vectors_ngrams.npy"),
            os.path.join(workdir, "fasttext_model_vectors.kv.vectors_ngrams.npy"),
        )
        for target in ngram_matrix_targets:
            np.save(target, model.wv.vectors_ngrams)

        # Add everything under the temp dir, keeping paths relative to it.
        for folder, _subdirs, filenames in os.walk(workdir):
            for filename in filenames:
                absolute_path = os.path.join(folder, filename)
                archive.write(
                    absolute_path,
                    arcname=os.path.relpath(absolute_path, start=workdir),
                )

    archive_bytes.seek(0)  # Rewind so callers read from the beginning
    return archive_bytes
47
+
48
+ # Streamlit app
49
def main():
    """Render the Streamlit page that trains and offers a FastText model.

    Flow: show a ``.txt`` uploader; once a file is present, show the
    embedding-dimension input and a train button. When the button is
    pressed, the upload is read and preprocessed, a FastText model is
    trained with fixed hyperparameters (only ``vector_size`` is
    user-selected), the elapsed time is reported, and a download button
    for the zipped model files is rendered.

    Any exception raised while reading, training or zipping is logged with
    its traceback and shown to the user as an error message.

    Returns:
        None.
    """
    # Function-scope import so this block is self-contained.
    import logging

    st.title("FastText Word Embedding Trainer")

    # Upload cleaned text data
    uploaded_file = st.file_uploader("Upload Cleaned Text File", type=["txt"])
    if uploaded_file is None:
        return

    # Select embedding dimensions
    vector_size = st.number_input(
        "Select Embedding Dimensions",
        min_value=10,
        max_value=500,
        value=50,
        step=10,
    )

    # NOTE(review): everything below, including the download button, is only
    # rendered on the run in which the train button reports a click.
    if not st.button("Train FastText Model"):
        return

    try:
        # Rewind first: if the upload object was already consumed by an
        # earlier run, iterating it again would yield an empty corpus.
        uploaded_file.seek(0)
        sentences = list(read_corpus(uploaded_file))

        # Train FastText model; perf_counter is monotonic, so the measured
        # duration cannot be skewed by wall-clock adjustments.
        start_time = time.perf_counter()
        model = FastText(
            sentences,
            vector_size=vector_size,
            window=7,
            min_count=5,
            workers=4,
            sg=1,
            epochs=100,
            bucket=2000000,
            min_n=3,
            max_n=6,
        )
        elapsed_time = time.perf_counter() - start_time

        st.write(f"Time taken: {elapsed_time / 60:.2f} minutes")
        st.write("Model trained successfully!")

        # Zip the model files in memory
        zip_buffer = zip_model(model)

        # Provide download link
        st.download_button(
            label="Download Model",
            data=zip_buffer,
            file_name="fasttext_model.zip",
            mime="application/zip",
        )
    except Exception as e:
        # Top-level UI boundary: record the traceback so the "server logs"
        # hint shown to the user actually points at something.
        logging.getLogger(__name__).exception("FastText training failed")
        st.error(f"An error occurred: {str(e)}")
        st.error("Check the server logs for more details.")
100
+
101
# Entry point: build the page only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()