Spaces:
Sleeping
Sleeping
Commit
·
4634560
1
Parent(s):
d40f814
new update
Browse files
app.py
CHANGED
|
@@ -18,38 +18,26 @@ lemmatizer = WordNetLemmatizer()
|
|
| 18 |
|
| 19 |
# Function to preprocess input text
|
| 20 |
def preprocess_text(input_text, word2vec_model):
|
| 21 |
-
# Convert to lowercase
|
| 22 |
input_text = input_text.lower()
|
| 23 |
-
|
| 24 |
-
# Tokenize words
|
| 25 |
tokens = input_text.split()
|
| 26 |
-
|
| 27 |
-
# Remove stop words
|
| 28 |
tokens = [token for token in tokens if token not in stop_words]
|
| 29 |
-
|
| 30 |
-
# Lemmatize tokens
|
| 31 |
tokens = [lemmatizer.lemmatize(token, pos='v') for token in tokens]
|
| 32 |
-
|
| 33 |
-
# Generate Word2Vec embeddings for tokens
|
| 34 |
embeddings = []
|
|
|
|
| 35 |
for token in tokens:
|
| 36 |
if token in word2vec_model.wv:
|
| 37 |
embeddings.append(word2vec_model.wv[token])
|
| 38 |
else:
|
| 39 |
-
embeddings.append(np.zeros(word2vec_model.vector_size))
|
| 40 |
-
|
| 41 |
-
# Pad or truncate embeddings to match model's time_steps
|
| 42 |
-
time_steps = lstm_model.input_shape[1] # Dynamically get time_steps from model
|
| 43 |
-
vector_size = word2vec_model.vector_size
|
| 44 |
|
| 45 |
-
|
| 46 |
-
|
|
|
|
| 47 |
else:
|
| 48 |
-
padding = [np.zeros(vector_size)] * (
|
| 49 |
embeddings.extend(padding)
|
| 50 |
|
| 51 |
-
|
| 52 |
-
input_features = np.array(embeddings).reshape((1, time_steps, vector_size))
|
| 53 |
return input_features
|
| 54 |
|
| 55 |
# Load Word2Vec model
|
|
|
|
| 18 |
|
| 19 |
# Function to preprocess input text
|
| 20 |
def preprocess_text(input_text, word2vec_model, max_timesteps=100):
    """Convert raw text into a fixed-size Word2Vec feature tensor.

    Pipeline: lowercase, whitespace-tokenize, drop stop words, lemmatize
    each token as a verb, then look up each token's Word2Vec vector
    (out-of-vocabulary tokens become zero vectors). The sequence is
    truncated or zero-padded to exactly ``max_timesteps`` rows.

    Parameters
    ----------
    input_text : str
        Raw user-supplied text.
    word2vec_model :
        Trained gensim Word2Vec model; only ``.wv`` lookups and
        ``.vector_size`` are used.
    max_timesteps : int, optional
        Fixed sequence length expected by the downstream model
        (default 100, preserving the original hard-coded behavior).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, max_timesteps, vector_size)`` — a single
        batch item ready for an LSTM input layer.
    """
    vector_size = word2vec_model.vector_size

    # Normalize and tokenize on whitespace.
    tokens = input_text.lower().split()

    # Remove stop words, then lemmatize as verbs.
    # NOTE(review): pos='v' should match the training-time preprocessing —
    # confirm against the pipeline that built the Word2Vec/LSTM models.
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [lemmatizer.lemmatize(token, pos='v') for token in tokens]

    # Embed each token; unknown tokens map to zero vectors so sequence
    # positions stay aligned with the original text.
    embeddings = [
        word2vec_model.wv[token] if token in word2vec_model.wv
        else np.zeros(vector_size)
        for token in tokens
    ]

    # Truncate long sequences; zero-pad short (or empty) ones to the
    # fixed length the model expects.
    if len(embeddings) > max_timesteps:
        embeddings = embeddings[:max_timesteps]
    else:
        embeddings.extend(
            [np.zeros(vector_size)] * (max_timesteps - len(embeddings))
        )

    # Add the batch dimension: (1, time_steps, features).
    input_features = np.array(embeddings).reshape((1, max_timesteps, vector_size))
    return input_features
|
| 42 |
|
| 43 |
# Load Word2Vec model
|