Update app.py
Browse files
app.py
CHANGED
|
@@ -53,7 +53,7 @@ if selected_method == "Bag of Words":
|
|
| 53 |
st.subheader("Bag of Words (BoW)")
|
| 54 |
st.write("""
|
| 55 |
**Definition**: Represents text as a collection of word counts, ignoring grammar and word order.
|
| 56 |
-
|
| 57 |
**Uses**:
|
| 58 |
- Sentiment analysis
|
| 59 |
- Document classification
|
|
@@ -67,10 +67,6 @@ if selected_method == "Bag of Words":
|
|
| 67 |
❌ Ignores word order and context
|
| 68 |
❌ High-dimensionality for large vocabularies
|
| 69 |
""")
|
| 70 |
-
vectorizer = CountVectorizer()
|
| 71 |
-
X_bow = vectorizer.fit_transform(texts)
|
| 72 |
-
st.write("Feature Names:", vectorizer.get_feature_names_out())
|
| 73 |
-
st.write("Bag of Words Representation:", X_bow.toarray())
|
| 74 |
|
| 75 |
elif selected_method == "TF-IDF":
|
| 76 |
st.subheader("Term Frequency-Inverse Document Frequency (TF-IDF)")
|
|
@@ -90,10 +86,6 @@ elif selected_method == "TF-IDF":
|
|
| 90 |
❌ Still ignores word order
|
| 91 |
❌ Does not capture deep semantics
|
| 92 |
""")
|
| 93 |
-
tfidf_vectorizer = TfidfVectorizer()
|
| 94 |
-
X_tfidf = tfidf_vectorizer.fit_transform(texts)
|
| 95 |
-
st.write("Feature Names:", tfidf_vectorizer.get_feature_names_out())
|
| 96 |
-
st.write("TF-IDF Representation:", X_tfidf.toarray())
|
| 97 |
|
| 98 |
elif selected_method == "One-Hot Encoding":
|
| 99 |
st.subheader("One-Hot Encoding")
|
|
@@ -112,10 +104,6 @@ elif selected_method == "One-Hot Encoding":
|
|
| 112 |
❌ Inefficient for large vocabularies
|
| 113 |
❌ No information on word meaning
|
| 114 |
""")
|
| 115 |
-
one_hot_vectorizer = CountVectorizer(binary=True)
|
| 116 |
-
X_one_hot = one_hot_vectorizer.fit_transform(texts)
|
| 117 |
-
st.write("Feature Names:", one_hot_vectorizer.get_feature_names_out())
|
| 118 |
-
st.write("One-Hot Encoding Representation:", X_one_hot.toarray())
|
| 119 |
|
| 120 |
elif selected_method == "Word Embeddings (Word2Vec)":
|
| 121 |
st.subheader("Word Embeddings (Word2Vec)")
|
|
@@ -135,14 +123,6 @@ elif selected_method == "Word Embeddings (Word2Vec)":
|
|
| 135 |
❌ Requires large datasets to train
|
| 136 |
❌ Computationally expensive
|
| 137 |
""")
|
| 138 |
-
model = Word2Vec(sentences=[text.split() for text in texts], vector_size=100, window=5, min_count=1, workers=4)
|
| 139 |
-
word_vectors = model.wv
|
| 140 |
-
word = 'natural'
|
| 141 |
-
if word in word_vectors:
|
| 142 |
-
st.write(f"Word2Vec Representation of '{word}':")
|
| 143 |
-
st.write(word_vectors[word])
|
| 144 |
-
else:
|
| 145 |
-
st.write(f"Word '{word}' not found in the vocabulary.")
|
| 146 |
|
| 147 |
# Footer
|
| 148 |
st.write("---")
|
|
|
|
| 53 |
st.subheader("Bag of Words (BoW)")
|
| 54 |
st.write("""
|
| 55 |
**Definition**: Represents text as a collection of word counts, ignoring grammar and word order.
|
| 56 |
+
|
| 57 |
**Uses**:
|
| 58 |
- Sentiment analysis
|
| 59 |
- Document classification
|
|
|
|
| 67 |
❌ Ignores word order and context
|
| 68 |
❌ High-dimensionality for large vocabularies
|
| 69 |
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
elif selected_method == "TF-IDF":
|
| 72 |
st.subheader("Term Frequency-Inverse Document Frequency (TF-IDF)")
|
|
|
|
| 86 |
❌ Still ignores word order
|
| 87 |
❌ Does not capture deep semantics
|
| 88 |
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
elif selected_method == "One-Hot Encoding":
|
| 91 |
st.subheader("One-Hot Encoding")
|
|
|
|
| 104 |
❌ Inefficient for large vocabularies
|
| 105 |
❌ No information on word meaning
|
| 106 |
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
elif selected_method == "Word Embeddings (Word2Vec)":
|
| 109 |
st.subheader("Word Embeddings (Word2Vec)")
|
|
|
|
| 123 |
❌ Requires large datasets to train
|
| 124 |
❌ Computationally expensive
|
| 125 |
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
# Footer
|
| 128 |
st.write("---")
|