Update pages/6_Feature_Engineering.py
Browse files
pages/6_Feature_Engineering.py
CHANGED
|
@@ -353,6 +353,25 @@ elif file_type == "Bag of Words(BOW)":
|
|
| 353 |
cv = CountVectorizer(lowercase=True,strip_accents="unicode",analyzer="word",stop_words=stp,token_pattern=r"(?u)\b\w\w+\b")
|
| 354 |
cv.fit(corpus["Review"]) ### learning vocabulary
|
| 355 |
vector = cv.transform(corpus["Review"]) ### it converts into vector form based on cv and vocabulary learned
|
|
|
|
|
|
|
| 356 |
vector.toarray()
|
| 357 |
|
| 358 |
-
''')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 353 |
cv = CountVectorizer(lowercase=True,strip_accents="unicode",analyzer="word",stop_words=stp,token_pattern=r"(?u)\b\w\w+\b")
|
| 354 |
cv.fit(corpus["Review"]) ### learning vocabulary
|
| 355 |
vector = cv.transform(corpus["Review"]) ### it converts into vector form based on cv and vocabulary learned
|
| 356 |
+
cv.get_feature_names_out()
|
| 357 |
+
cv.vocabulary_
|
| 358 |
vector.toarray()
|
| 359 |
|
| 360 |
+
''')
|
| 361 |
+
|
| 362 |
+
st.header("Binary Bag of Words(BBOW)")
|
| 363 |
+
st.markdown('''
|
| 364 |
+
- Extension of Bag of Words(BOW) is Binary Bag of Words(BBOW)
|
| 365 |
+
''')
|
| 366 |
+
|
| 367 |
+
st.markdown("""
|
| 368 |
+
### 🛠️ Steps in Binary Bag of Words(BBOW):
|
| 369 |
+
- Create a vocabulary (set of unique words)
|
| 370 |
+
- Each document is converted into vector form(d-dimension)
|
| 371 |
+
- In bag of words the value is the count, but in binary bag of words it only tells whether the word is present or not
|
| 372 |
+
- So it is much easier to find the distance between vectors (here distance is nothing but the no. of unique words)
|
| 373 |
+
- If the unique words are more --> distance is high
|
| 374 |
+
- Calculation of distance will be much faster than with bag of words
|
| 375 |
+
- distance is total no.of unique words between two documents
|
| 376 |
+
""")
|
| 377 |
+
|