Harika22 committed on
Commit
1a1805f
·
verified ·
1 Parent(s): 79fc281

Update pages/6_Feature_Engineering.py

Browse files
Files changed (1) hide show
  1. pages/6_Feature_Engineering.py +20 -1
pages/6_Feature_Engineering.py CHANGED
@@ -353,6 +353,25 @@ elif file_type == "Bag of Words(BOW)":
353
  cv = CountVectorizer(lowercase=True,strip_accents="unicode",analyzer="word",stop_words=stp,token_pattern=r"(?u)\b\w\w+\b")
354
  cv.fit(corpus["Review"]) ### learning vocabulary
355
  vector = cv.transform(corpus["Review"]) ### it converts into vector form based on cv and vocabulary learned
 
 
356
  vector.toarray()
357
 
358
- ''')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
353
  cv = CountVectorizer(lowercase=True,strip_accents="unicode",analyzer="word",stop_words=stp,token_pattern=r"(?u)\b\w\w+\b")
354
  cv.fit(corpus["Review"]) ### learning vocabulary
355
  vector = cv.transform(corpus["Review"]) ### it converts into vector form based on cv and vocabulary learned
356
+ cv.get_feature_names_out()
357
+ cv.vocabulary_
358
  vector.toarray()
359
 
360
+ ''')
361
+
362
+ st.header("Binary Bag of Words(BBOW)")
363
+ st.markdown('''
364
+ - Binary Bag of Words (BBOW) is an extension of Bag of Words (BOW)
365
+ ''')
366
+
367
+ st.markdown("""
368
+ ### 🛠️ Steps in Binary Bag of Words (BBOW):
369
+ - Create a vocabulary (set of unique words)
370
+ - Each document is converted into a d-dimensional vector
371
+ - In bag of words the value is the count, but in binary bag of words it only tells whether the word is present or not
372
+ - As a result, it is much easier to find the distance between vectors (here the distance is simply the number of unique words)
373
+ - The more unique words there are --> the higher the distance
374
+ - Calculating the distance is much faster than with bag of words
375
+ - The distance is the total number of unique words between the two documents
376
+ """)
377
+