Harika22 commited on
Commit
79fc281
Β·
verified Β·
1 Parent(s): 47fbe17

Update pages/6_Feature_Engineering.py

Browse files
Files changed (1) hide show
  1. pages/6_Feature_Engineering.py +60 -2
pages/6_Feature_Engineering.py CHANGED
@@ -205,8 +205,8 @@ if file_type == "One-Hot Vectorization":
205
 
206
  st.subheader(":blue[Sparsity]")
207
  st.markdown('''
208
- - The vector which is created using one-hhot vectorization gives sparse vector
209
- - Entire data is given to any alogorithm and machine is going to learn fom data and algorithm it is biasd towards zero values as the data is sparse data
210
  - This issue in ML is known as overfitting
211
  - It is solved in Deep learning
212
  ''')
@@ -298,3 +298,61 @@ elif file_type == "Bag of Words(BOW)":
298
  "<p class='content'>Since all three vectors have the same number of dimensions, we can merge them into a tabular format:</p>",
299
  unsafe_allow_html=True,
300
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
206
  st.subheader(":blue[Sparsity]")
207
  st.markdown('''
208
+ - The vector which is created using one-hot vectorization gives a sparse vector
209
+ - Entire data is given to any algorithm and the machine is going to learn from the data, and the algorithm is biased towards zero values as the data is sparse data
210
  - This issue in ML is known as overfitting
211
  - It is solved in Deep learning
212
  ''')
 
298
  "<p class='content'>Since all three vectors have the same number of dimensions, we can merge them into a tabular format:</p>",
299
  unsafe_allow_html=True,
300
  )
301
+
302
+ st.subheader(":red[Advantages]")
303
+ st.markdown('''
304
+ - Bag of Words(BOW) is easy to implement
305
+ - Here we can convert the data into tabular data
306
+ ''')
307
+
308
+ st.subheader(":red[Disadvantages]")
309
+
310
+ st.subheader(":blue[Curse of Dimensionality]")
311
+ st.markdown('''
312
+ - As documents increase ↑, vocabulary increases ↑, and as vectors grow ↑, dimensionality also increases ↑
313
+ - ML performance decreases ↓ — as the dimensionality depends entirely on the vocabulary, and it shoots up as the documents increase and differ
314
+ - As the corpus increases, vocabulary increases -- dimensionality increases
315
+ ''')
316
+
317
+
318
+ st.subheader(":blue[Sparsity]")
319
+ st.markdown('''
320
+ - The vector which is created using BOW gives sparse vector
321
+ - Entire data is given to any algorithm and the machine is going to learn from the data, and the algorithm is biased towards zero values as the data is sparse data
322
+ - This issue in ML is known as overfitting
323
+ - It is solved in Deep learning
324
+ ''')
325
+
326
+ st.subheader(":blue[Out of Vocabulary Issue]")
327
+ st.markdown('''
328
+ - Documents are only converted during training time, and we're giving our own dataset
329
+ - If the word is not present in our dataset while training it can't convert into vector format results in key error
330
+ - This is solved by Fasttext
331
+ ''')
332
+
333
+ st.subheader(":blue[Inability to Preserve Semantic Meaning]")
334
+ st.markdown('''
335
+ - It can't completely preserve semantic meaning (slightly preserves it)
336
+ - Here, based on count (no. of times the particular word is occurring), it can sometimes preserve semantic meaning
337
+ - Based on uniqueness of the words the semantic meaning is preserved
338
+ - More the uniqueness , more the documents will be far away
339
+ - Less no.of unique words , it'll be close to each other
340
+ ''')
341
+
342
+ st.subheader(":blue[Lack of Sequential Information]")
343
+ st.markdown('''
344
+ - Sequential information is not preserved
345
+ ''')
346
+
347
+ st.code('''
348
+ from sklearn.feature_extraction.text import CountVectorizer
349
+
350
+ corpus = pd.DataFrame({"Review":["biryani is is is good","biryani is not good","biryani is too costly"]})
351
+ ## object of the CountVectorizer class
352
+
353
+ cv = CountVectorizer(lowercase=True,strip_accents="unicode",analyzer="word",stop_words=stp,token_pattern=r"(?u)\b\w\w+\b")
354
+ cv.fit(corpus["Review"]) ### learning vocabulary
355
+ vector = cv.transform(corpus["Review"]) ### it converts into vector form based on cv and vocabulary learned
356
+ vector.toarray()
357
+
358
+ ''')