Harika22 committed on
Commit
47fbe17
·
verified ·
1 Parent(s): 058b5ee

Update pages/6_Feature_Engineering.py

Browse files
Files changed (1) hide show
  1. pages/6_Feature_Engineering.py +61 -1
pages/6_Feature_Engineering.py CHANGED
@@ -237,4 +237,64 @@ if file_type == "One-Hot Vectorization":
237
  st.subheader(":blue[Lack of Sequential Information]")
238
  st.markdown('''
239
  - Sequential information is not preserved
240
- ''')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  st.subheader(":blue[Lack of Sequential Information]")
238
  st.markdown('''
239
  - Sequential information is not preserved
240
+ ''')
241
+
242
+
243
+ elif file_type == "Bag of Words(BOW)":
244
+ st.title(":red[Bag of Words(BOW)]")
245
+ st.markdown("""
246
+ ### 📌 What is Bag of Words(BOW)?
247
+ - It is a type of vectorization technique where text is converted into a numerical vector.
248
+ - BOW is used to overcome the problem of differing document lengths (documents of different lengths can't be converted directly into tabular data).
249
+ """)
250
+
251
+ st.markdown("""
252
+ ### 🛠️ Steps in Bag of Words(BOW):
253
+ - Create a Vocabulary ➡️ (A set of all unique words in the collected corpus).
254
+ - Find the Length of Vocabulary ➡️ (Total number of unique words = d-dimensions).
255
+ - Each document is converted into a d-dimensional vector
256
+ - Every dimension corresponds to a unique word
257
+ - Bag of Words is interested in how many times each word occurs
258
+ - If two documents are similar, that similarity is found from the same words repeating in both documents
259
+ - By converting documents into vectors we can concatenate all the vectors to form tabular data
260
+ - where rows are documents and columns represent features, which are the unique words
261
+ - Every dimension's value is a count
262
+ - how many times the word occurs in the document
263
+ """)
264
+ st.markdown(
265
+ "<div class='corpus-box'>"
266
+ "<strong>Document 1:</strong> I love cricket I <br>"
267
+ "<strong>Document 2:</strong> I hate cricket <br>"
268
+ "<strong>Document 3:</strong> I like cricket"
269
+ "</div>",
270
+ unsafe_allow_html=True,
271
+ )
272
+
273
+ st.subheader(":green[Unique Words (Vocabulary)]")
274
+ st.markdown(
275
+ "<p class='content'>The set of unique words in our corpus is: <strong>{I, love, cricket, hate, like}</strong>. "
276
+ "This set forms the vocabulary, and the number of unique words determines the vector dimensions.</p>",
277
+ unsafe_allow_html=True,
278
+ )
279
+
280
+ st.subheader(":green[Word Count Representation]")
281
+ st.markdown(
282
+ "<p class='content'>Each document is converted into a numerical vector by counting the occurrences of words "
283
+ "from the vocabulary within each document.</p>",
284
+ unsafe_allow_html=True,
285
+ )
286
+
287
+ st.markdown(
288
+ "<div class='vector-box'><strong>Vector Representation:</strong><br>"
289
+ "Document 1 ➝ [2,1,1,0,0] (I = 2, love = 1, cricket = 1, hate = 0, like = 0)<br>"
290
+ "Document 2 ➝ [1,0,1,1,0] (I = 1, love = 0, cricket = 1, hate = 1, like = 0)<br>"
291
+ "Document 3 ➝ [1,0,1,0,1] (I = 1, love = 0, cricket = 1, hate = 0, like = 1)"
292
+ "</div>",
293
+ unsafe_allow_html=True,
294
+ )
295
+
296
+ st.subheader(":green[Tabular Representation]")
297
+ st.markdown(
298
+ "<p class='content'>Since all three vectors have the same number of dimensions, we can merge them into a tabular format:</p>",
299
+ unsafe_allow_html=True,
300
+ )