import streamlit as st

st.markdown("""
    <style>
    /* Set a soft background color */
    body {
        background-color: #eef2f7;
    }
    /* Style for main title */
    h1 {
        color: black;
        font-family: 'Roboto', sans-serif;
        font-weight: 700;
        text-align: center;
        margin-bottom: 25px;
    }
    /* Style for headers */
    h2 {
        color: black;
        font-family: 'Roboto', sans-serif;
        font-weight: 600;
        margin-top: 30px;
    }
    
    /* Style for subheaders */
     h3 {
        color: red;
        font-family: 'Roboto', sans-serif;
        font-weight: 500;
        margin-top: 20px;
    }
    .custom-subheader {
        color: black;
        font-family: 'Roboto', sans-serif;
        font-weight: 600;
        margin-bottom: 15px;
    }
    /* Paragraph styling */
    p {
        font-family: 'Georgia', serif;
        line-height: 1.8;
        color: black;
        margin-bottom: 20px;
    }
    /* List styling with checkmark bullets */
    .icon-bullet {
        list-style-type: none;
        padding-left: 20px;
    }
    .icon-bullet li {
        font-family: 'Georgia', serif;
        font-size: 1.1em;
        margin-bottom: 10px;
        color: black;
    }
    .icon-bullet li::before {
        content: "◆";
        padding-right: 10px;
        color: black;
    }
    /* Sidebar styling */
    .sidebar .sidebar-content {
        background-color: #ffffff;
        border-radius: 10px;
        padding: 15px;
    }
    .sidebar h2 {
        color: #495057;
    }
    .step-box {
        font-size: 18px;
        background-color: #F0F8FF;
        padding: 15px;
        border-radius: 10px;
        box-shadow: 2px 2px 8px #D3D3D3;
        line-height: 1.6;
    }
    .box {
        font-size: 18px;
        background-color: #F0F8FF;
        padding: 15px;
        border-radius: 10px;
        box-shadow: 2px 2px 8px #D3D3D3;
        line-height: 1.6;
    }

    .title {
        font-size: 26px;
        font-weight: bold;
        color: #E63946;
        text-align: center;
        margin-bottom: 15px;
    }
    .formula {
        font-size: 20px;
        font-weight: bold;
        color: #2A9D8F;
        background-color: #F7F7F7;
        padding: 10px;
        border-radius: 5px;
        text-align: center;
        margin-top: 10px;
    }
    /* Custom button style */
    .streamlit-button {
        background-color: #00FFFF;
        color: #000000;
        font-weight: bold;
    }
    </style>
    """, unsafe_allow_html=True)

st.markdown("<h1 class='header-title'>πŸ› οΈ Feature Engineering πŸ“Œ</h1>", unsafe_allow_html=True)

st.markdown(
    """
    <div class='info-box'>
        <p>🔹 <span class='highlight'>Feature Engineering</span> is the technique of creating new, useful features from the existing features in collected data.</p>
        <p> These engineered features enhance machine learning models.</p>
        <p> Feature Extraction is a subpart of feature engineering.</p>
    </div>
    """,
    unsafe_allow_html=True
)

st.subheader(":violet[Feature Extraxtion]")
st.markdown(
    """
    <div class='info-box'>
        <p>πŸ“ Feature Extraction is the process where text data which is natural language is given to machine to understand the natural language.</p>
        <ul>
            <li>Text is converted into vectors using specific algorithms.</li>
            <li>Preserving meaningful information is key.</li>
            <li>Helps in better text analysis & machine learning</li>
        </ul>
    </div>
    """,
    unsafe_allow_html=True
)

st.header("Vectorization🧭")
st.markdown(
    """
    <div class='info-box'>
        <p>Vectorization is the process of converting text into vectors.</p>
        <p>This allows ML models to process text data effectively.</p>
    </div>
    """,
    unsafe_allow_html=True
)

st.subheader(":violet[Vectorization techniques]")
st.markdown("""
    There are different techniques to convert text into vector format. They are:
    <ul class="icon-bullet">
        <li>One-Hot Vectorization </li>
        <li>Bag of Words(BOW) </li>
        <li>Term Frequency - Inverse Document Frequency(TF-IDF)</li>
    </ul>
""", unsafe_allow_html=True)

st.markdown("""
    There are advance vectorization techniques.They are :
    <ul class="icon-bullet">
        <li>Word Embedding </li>
            <li>Word2Vec </li>
            <li>Fasttext</li>
    </ul>
""", unsafe_allow_html=True) 

st.sidebar.title("Navigation 🧭")
file_type = st.sidebar.radio(
    "Choose a Vectorization technique :",
    ("One-Hot Vectorization", "Bag of Words(BOW)", "Term Frequency - Inverse Document Frequency(TF-IDF)"))

if file_type == "One-Hot Vectorization":
    st.title(":red[One-Hot Vectorization]")
    st.markdown("""
        ### 📌 What is One-Hot Vectorization?
        -  It is a type of vectorization technique where text is converted into a numerical vector.
        -  This technique helps in representing words as unique vectors for machine learning models.
    """)

    st.markdown("""
        ### 🛠️ Steps in One-Hot Vectorization:
         - Create a Vocabulary ➑️ (A set of all unique words in the collected corpus).  
         - Find the Length of Vocabulary ➑️ (Total number of unique words = d-dimensions).  
         - Convert Each Word into a Vector:
           -  Every unique word is transformed into a vector.
           -  Each vector has d-dimensions, where each dimension corresponds to a unique word.
           -  Words are converted individually, and then combined to form a vector.

         This technique ensures that each word is treated uniquely and efficiently in NLP tasks.
        """)

    st.markdown("""
        -  Each word gets a unique vector representation.
        -  The number of dimensions = total vocabulary size.
        -  Words are vectorized separately, then combined into document vectors.
    """)

    st.markdown("""
        | **Word** | **Vector Representation** |
        |----------|--------------------------|
        |  **toy**  | [1,0,0,0,0] |
        |  **is**   | [0,1,0,0,0] |
        |  **good** | [0,0,1,0,0] |
        |  **not**  | [0,0,0,1,0] |
        |  **bad**  | [0,0,0,0,1] |
    """, unsafe_allow_html=True)

    st.markdown("""
        - d₁ → v₁ → `[[1,0,0,0,0] , [0,1,0,0,0] , [0,0,1,0,0]]`  
        - d₂ → v₂ → `[[1,0,0,0,0] , [0,1,0,0,0] , [0,0,0,1,0] , [0,0,1,0,0]]`  
        - d₃ → v₃ → `[[0,0,0,0,1], [1,0,0,0,0]]`  

     This One-Hot Vectorization technique converts words into numerical vectors while preserving their uniqueness.  
    """)

    st.markdown("""
        ###  Key Takeaways:
        -  Each word is represented as a 5-dimensional vector.  
        -  Every dimension corresponds to a unique word in the vocabulary.  
        -  This method is useful for transforming text into a numerical format for Machine Learning tasks.
    """)

    st.subheader(":red[Advantages]")
    st.markdown('''
    - One-Hot Vectorization is easy to implement
    ''')
    st.subheader(":red[Disadvantages]")

    st.subheader(":blue[Different Document Length]")
    st.markdown('''
    - Every document has a different number of words (here we're not converting a document to a vector, we're converting each word to a vector)
        - We can't convert the result into tabular data
        - Converting into tabular data becomes possible once we convert whole documents into vectors (this is solved by Bag of Words(BOW))
    ''')

    st.subheader(":blue[Sparsity]")
    st.markdown('''
        - The vector created using one-hot vectorization is a sparse vector
        - When the entire data is given to an algorithm, the machine learning from this sparse data becomes biased towards the zero values
        - This issue in ML is known as overfitting
        - It is solved in Deep Learning
    ''')

    st.subheader(":blue[Curse of Dimensionality]")
    st.markdown('''
        - As documents increase ↑, the vocabulary increases ↑, and vector dimensionality also increases ↑
        - ML performance decreases ↓, since the dimensionality depends entirely on the vocabulary and shoots up as documents grow and diversify
    ''')

    st.subheader(":blue[Out of Vocabulary Issue]")
    st.markdown('''
        - Documents are only converted during training time, using our own dataset
        - If a word was not present in the dataset during training, it can't be converted into vector format, which results in a key error (see the sketch below)
        - This is solved by FastText
    ''')
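
    st.markdown("A tiny illustration of the key error with a naive word-to-index lookup (names are hypothetical):")
    st.code('''
            word_to_index = {"toy": 0, "is": 1, "good": 2, "not": 3, "bad": 4}

            def one_hot(word):
                vec = [0] * len(word_to_index)
                vec[word_to_index[word]] = 1  # fails for unseen words
                return vec

            one_hot("excellent")  # KeyError: 'excellent'
    ''')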

    st.subheader(":blue[Inability to Preserve Semantic Meaning]")
    st.markdown('''
        - While converting text → vector format, the semantic relationships between words should be preserved
        - We need to convert documents into vectors in such a way that similar words end up close together
        - Similarity ⬆️ and Distance ⬇️
        - Similarity ∝ 1 / Distance
        - The distance between vectors of similar words should be very small
        - If this is satisfied, the technique preserves semantic meaning well
    ''')
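
    st.markdown("A quick check of why one-hot fails here: every pair of distinct words ends up equally far apart (the words below are hypothetical; cosine similarity is used as the closeness measure):")
    st.code('''
            import numpy as np

            good  = np.array([0, 0, 1, 0, 0])
            great = np.array([0, 0, 0, 1, 0])  # semantically close to "good"
            bad   = np.array([0, 0, 0, 0, 1])  # semantically opposite

            def cosine(a, b):
                return a @ b / (np.linalg.norm(a) * np.linalg.norm(b))

            # both similarities are 0.0, so "good" is no closer to "great" than to "bad"
            cosine(good, great), cosine(good, bad)
    ''')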

    st.subheader(":blue[Lack of Sequential Information]")
    st.markdown('''
        - Sequential information is not preserved 
    ''')


elif file_type == "Bag of Words(BOW)":
    st.title(":red[Bag of Words(BOW)]")
    st.markdown("""
        ### 📌 What is Bag of Words(BOW)?
        -  It is a type of vectorization technique where text is converted into a numerical vector.
        -  BOW is implemented to overcome the different-document-length problem of one-hot vectorization (which can't be converted into tabular data).
    """)

    st.markdown("""
        ### 🛠️ Steps in Bag of Words(BOW):
         - Create a Vocabulary ➑️ (A set of all unique words in the collected corpus).  
         - Find the Length of Vocabulary ➑️ (Total number of unique words = d-dimensions).  
             - Each document is converted into a vector with d dimensions
             - Every dimension belongs to a unique word
         - Bag of Words is interested in how many times each word occurs
         - If two documents are similar, that similarity shows up through the same words repeating across them
         - By converting documents into vectors we can concatenate all the vectors to form tabular data
             - where rows are documents and columns represent features, which are the unique words
             - every dimension value is a count: how many times the word occurs in the document
        """)
    st.markdown(
    "<div class='corpus-box'>"
    "<strong>Document 1:</strong> I love cricket I <br>"
    "<strong>Document 2:</strong> I hate cricket <br>"
    "<strong>Document 3:</strong> I like cricket"
    "</div>",
    unsafe_allow_html=True,
    )

    st.subheader(":green[Unique Words (Vocabulary)]")
    st.markdown(
    "<p class='content'>The set of unique words in our corpus is: <strong>{I, love, cricket, hate, like}</strong>. "
    "This set forms the vocabulary, and the number of unique words determines the vector dimensions.</p>",
    unsafe_allow_html=True,
    )

    st.subheader(":green[Word Count Representation]")
    st.markdown(
    "<p class='content'>Each document is converted into a numerical vector by counting the occurrences of words "
    "from the vocabulary within each document.</p>",
    unsafe_allow_html=True,
    )

    st.markdown(
    "<div class='vector-box'><strong>Vector Representation:</strong><br>"
    "Document 1 ➝ [2,1,1,0,0] (I = 2, love = 1, cricket = 1, hate = 0, like = 0)<br>"
    "Document 2 ➝ [1,0,1,1,0] (I = 1, love = 0, cricket = 1, hate = 1, like = 0)<br>"
    "Document 3 ➝ [1,0,1,0,1] (I = 1, love = 0, cricket = 1, hate = 0, like = 1)"
    "</div>",
    unsafe_allow_html=True,
    )

    st.subheader(":green[Tabular Representation]")
    st.markdown(
    "<p class='content'>Since all three vectors have the same number of dimensions, we can merge them into a tabular format:</p>",
    unsafe_allow_html=True,
    )
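
    st.markdown("A minimal sketch of the counting above (pure Python for the counts; pandas only to show the table):")
    st.code('''
            import pandas as pd

            docs = ["I love cricket I", "I hate cricket", "I like cricket"]
            vocab = ["I", "love", "cricket", "hate", "like"]

            # each row is a document, each column a unique word, each value a count
            rows = [[doc.split().count(word) for word in vocab] for doc in docs]
            table = pd.DataFrame(rows, columns=vocab)
            # rows: [2,1,1,0,0], [1,0,1,1,0], [1,0,1,0,1]
    ''')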

    st.subheader(":red[Advantages]")
    st.markdown('''
    - Bag of Words(BOW) is easy to implement
    - The documents can be converted into tabular data
    ''')

    st.subheader(":red[Disadvantages]")

    st.subheader(":blue[Curse of Dimensionality]")
    st.markdown('''
        - As documents increase ↑, the vocabulary increases ↑, and vector dimensionality also increases ↑
        - ML performance decreases ↓, since the dimensionality depends entirely on the vocabulary and shoots up as documents grow and diversify
        - As the corpus increases, the vocabulary increases -- so the dimensionality increases
    ''')

    
    st.subheader(":blue[Sparsity]")
    st.markdown('''
        - The vector created using BOW is a sparse vector
        - When the entire data is given to an algorithm, the machine learning from this sparse data becomes biased towards the zero values
        - This issue in ML is known as overfitting
        - It is solved in Deep Learning
    ''')

    st.subheader(":blue[Out of Vocabulary Issue]")
    st.markdown('''
        - Documents are only converted during training time, using our own dataset
        - If a word was not present in the dataset during training, it can't be converted into vector format, which results in a key error
        - This is solved by FastText
    ''')

    st.subheader(":blue[Inability to Preserve Semantic Meaning]")
    st.markdown('''
    - It can't completely preserve semantic meaning (it preserves it only slightly)
    - Based on the count (the number of times a particular word occurs) it can sometimes preserve semantic meaning
    - The semantic meaning preserved depends on the uniqueness of the words
    - The more unique words two documents have, the farther apart their vectors will be
    - The fewer unique words, the closer they will be to each other
    ''')

    st.subheader(":blue[Lack of Sequential Information]")
    st.markdown('''
        - Sequential information is not preserved 
    ''')

    st.code(r'''
            import pandas as pd
            from sklearn.feature_extraction.text import CountVectorizer

            corpus = pd.DataFrame({"Review":["biryani is is is good","biryani is not good","biryani is too costly"]})
            stp = ["is", "too"]  # example stop-word list (any list of words to drop)

            ## object of the CountVectorizer class
            cv = CountVectorizer(lowercase=True, strip_accents="unicode", analyzer="word",
                                 stop_words=stp, token_pattern=r"(?u)\b\w\w+\b")
            cv.fit(corpus["Review"])  ### learning the vocabulary
            vector = cv.transform(corpus["Review"])  ### converts to vectors using the learned vocabulary
            cv.get_feature_names_out()  ### feature (column) names
            cv.vocabulary_  ### word -> column-index mapping
            vector.toarray()  ### dense count matrix
    ''')

    st.header("Binary Bag of Words(BBOW)")
    st.markdown('''
    - Binary Bag of Words(BBOW) is an extension of Bag of Words(BOW)
    ''')

    st.markdown("""
        ### 🛠️ Steps in Binary Bag of Words(BBOW):
        - Create a vocabulary (set of unique words)
        - Each document is converted into vector form (d-dimension)
        - In Bag of Words the value is a count, but in Binary Bag of Words it only tells whether the word is present or not
        - So it is much easier to find the distance between vectors (here the distance is nothing but the number of differing unique words)
        - If there are more unique words --> the distance is higher
        - Calculating the distance is much faster than with Bag of Words (see the sketch below)
            - the distance is the total number of unique words that differ between two documents
    """)


elif file_type == "Term Frequency - Inverse Document Frequency(TF-IDF)":
    st.title(":red[Term Frequency - Inverse Document Frequency(TF-IDF)]")
    st.markdown("""
        ### 📌 What is TF-IDF?
        -  It is a type of vectorization technique where text is converted into a numerical vector.
    """)

    st.subheader(":violet[πŸ› οΈ Steps in TF-IDF]")

    st.markdown(
    """
        <ul>
            <li><strong>Create a vocabulary:</strong> A set of unique words from the corpus.</li>
            <li><strong>Convert each document into a vector:</strong> A d-dimensional representation.</li>
            <li><strong>Calculate Term Frequency (TF):</strong> Measures the importance of a word within a document.</li>
        </ul>
    """,
    unsafe_allow_html=True,
    )

    st.markdown("<div class='formula'>TF(wα΅’, dα΅’) = (Occurrences of wα΅’ in dα΅’) / (Total words in dα΅’)</div>", unsafe_allow_html=True)

    st.markdown(
    """
        <ul>
            <li><strong>Compute Inverse Document Frequency (IDF):</strong> Measures how important a word is across all documents.</li>
            <li><strong>For every word in the vocabulary, apply IDF:</strong></li>
        </ul>
    """,
    unsafe_allow_html=True,
    )

    st.markdown("<div class='formula'>IDF(wα΅’, C) = log(N/n)</div>", unsafe_allow_html=True)

    st.markdown(
    """
        - <strong>N:</strong> Total number of documents in the corpus.<br>
        - <strong>n:</strong> Number of documents containing the word wᵢ.<br>
        - TF-IDF helps in understanding word significance while reducing the impact of commonly used words.
    """,
    unsafe_allow_html=True,
    )
    st.markdown("<h1 class='title'>πŸ“Œ Example of TF-IDF</h1>", unsafe_allow_html=True)

    st.markdown(
    """
    <div class='box'>
        <strong>Given a corpus with 3 documents:</strong><br><br>
        <strong>d1:</strong> w1, w2, w3, w1 → v1 <br>
        <strong>d2:</strong> w1, w2, w2, w3, w4, w2, w3 → v2 <br>
        <strong>d3:</strong> w1, w5 → v3 <br><br>
        <strong>Vocabulary:</strong> {w1, w2, w3, w4, w5} <br>
        <strong>Vocabulary Size:</strong> 5 (d-dimension)
    </div>
    """,
    unsafe_allow_html=True,
    )

    st.markdown("<h2 style='color: #6A0572;'>πŸ“Š Term Frequency (TF) Calculation</h2>", unsafe_allow_html=True)

    st.markdown(
    """
        <ul>
            <li>TF measures how often a word appears in a document.</li>
            <li>Formula: <span class='highlight'>TF(wᵢ, dᵢ) = (Occurrences of wᵢ in dᵢ) / (Total words in dᵢ)</span></li>
            <li>TF values change based on the document.</li>
        </ul>
    """,
    unsafe_allow_html=True,
    )

    st.markdown(
    """
    <div class='formula'>
        TF(w1, d1) = 2/4 = 0.5 <br>
        TF(w2, d1) = 1/4 = 0.25 <br>
        TF(w3, d1) = 1/4 = 0.25 <br>
        TF(w4, d1) = 0/4 = 0 <br>
        TF(w5, d1) = 0/4 = 0 <br>
    </div>
    """,
    unsafe_allow_html=True,
    )

    st.markdown(
    """
        <ul>
            <li>TF values always range from <strong>0 to 1</strong>.</li>
            <li>Case-1: <span class='highlight'>TF = 0</span> → Word is not present in the document.</li>
            <li>Case-2: <span class='highlight'>TF = 1</span> → Word is the only word in the document.</li>
        </ul>
    """,
    unsafe_allow_html=True,
    )

    st.markdown("<h2 style='color: #6A0572;'>πŸ“‰ Inverse Document Frequency (IDF) Calculation</h2>", unsafe_allow_html=True)

    st.markdown(
    """
        <ul>
            <li>IDF measures how important a word is across the entire corpus.</li>
            <li>Formula: <span class='highlight'>IDF(wᵢ, C) = log(N/n)</span></li>
            <li>N = Total number of documents.</li>
            <li>n = Number of documents containing wᵢ.</li>
            <li>IDF values range from <strong>0 to ∞</strong>.</li>
        </ul>
    """,
    unsafe_allow_html=True,
    )

    st.markdown("<h2 style='color: #6A0572;'>πŸ“Œ TF-IDF Calculation</h2>", unsafe_allow_html=True)

    st.markdown(
    """
        <ul>
            <li>We calculate TF-IDF by multiplying TF and IDF values.</li>
            <li>Formula: <span class='highlight'>TF-IDF = TF * IDF</span></li>
            <li>TF-IDF helps reduce the impact of frequent words while keeping rare words important.</li>
        </ul>
    """,
    unsafe_allow_html=True,
    )

    st.markdown(
    """
    <div class='formula'>
        d1 → v1 = [0, 0.04, 0.04, 0, 0] (TF * IDF values)
    </div>
    """,
    unsafe_allow_html=True,
    )

    st.markdown(
    """
        - The final TF-IDF values may be low, high, or even zero depending on term frequency and document frequency.
    """,
    unsafe_allow_html=True,
    )
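
    st.markdown("A hand-rolled sketch of the computation above, using the formulas as given (base-10 log; note that scikit-learn's TfidfVectorizer uses a smoothed variant of IDF):")
    st.code('''
            import math

            docs = [["w1","w2","w3","w1"],
                    ["w1","w2","w2","w3","w4","w2","w3"],
                    ["w1","w5"]]
            vocab = ["w1","w2","w3","w4","w5"]
            N = len(docs)

            def tf(word, doc):
                return doc.count(word) / len(doc)

            def idf(word):
                n = sum(1 for d in docs if word in d)
                return math.log10(N / n)

            v1 = [round(tf(w, docs[0]) * idf(w), 2) for w in vocab]
            # v1 == [0.0, 0.04, 0.04, 0.0, 0.0]
    ''')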

    st.markdown("<h1 class='title'>πŸ“Œ TF-IDF Key Insights</h1>", unsafe_allow_html=True)

    st.markdown(
    """
        <h3 style='color: #6A0572;'>📈 Case 1: High TF-IDF Values</h3>
        <ul>
            <li>If the word appears <strong>frequently</strong> in a document → <span class='highlight'>High TF-IDF</span></li>
        </ul>
    """,
    unsafe_allow_html=True,
    )

    st.markdown(
    """
        <h3 style='color: #6A0572;'>📉 Case 2: Low TF-IDF Values</h3>
        <ul>
            <li>If the word appears <strong>rarely</strong> in a document → <span class='highlight'>Low TF-IDF</span></li>
            <li>TF is always in the range: <strong>[0 - 1]</strong></li>
            <li>IDF is in the range: <strong>[0 - ∞)</strong></li>
        </ul>
    """,
    unsafe_allow_html=True,
    )

    st.markdown(
    """
        <h3 style='color: #6A0572;'>📊 Understanding TF (Term Frequency)</h3>
        <ul>
            <li>TF gives <strong>more importance</strong> to words that occur <strong>frequently</strong> in a document.</li>
            <li>As the word frequency <span class='highlight'>increases</span> → TF <span class='highlight'>increases</span>.</li>
        </ul>
    """,
    unsafe_allow_html=True,
    )

    st.markdown(
    """
        <h3 style='color: #6A0572;'>📉 Understanding IDF (Inverse Document Frequency)</h3>
        <ul>
            <li>IDF Formula: <span class='highlight'>IDF(wᵢ, C) = log(N/n)</span></li>
            <li><strong>N:</strong> Total number of documents</li>
            <li><strong>n:</strong> Number of documents containing the word</li>
        </ul>
    """,
    unsafe_allow_html=True,
    )

    st.markdown(
    """
    <div class='formula'>
        <strong>When n is small:</strong> <br>
        - N/n increases → log(N/n) increases ⬆️ <br>
        - Word is rare in the corpus → Higher importance in IDF <br><br>
        <strong>When n is large:</strong> <br>
        - N/n decreases → log(N/n) decreases ⬇️ <br>
        - Word is common → Lower importance in IDF <br><br>
        <strong>When N = n:</strong> log(N/n) = 0 (word appears in every document)
    </div>
    """,
    unsafe_allow_html=True,
    )
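
    st.markdown("A quick numeric check of the three cases (assuming N = 3 documents and base-10 log, as in the example above):")
    st.code('''
            import math

            N = 3
            for n in (1, 2, 3):
                print(n, round(math.log10(N / n), 3))
            # n = 1 (rare word)         -> 0.477  (highest importance)
            # n = 2                     -> 0.176
            # n = 3 (in every document) -> 0.0    (no importance)
    ''')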

    st.markdown(
    """
        <h3 style='color: #6A0572;'>📌 TF-IDF Calculation</h3>
        <ul>
            <li><strong>TF</strong> focuses on words <strong>frequent</strong> in a document.</li>
            <li><strong>IDF</strong> focuses on words <strong>rare</strong> in the corpus.</li>
            <li><span class='highlight'>TF-IDF is high</span> for words that appear <strong>often in a document</strong> but <strong>rarely in the corpus</strong>.</li>
        </ul>
    """,
    unsafe_allow_html=True,
    )

    st.subheader(":red[Why log is used]")
    st.markdown("<h1 class='title'>πŸ“Œ Understanding TF-IDF Scaling</h1>", unsafe_allow_html=True)

    st.markdown(
    """
        <h3 style='color: #6A0572;'> Minimum and Maximum Values of N/n</h3>
        <ul>
            <li>When <strong>n is maximum</strong> → <span class='highlight'>N/n = 1</span></li>
            <li>At <strong>training time</strong>: <span class='highlight'>1 ≤ n ≤ N</span></li>
            <li>At <strong>test time</strong>: <span class='highlight'>0 ≤ n ≤ N</span> (due to Out-of-Vocabulary words)</li>
        </ul>
    """,
    unsafe_allow_html=True,
    )

    st.markdown(
    """
        <h3 style='color: #6A0572;'> IDF Dominance Over TF</h3>
        <ul>
            <li>If <strong>n decreases</strong> → <span class='highlight'>N/n increases (max)</span></li>
            <li>TF scale is very <span class='highlight'>small</span>, but IDF scale is very <span class='highlight'>high</span></li>
            <li>IDF can <span class='highlight'>dominate</span> TF, favoring rare words over frequent ones</li>
        </ul>
    """,
    unsafe_allow_html=True,
    )

    st.markdown(
    """
        <h3 style='color: #6A0572;'>How Log Solves IDF Dominance?</h3>
        <ul>
            <li>Applying <span class='highlight'>log</span> reduces the dominance of IDF</li>
            <li>Logarithm <span class='highlight'>compresses</span> large values to a balanced scale</li>
            <li>It prevents bias towards rare words and maintains proportionality</li>
        </ul>
    """,
    unsafe_allow_html=True,
    )

    st.markdown(
    """
    <div class='formula'>
        <strong>TF balances frequent words, while log(IDF) prevents rare-word dominance! 🚀</strong>
    </div>
    """,
    unsafe_allow_html=True,
    )

    st.subheader(":red[Advantages]")
    st.markdown('''
    - Easy to implement
    - Can convert into tabular format
    - It gives importance both to words occurring frequently in a document and to words occurring rarely in the corpus
    ''')
    st.subheader(":red[Disadvantages]")
    
    st.subheader(":blue[Curse of Dimensionality]")
    st.markdown('''
        - As documents increase ↑, the vocabulary increases ↑, and vector dimensionality also increases ↑
        - ML performance decreases ↓, since the dimensionality depends entirely on the vocabulary and shoots up as documents grow and diversify
        - As the corpus increases, the vocabulary increases -- so the dimensionality increases
    ''')

    
    st.subheader(":blue[Sparsity]")
    st.markdown('''
        - The vector created using TF-IDF is a sparse vector
        - When the entire data is given to an algorithm, the machine learning from this sparse data becomes biased towards the zero values
        - This issue in ML is known as overfitting
        - It is solved in Deep Learning
    ''')

    st.subheader(":blue[Out of Vocabulary Issue]")
    st.markdown('''
        - Documents are only converted during training time, using our own dataset
        - If a word was not present in the dataset during training, it can't be converted into vector format, which results in a key error
        - This is solved by FastText
    ''')

    st.subheader(":blue[Inability to Preserve Semantic Meaning]")
    st.markdown('''
    - It slightly preserves semantic meaning
    ''')

    st.subheader(":blue[Lack of Sequential Information]")
    st.markdown('''
        - Sequential information is not preserved
        - This is because in TF-IDF we give importance to individual words, since we're doing word tokenization
        - In ML, no algorithm is capable of preserving sequential information
        - This is only solved by Deep Learning
        - But by applying a trick to BOW/BBOW/TF-IDF we can slightly preserve sequential information
        - That technique is known as n-gram
    ''')

    st.header(":red[n-gram]")
    st.markdown('''
    - The n-gram default is always 1-gram in BOW/BBOW/TF-IDF
    - The vocabulary is created based only on the chosen n-gram
    - n-grams are mostly used only up to 1, 2, or 3 grams, because as the dimensionality increases, ML performance decreases
    - n-grams are used to slightly preserve sequential information (see the sketch below)
    ''')
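
    st.markdown("In scikit-learn, the n-gram range is controlled by the ngram_range parameter (the corpus below is an assumption for illustration):")
    st.code('''
            from sklearn.feature_extraction.text import CountVectorizer

            corpus = ["biryani is good", "biryani is not good"]
            cv = CountVectorizer(ngram_range=(1, 2))  # unigrams + bigrams
            cv.fit(corpus)
            cv.get_feature_names_out()
            # bigram features like "not good" retain a little word order
    ''')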

    st.code('''
            import pandas as pd
            from sklearn.feature_extraction.text import TfidfVectorizer

            corpus = pd.DataFrame({"Review":["biryani is is is is  résume is good","biryani biryani biryani is not good","biryani is too costly"]})
            tf = TfidfVectorizer()

            vector = tf.fit_transform(corpus["Review"])  ### learns the vocabulary and vectorizes in one step
            vector.toarray()  ### dense TF-IDF matrix
            tf.vocabulary_  ### word -> column-index mapping
    ''')