Update pages/Introduction.py
Browse files- pages/Introduction.py +23 -1
pages/Introduction.py
CHANGED
|
@@ -65,4 +65,26 @@ st.write("The **TF-IDF Vectorizer** is a popular technique in Natural Language P
|
|
| 65 |
|
| 66 |
# Term Frequency section: definition and formula text handed to st.write as one string.
st.write('**Term Frequency (TF)** \n - Measures how often a word appears in a single document. \n - Formula: \n _TF_ = Number of times the word appears in the document / Total number of words in the document' )
|
| 67 |
# Inverse Document Frequency section: definition, formula, and an intuition for
# which words get low vs. high IDF. Wording fixed ("documents" typo, "number of")
# and the description is bulleted like the TF and TF-IDF sections.
st.write('**Inverse Document Frequency (IDF)** \n - Measures how unique or rare a word is across all documents in the corpus. \n - Formula: \n _IDF_ = log(Total number of documents / Number of documents containing the word) \n Words that appear in many documents (like "the" or "and") will have a low IDF value, while unique words (like "NLP") will have a higher IDF.')
|
| 68 |
-
st.write('**TF - IDF Score:** \n - Combines TF and IDF to calculate the importance of a word in a document. \n - Formula: \n
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
# Term Frequency section: definition and formula text handed to st.write as one string.
st.write('**Term Frequency (TF)** \n - Measures how often a word appears in a single document. \n - Formula: \n _TF_ = Number of times the word appears in the document / Total number of words in the document' )
|
| 67 |
# Inverse Document Frequency section: definition, formula, and an intuition for
# which words get low vs. high IDF. Wording fixed ("documents" typo, "number of")
# and the description is bulleted like the TF and TF-IDF sections.
st.write('**Inverse Document Frequency (IDF)** \n - Measures how unique or rare a word is across all documents in the corpus. \n - Formula: \n _IDF_ = log(Total number of documents / Number of documents containing the word) \n Words that appear in many documents (like "the" or "and") will have a low IDF value, while unique words (like "NLP") will have a higher IDF.')
|
| 68 |
+
# TF-IDF score section: states the product formula (TF x IDF) and that words frequent
# in one document but rare across the corpus score higher.
st.write('**TF - IDF Score:** \n - Combines TF and IDF to calculate the importance of a word in a document. \n - Formula: \n _TF - IDF = TF x IDF_ \n Words that are frequent in a document but rare in the overall corpus get a higher score.')
|
| 69 |
+
|
| 70 |
+
# Short lead-in label written on its own before the example content.
st.write("Examples:")
|
| 71 |
+
|
| 72 |
+
# Worked TF-IDF example on two three-word documents ("I love NLP", "NLP is amazing"),
# passed to st.write as a single Markdown-formatted string: TF per word, then IDF,
# then the TF x IDF product. The 0.69 figure is log(2) taken as a natural logarithm.
# NOTE(review): the `|`, `| NN |` and `+` lines inside the literal look like
# diff-viewer residue rather than intended content - confirm against the real file.
st.write("""
|
| 73 |
+
### Example
|
| 74 |
+
**Consider these two documents:**
|
| 75 |
+
|
| 76 |
+
- "I love NLP"
|
| 77 |
+
- "NLP is amazing"
|
| 78 |
+
|
| 79 |
+
#### Step 1: Calculate TF
|
| 80 |
+
- "NLP" appears once in each document, so its TF is **1/3** in both.
|
| 81 |
+
- Words like "love" and "amazing" also have a TF of **1/3**.
|
| 82 |
+
|
| 83 |
+
#### Step 2: Calculate IDF
|
| 84 |
+
- "NLP" appears in both documents, so its IDF is **log(2/2) = 0**.
|
| 85 |
+
- "love" and "amazing" appear in only one document each, so their IDF is **log(2/1) = 0.69**.
|
| 86 |
+
|
| 87 |
+
#### Step 3: Compute TF-IDF
|
| 88 |
+
- "NLP" gets a TF-IDF score of **1/3 × 0 = 0** (not unique).
|
| 89 |
+
- "love" and "amazing" get scores of **1/3 × 0.69 = 0.23** (more unique).
|
| 90 |
+
""")
|