Update pages/Introduction.py
Browse files- pages/Introduction.py +24 -2
pages/Introduction.py
CHANGED
|
@@ -67,8 +67,6 @@ st.write('**Term Frequency (TF)** \n - Measures how often a word appears in a si
|
|
| 67 |
st.write('**Inverse Document Frequency (IDF)** \n Measures how unique or rare a word is across all documents in the corpus. \n - Formula: \n _IDF_ = log(Total no. of documents / No. of documents containing the word) \n Words that appear in many documents (like "the" or "and") will have a low IDF value, while unique words (like "NLP") will have a higher IDF.')
|
| 68 |
st.write('**TF - IDF Score:** \n - Combines TF and IDF to calculate the importance of a word in a document. \n - Formula: \n _TF - IDF = TF x IDF_ \n Words that are frequent in a document but rare in the overall corpus get a higher score.')
|
| 69 |
|
| 70 |
-
st.write("Examples:")
|
| 71 |
-
|
| 72 |
st.write("""
|
| 73 |
**Example**
|
| 74 |
**Consider these two documents:**
|
|
@@ -88,3 +86,27 @@ st.write("""
|
|
| 88 |
- "NLP" gets a TF-IDF score of **1/3 × 0 = 0** (not unique).
|
| 89 |
- "love" and "amazing" get scores of **1/3 × 0.69 = 0.23** (more unique).
|
| 90 |
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
st.write('**Inverse Document Frequency (IDF)** \n Measures how unique or rare a word is across all documents in the corpus. \n - Formula: \n _IDF_ = log(Total no. of documents / No. of documents containing the word) \n Words that appear in many documents (like "the" or "and") will have a low IDF value, while unique words (like "NLP") will have a higher IDF.')
|
| 68 |
st.write('**TF - IDF Score:** \n - Combines TF and IDF to calculate the importance of a word in a document. \n - Formula: \n _TF - IDF = TF x IDF_ \n Words that are frequent in a document but rare in the overall corpus get a higher score.')
|
| 69 |
|
|
|
|
|
|
|
| 70 |
st.write("""
|
| 71 |
**Example**
|
| 72 |
**Consider these two documents:**
|
|
|
|
| 86 |
- "NLP" gets a TF-IDF score of **1/3 × 0 = 0** (not unique).
|
| 87 |
- "love" and "amazing" get scores of **1/3 × 0.69 = 0.23** (more unique).
|
| 88 |
""")
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
st.markdown('<p style="color:lightblue;"><b>c. Word Embeddings</b></p>', unsafe_allow_html=True)
|
| 92 |
+
st.write("Word embeddings are a type of representation for text where words are converted into dense numerical vectors. These vectors capture the semantic meaning of words and their relationships with other words in a way that computers can understand.")
|
| 93 |
+
|
| 94 |
+
import streamlit as st
|
| 95 |
+
|
| 96 |
+
st.write("""
|
| 97 |
+
**Word Embedding Techniques**
|
| 98 |
+
|
| 99 |
+
**1. Word2Vec**
|
| 100 |
+
Developed by Google, it uses two main approaches:
|
| 101 |
+
- **CBOW (Continuous Bag of Words):** Predicts a word based on its context.
|
| 102 |
+
- **Skip-Gram:** Predicts the context given a word.
|
| 103 |
+
|
| 104 |
+
**2. GloVe (Global Vectors)**
|
| 105 |
+
Developed by Stanford, it captures word relationships by analyzing co-occurrence statistics of words in a large corpus.
|
| 106 |
+
|
| 107 |
+
**3. FastText**
|
| 108 |
+
Developed by Facebook, it extends Word2Vec by considering subword information, making it better at handling rare and misspelled words.
|
| 109 |
+
|
| 110 |
+
**4. Transformers (Contextual Embeddings)**
|
| 111 |
+
Models like **BERT**, **ELMo**, and **GPT** generate embeddings based on the context in which a word appears, capturing nuanced meanings.
|
| 112 |
+
""")
|