Update pages/3_Terminology.py
Browse files- pages/3_Terminology.py +55 -79
pages/3_Terminology.py
CHANGED
|
@@ -76,85 +76,61 @@ st.markdown("""
|
|
| 76 |
</style>
|
| 77 |
""", unsafe_allow_html=True)
|
| 78 |
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
st.markdown(
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
)
|
| 85 |
-
|
| 86 |
-
st.
|
| 87 |
-
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
st.
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
st.header("
|
| 96 |
-
st.markdown(
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
st.
|
| 102 |
-
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
st.header("Character")
|
| 106 |
-
st.markdown('''
|
| 107 |
-
- Character can either be in number , alphabets or special symbol.
|
| 108 |
-
''')
|
| 109 |
-
|
| 110 |
-
st.header("Tokenization")
|
| 111 |
-
st.markdown('''
|
| 112 |
-
- It is a technique by using which we can convert a huge chunk into small entity where those small entities are known as tokens.
|
| 113 |
-
''')
|
| 114 |
-
|
| 115 |
-
st.subheader("Types of Tokenization")
|
| 116 |
st.markdown("""
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
-
st.subheader("
|
| 125 |
-
st.markdown(
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
st.
|
| 131 |
-
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
st.subheader("Character tokenization")
|
| 135 |
-
st.markdown('''
|
| 136 |
-
- It is a technique by using which we can convert a huge chunk into small entity where those small entities are known as tokens which are in characters.
|
| 137 |
-
''')
|
| 138 |
-
|
| 139 |
-
st.header("Stop Words")
|
| 140 |
-
st.markdown('''
|
| 141 |
-
- They are set of words which didn't have impact on the meaning of sentence / paragraph
|
| 142 |
-
- Stop words are used to make the grammar very clear
|
| 143 |
-
''')
|
| 144 |
-
|
| 145 |
-
st.header("Vectorization")
|
| 146 |
-
st.markdown('''
|
| 147 |
-
- It is a technique which helps us to convert a text into vector format
|
| 148 |
-
''')
|
| 149 |
-
|
| 150 |
-
st.subheader("Different types of techniques")
|
| 151 |
st.markdown("""
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
|
|
|
| 76 |
</style>
|
| 77 |
""", unsafe_allow_html=True)
|
| 78 |
|
| 79 |
# --- NLP Terminology glossary page body ------------------------------------
# Renders a glossary of core NLP terms as Streamlit headers + markdown
# bullets.  Relies on `st` (streamlit) imported at the top of this file and
# on the CSS classes ('title', 'caption') injected by the <style> block above.
#
# NOTE(review): the heading emoji were mojibake in the pasted source (UTF-8
# emoji bytes decoded through a legacy Greek codepage, e.g. "π", "βοΈ");
# they have been restored to plausible equivalents — confirm against the
# original file.

# Page title and tagline (styled via the CSS classes injected above).
st.markdown("<h1 class='title'>📘 NLP Terminology</h1>", unsafe_allow_html=True)
st.markdown("<p class='caption'>✨ Explore essential terms in Natural Language Processing and their meanings!...</p>", unsafe_allow_html=True)

# Core text-hierarchy terms, from largest unit (corpus) down to character.
st.header("📚 Corpus")
st.markdown("- **A corpus** is a collection of documents.")

st.header("📄 Document")
st.markdown("- **A document** is a collection of sentences, paragraphs, single words, or even single characters.")

st.header("📝 Paragraph")
st.markdown("- **A paragraph** consists of multiple sentences.")

st.header("🔢 Sentence")
st.markdown("- **A sentence** is a collection of words.")

st.header("🔤 Word")
st.markdown("- **Words** are made up of characters.")

st.header("🔠 Character")
st.markdown("- **A character** can be a number, alphabet, or special symbol.")

st.header("✂️ Tokenization")
# Wording tightened from the original broken-English sentence ("...convert a
# huge chunk into small entity where those small entities are known as tokens").
st.markdown("- **Tokenization** is a technique that splits a large chunk of text into smaller units, known as tokens.")

st.subheader("🛠️ Types of Tokenization")
st.markdown("""
- 🔹 **Sentence Tokenization** – Splits text into sentences.
- 🔹 **Word Tokenization** – Splits sentences into words.
- 🔹 **Character Tokenization** – Splits words into individual characters.
""")

st.subheader("📌 Sentence Tokenization")
st.markdown("- **Breaks a large text into meaningful sentence units.**")

st.subheader("🔍 Word Tokenization")
st.markdown("- **Splits a sentence into individual words.**")

st.subheader("🔡 Character Tokenization")
st.markdown("- **Breaks words into separate characters.**")

st.header("🚫 Stop Words")
st.markdown("- **Common words** (e.g., 'the', 'is', 'and') that do not add meaning to the text but maintain grammatical structure.")

st.header("📊 Vectorization")
st.markdown("- **Transforms text into numerical representation** for machine learning models.")

st.subheader("🔢 Different Types of Vectorization Techniques")
st.markdown("""
- 🎯 **One-Hot Encoding**
- 🏷️ **Bag of Words (BoW)**
- 📈 **TF-IDF (Term Frequency-Inverse Document Frequency)**
- 🧠 **Word2Vec**
- 🌍 **GloVe**
- ⚡ **FastText**
""")

# Closing callout.
st.success("🎉 Mastering these **NLP terminologies** will help you build powerful text-processing applications!")