Harika22 committed on
Commit
bcbd8c9
·
verified ·
1 Parent(s): 628ffb3

Update pages/3_Terminology.py

Browse files
Files changed (1) hide show
  1. pages/3_Terminology.py +45 -84
pages/3_Terminology.py CHANGED
@@ -100,61 +100,37 @@ st.markdown(
100
  "<p class='caption'>Explore essential terms in Natural Language Processing and their meanings!...</p>",
101
  unsafe_allow_html=True,
102
  )
 
 
 
 
103
 
104
- st.markdown(
105
- """
106
- <p class="section"><span class="term">Documents</span><br>
107
- Document is defined as collection of sentence / paragraph / single word / single character
108
- </p>
109
- """,
110
- unsafe_allow_html=True,
111
- )
112
 
113
- st.markdown(
114
- """
115
- <p class="section"><span class="term">Paragraph</span><br>
116
- Paragraph is defined as collection of sentence.
117
- </p>
118
- """,
119
- unsafe_allow_html=True,
120
- )
121
 
122
- st.markdown(
123
- """
124
- <p class="section"><span class="term">Sentence</span><br>
125
- Sentence is defined as collection of words.
126
- </p>
127
- """,
128
- unsafe_allow_html=True,
129
- )
130
 
131
- st.markdown(
132
- """
133
- <p class="section"><span class="term">Words</span><br>
134
- Words are defined as collection of characters
135
- </p>
136
- """,
137
- unsafe_allow_html=True,
138
- )
139
 
140
- st.markdown(
141
- """
142
- <p class="section"><span class="term">Character</span><br>
143
- Character can either be in number , alphabets or special symbol.
144
- </p>
145
- """,
146
- unsafe_allow_html=True,
147
- )
148
 
149
- st.markdown(
150
- """
151
- <p class="section"><span class="term">Tokenization</span><br>
152
- It is a technique by using which we can convert a huge chunk into small entity where those small entities are known as tokens.
153
- </p>
154
- """,
155
- unsafe_allow_html=True,
156
- )
157
- st.header("Types of Tokenization")
158
  st.markdown("""
159
  <ul class="icon-bullet">
160
  <li>Sentence tokenization</li>
@@ -178,40 +154,25 @@ st.markdown('''
178
  - It is a technique by using which we can convert a huge chunk into small entity where those small entities are known as tokens which are in characters.
179
  ''')
180
 
 
 
 
 
 
181
 
182
- st.markdown(
183
- """
184
- <p class="section"><span class="term">Bag-of-Words (BoW)</span><br>
185
- Bag-of-Words is a simple representation of text data where each word is treated as a feature. The order of words is ignored, and the text is represented by a frequency count of words in the document.
186
- </p>
187
- """,
188
- unsafe_allow_html=True,
189
- )
190
-
191
- st.markdown(
192
- """
193
- <p class="section"><span class="term">TF-IDF (Term Frequency - Inverse Document Frequency)</span><br>
194
- TF-IDF is a statistic used to evaluate the importance of a word in a document relative to all other documents. It balances the frequency of a word in a document with its rarity across the entire dataset.
195
- </p>
196
- """,
197
- unsafe_allow_html=True,
198
- )
199
-
200
- st.markdown(
201
- """
202
- <p class="section"><span class="term">Sentiment Analysis</span><br>
203
- Sentiment Analysis is the task of determining the sentiment or opinion expressed in text. It is often used to analyze social media posts, customer feedback, and reviews to gauge public opinion.
204
- </p>
205
- """,
206
- unsafe_allow_html=True,
207
- )
208
-
209
- st.markdown(
210
- """
211
- <p class="section"><span class="term">Language Model</span><br>
212
- A language model predicts the probability of a sequence of words occurring in a sentence. Popular models include GPT, BERT, and LSTM, which help in text generation, translation, and summarization tasks.
213
- </p>
214
- """,
215
- unsafe_allow_html=True,
216
- )
217
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  "<p class='caption'>Explore essential terms in Natural Language Processing and their meanings!...</p>",
101
  unsafe_allow_html=True,
102
  )
103
+ st.header("Document")
104
+ st.markdown('''
105
+ - A document is defined as a collection of sentences / paragraphs / single words / single characters
106
+ ''')
107
 
108
+ st.header("Paragraph")
109
+ st.markdown('''
110
+ - A paragraph is defined as a collection of sentences.
111
+ ''')
 
 
 
 
112
 
113
+ st.header("Sentence")
114
+ st.markdown('''
115
+ - A sentence is defined as a collection of words.
116
+ ''')
 
 
 
 
117
 
118
+ st.header("Word")
119
+ st.markdown('''
120
+ - Words are defined as collections of characters.
121
+ ''')
 
 
 
 
122
 
123
+ st.header("Character")
124
+ st.markdown('''
125
+ - A character can be a number, an alphabet letter, or a special symbol.
126
+ ''')
 
 
 
 
127
 
128
+ st.header("Tokenization")
129
+ st.markdown('''
130
+ - It is a technique by which we can convert a large chunk of text into small entities, where those small entities are known as tokens.
131
+ ''')
 
 
 
 
132
 
133
+ st.subheader("Types of Tokenization")
 
 
 
 
 
 
 
 
134
  st.markdown("""
135
  <ul class="icon-bullet">
136
  <li>Sentence tokenization</li>
 
154
  - It is a technique by using which we can convert a huge chunk into small entity where those small entities are known as tokens which are in characters.
155
  ''')
156
 
157
+ st.header("Stop Words")
158
+ st.markdown('''
159
+ - They are a set of words that do not have an impact on the meaning of a sentence / paragraph
160
+ - Stop words are used to make the grammar very clear
161
+ ''')
162
 
163
+ st.header("Vectorization")
164
+ st.markdown('''
165
+ - It is a technique that helps us convert text into vector format
166
+ ''')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
 
168
+ st.subheader("Different types of techniques")
169
+ st.markdown("""
170
+ <ul class="icon-bullet">
171
+ <li>One-Hot Vectorization </li>
172
+ <li>Bag of Words</li>
173
+ <li>TF-IDF (Term Frequency and Inverse Document Frequency)</li>
174
+ <li>Word2Vector</li>
175
+ <li>Glove</li>
176
+ <li>Fast text</li>
177
+ </ul>
178
+ """, unsafe_allow_html=True)