Phani1008 committed on
Commit
a5e320e
·
verified ·
1 Parent(s): 507b9f3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +257 -95
app.py CHANGED
@@ -1,100 +1,262 @@
1
# Legacy version of app.py (the removed side of this diff): a single-page
# Streamlit NLP explainer driven by a sidebar radio selector.
import streamlit as st
import re
from collections import Counter

# Custom CSS for styling
# NOTE(review): injected via unsafe_allow_html; the class names (.stTitle,
# .section-title) are only effective if Streamlit emits matching elements —
# confirm against the Streamlit version in use.
st.markdown(
    """
    <style>
    .stApp {
        background-color: #f4f4f9;
        color: #333333;
    }
    .stTitle {
        text-align: center;
        color: #4CAF50;
    }
    .section-title {
        font-size: 24px;
        color: #FF5722;
        text-decoration: underline;
    }
    </style>
    """,
    unsafe_allow_html=True,
)

st.title("πŸ”Ž Explore Natural Language Processing")

# Sidebar navigation: the selected label drives the if/elif dispatch below.
menu = st.sidebar.radio(
    "Explore Topics",
    ["Introduction to NLP", "Tokenization", "Word Frequency Analysis", "Stop Words"]
)

# Introduction to NLP
if menu == "Introduction to NLP":
    st.header("🌟 What is NLP?")
    st.write(
        """
        Natural Language Processing (NLP) is a field of artificial intelligence that focuses on the interaction
        between computers and human language. The goal is to enable machines to understand, interpret, and generate
        human language in a way that is meaningful.

        NLP has a wide range of applications, including:
        - **Text Classification:** Categorizing text into predefined categories (e.g., spam detection).
        - **Sentiment Analysis:** Determining the sentiment (positive, negative, neutral) of a piece of text.
        - **Machine Translation:** Translating text from one language to another (e.g., Google Translate).
        - **Named Entity Recognition (NER):** Identifying entities like names, dates, or locations in text.
        - **Question Answering:** Answering questions based on input text or documents.
        """
    )

# Tokenization
elif menu == "Tokenization":
    st.header("πŸ”  Tokenization")
    st.write(
        """
        Tokenization is the process of breaking a text into smaller units, called tokens. These tokens can be words,
        phrases, or characters.
        """
    )
    text_input = st.text_area("Enter some text to tokenize:", "Natural Language Processing is fascinating!")
    if text_input:
        # Whitespace-only tokenization; punctuation stays attached to words.
        tokens = text_input.split()
        st.write("**Tokens:**", tokens)

# Word Frequency Analysis
elif menu == "Word Frequency Analysis":
    st.header("πŸ“Š Word Frequency Analysis")
    st.write(
        """
        Word frequency analysis involves counting the occurrences of each word in a text. This is useful for
        understanding the most common words in a document or dataset.
        """
    )
    text_input = st.text_area("Enter some text to analyze:", "NLP is fun. NLP can help with many applications.")
    if text_input:
        # Lowercase + \w+ so counts are case-insensitive and punctuation-free.
        words = re.findall(r'\w+', text_input.lower())
        word_counts = Counter(words)
        st.write("**Word Frequency:**")
        st.write(word_counts)

# Stop Words
elif menu == "Stop Words":
    st.header("🚫 Stop Words")
    st.write(
        """
        Stop words are common words that usually carry less meaning and are often removed from text analysis
        (e.g., 'is', 'and', 'the'). This helps in focusing on the meaningful parts of the text.
        """
    )
    text_input = st.text_area("Enter some text to remove stop words:", "The quick brown fox jumps over the lazy dog.")
    # Small hand-rolled stop-word list; comparison below is case-insensitive.
    stop_words = {"the", "is", "and", "in", "to", "a", "of", "on", "for"}
    if text_input:
        words = text_input.split()
        filtered_words = [word for word in words if word.lower() not in stop_words]
        st.write("**Filtered Text:**", " ".join(filtered_words))

# Footer
st.sidebar.info("Select a topic to learn about NLP!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
def show_home_page():
    """Render the landing page: an intro blurb plus one navigation button per topic.

    Side effects: writes to the Streamlit page; on a button click, sets the
    ``page`` query parameter and forces an immediate rerun so the selected
    topic renders right away.
    """
    st.title("Comprehensive Guide to Natural Language Processing (NLP)")
    st.markdown(
        """
        ### Welcome to NLP Guide
        Natural Language Processing (NLP) is a branch of artificial intelligence that focuses on the interaction between
        computers and humans through natural language. It enables machines to read, understand, and respond to human
        language in a way that is both meaningful and useful. NLP powers a wide range of applications like chatbots,
        translation tools, sentiment analysis, and search engines.

        Use the buttons below to explore each topic in detail.
        """
    )

    # (button label, query-parameter value) pairs — one button per topic.
    topics = [
        ("NLP Terminologies", "terminologies"),
        ("One-Hot Vectorization", "one_hot"),
        ("Bag of Words", "bow"),
        ("TF-IDF Vectorizer", "tfidf"),
        ("Word2Vec", "word2vec"),
        ("FastText", "fasttext"),
        ("Tokenization", "tokenization"),
        ("Stop Words", "stop_words"),
    ]
    for label, target in topics:
        if st.button(label):
            st.experimental_set_query_params(page=target)
            # BUG FIX: the router at the bottom of the script read the query
            # params *before* this click was processed, so without an explicit
            # rerun the chosen page would only appear after the NEXT
            # interaction. Rerun now so the selection takes effect immediately.
            st.experimental_rerun()
33
+
34
def show_page(page):
    """Render the detail view for a single NLP topic.

    Parameters
    ----------
    page : str
        The ``page`` query-parameter value identifying the topic
        (e.g. ``"bow"``, ``"tfidf"``). An unrecognized value shows an
        explicit error instead of silently rendering a blank page.
    """
    # Topic registry: query-param key -> (page title, markdown body).
    # A single table replaces the original eight-branch if/elif chain of
    # identical title/markdown call pairs; adding a topic is one entry here.
    topics = {
        "terminologies": (
            "NLP Terminologies",
            """
            ### NLP Terminologies (Detailed Explanation)

            - **Tokenization**: Tokenization is the process of breaking text into smaller units like words or sentences.
              For example, the sentence "I love NLP" can be tokenized into words: ["I", "love", "NLP"].

            - **Stop Words**: These are common words in a language (e.g., "the", "is", "and") that are often removed
              during preprocessing because they carry little unique information.

            - **Stemming**: Stemming reduces words to their root form by removing suffixes. For example, "running" -> "run".
              It may produce non-lexical words (e.g., "better" -> "bett").

            - **Lemmatization**: Unlike stemming, lemmatization converts a word to its dictionary base form (e.g., "running" -> "run").

            - **Corpus**: A large collection of text used for NLP training and analysis.

            - **Vocabulary**: The set of all unique words present in the corpus.

            - **n-grams**: Continuous sequences of n items (words or characters) from a text. For example, bigrams from "NLP is fun" are ["NLP is", "is fun"].

            - **POS Tagging**: Assigning parts of speech to words, like noun, verb, etc.

            - **Named Entity Recognition (NER)**: Identifying entities like names, locations, and organizations in text.

            - **Parsing**: Analyzing grammatical structure and relationships between words.
            """,
        ),
        "one_hot": (
            "One-Hot Vectorization",
            """
            ### One-Hot Vectorization

            One-hot vectorization is a simple representation where each word in the vocabulary is represented as a binary vector.

            #### How It Works:
            - Each unique word in the corpus is assigned an index.
            - The vector for a word is all zeros except for a 1 at the index corresponding to that word.

            #### Example:
            For a vocabulary ["cat", "dog", "bird"]:
            - "cat" -> [1, 0, 0]
            - "dog" -> [0, 1, 0]
            - "bird" -> [0, 0, 1]

            #### Advantages:
            - Simple to implement.

            #### Limitations:
            - High dimensionality for large vocabularies.
            - Does not capture semantic relationships (e.g., "cat" and "kitten" are unrelated).

            #### Applications:
            - Useful for small datasets and when computational simplicity is prioritized.
            """,
        ),
        "bow": (
            "Bag of Words (BoW)",
            """
            ### Bag of Words (BoW)

            Bag of Words is a method of representing text data as word frequency counts without considering word order.

            #### How It Works:
            1. Create a vocabulary of all unique words in the text.
            2. Count the frequency of each word in a document.

            #### Example:
            Given two sentences:
            - "I love NLP."
            - "I love programming."

            Vocabulary: ["I", "love", "NLP", "programming"]
            - Sentence 1: [1, 1, 1, 0]
            - Sentence 2: [1, 1, 0, 1]

            #### Advantages:
            - Simple to implement.

            #### Limitations:
            - High dimensionality for large vocabularies.
            - Does not consider word order or semantic meaning.
            - Sensitive to noise and frequent terms.

            #### Applications:
            - Text classification and clustering.
            """,
        ),
        # BUG FIX: this body must be a RAW string — it contains LaTeX commands
        # like \text and \times, and in a normal string "\t" is a TAB escape,
        # which garbled the rendered formula.
        "tfidf": (
            "TF-IDF Vectorizer",
            r"""
            ### TF-IDF Vectorizer

            Term Frequency-Inverse Document Frequency (TF-IDF) is a statistical measure that evaluates the importance of a word in a document relative to a collection of documents (corpus).

            #### Formula:
            \[ \text{TF-IDF} = \text{TF} \times \text{IDF} \]

            - **Term Frequency (TF)**: Number of times a term appears in a document divided by total terms in the document.
            - **Inverse Document Frequency (IDF)**: Logarithm of total documents divided by the number of documents containing the term.

            #### Advantages:
            - Reduces the weight of common words.
            - Highlights unique and important words.

            #### Example:
            For the corpus:
            - Doc1: "NLP is amazing."
            - Doc2: "NLP is fun and amazing."

            TF-IDF highlights words like "fun" and "amazing" over commonly occurring words like "is".

            #### Applications:
            - Search engines, information retrieval, and document classification.
            """,
        ),
        "word2vec": (
            "Word2Vec",
            """
            ### Word2Vec

            Word2Vec is a neural network-based technique for creating dense vector representations of words, capturing their semantic relationships.

            #### Key Concepts:
            - **CBOW (Continuous Bag of Words)**: Predicts the target word from its context.
            - **Skip-gram**: Predicts the context from the target word.

            #### Advantages:
            - Captures semantic meaning (e.g., "king" - "man" + "woman" ≈ "queen").
            - Efficient for large datasets.

            #### Training Process:
            - Uses shallow neural networks.
            - Optimized using techniques like negative sampling.

            #### Applications:
            - Text classification, sentiment analysis, and recommendation systems.

            #### Limitations:
            - Requires significant computational resources.
            """,
        ),
        "fasttext": (
            "FastText",
            """
            ### FastText

            FastText is an extension of Word2Vec that represents words as a combination of character n-grams.

            #### Advantages:
            - Handles rare and out-of-vocabulary words.
            - Captures subword information (e.g., prefixes and suffixes).

            #### Example:
            The word "playing" might be represented by n-grams like "pla", "lay", "ayi", "ing".

            #### Applications:
            - Multilingual text processing.
            - Handling noisy and incomplete data.

            #### Limitations:
            - Higher computational cost compared to Word2Vec.
            """,
        ),
        "tokenization": (
            "Tokenization",
            """
            ### Tokenization

            Tokenization is the process of breaking text into smaller units (tokens) such as words, phrases, or sentences.

            #### Types of Tokenization:
            - **Word Tokenization**: Splits text into words.
            - **Sentence Tokenization**: Splits text into sentences.

            #### Libraries for Tokenization:
            - NLTK, SpaCy, and Hugging Face Transformers.

            #### Example:
            Sentence: "NLP is exciting."
            - Word Tokens: ["NLP", "is", "exciting", "."]

            #### Applications:
            - Preprocessing for machine learning models.

            #### Challenges:
            - Handling complex text like abbreviations and multilingual data.
            """,
        ),
        "stop_words": (
            "Stop Words",
            """
            ### Stop Words

            Stop words are commonly used words in a language that are often removed during text preprocessing.

            #### Examples of Stop Words:
            - English: "is", "the", "and", "in".
            - Spanish: "es", "el", "y", "en".

            #### Why Remove Stop Words?
            - To reduce noise in text data.

            #### Applications:
            - Sentiment analysis, text classification, and search engines.

            #### Challenges:
            - Some stop words might carry context-specific importance.
            """,
        ),
    }

    entry = topics.get(page)
    if entry is None:
        # ROBUSTNESS: the original if/elif chain rendered nothing for an
        # unknown/mistyped query parameter; fail loudly instead.
        st.error(f"Unknown topic: {page!r}. Use the home page buttons to navigate.")
        return

    title, body = entry
    st.title(title)
    st.markdown(body)
255
+
256
# --- Router ---------------------------------------------------------------
# Streamlit returns query parameters as {name: [values]}; take the first
# "page" value, falling back to the home page when the parameter is absent.
query_params = st.experimental_get_query_params()
page = query_params.get("page", ["home"])[0]

# Dispatch: the home page is special-cased; every other value is handed to
# the per-topic renderer.
show_home_page() if page == "home" else show_page(page)