Update app.py
Browse files
app.py
CHANGED
|
@@ -1,100 +1,262 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
-
import re
|
| 3 |
-
from collections import Counter
|
| 4 |
-
|
| 5 |
-
# Custom CSS for styling
|
| 6 |
-
st.markdown(
|
| 7 |
-
"""
|
| 8 |
-
<style>
|
| 9 |
-
.stApp {
|
| 10 |
-
background-color: #f4f4f9;
|
| 11 |
-
color: #333333;
|
| 12 |
-
}
|
| 13 |
-
.stTitle {
|
| 14 |
-
text-align: center;
|
| 15 |
-
color: #4CAF50;
|
| 16 |
-
}
|
| 17 |
-
.section-title {
|
| 18 |
-
font-size: 24px;
|
| 19 |
-
color: #FF5722;
|
| 20 |
-
text-decoration: underline;
|
| 21 |
-
}
|
| 22 |
-
</style>
|
| 23 |
-
""",
|
| 24 |
-
unsafe_allow_html=True,
|
| 25 |
-
)
|
| 26 |
-
|
| 27 |
-
st.title("π Explore Natural Language Processing")
|
| 28 |
-
|
| 29 |
-
# Sidebar navigation
|
| 30 |
-
menu = st.sidebar.radio(
|
| 31 |
-
"Explore Topics",
|
| 32 |
-
["Introduction to NLP", "Tokenization", "Word Frequency Analysis", "Stop Words"]
|
| 33 |
-
)
|
| 34 |
-
|
| 35 |
-
# Introduction to NLP
|
| 36 |
-
if menu == "Introduction to NLP":
|
| 37 |
-
st.header("π What is NLP?")
|
| 38 |
-
st.write(
|
| 39 |
-
"""
|
| 40 |
-
Natural Language Processing (NLP) is a field of artificial intelligence that focuses on the interaction
|
| 41 |
-
between computers and human language. The goal is to enable machines to understand, interpret, and generate
|
| 42 |
-
human language in a way that is meaningful.
|
| 43 |
-
|
| 44 |
-
NLP has a wide range of applications, including:
|
| 45 |
-
- **Text Classification:** Categorizing text into predefined categories (e.g., spam detection).
|
| 46 |
-
- **Sentiment Analysis:** Determining the sentiment (positive, negative, neutral) of a piece of text.
|
| 47 |
-
- **Machine Translation:** Translating text from one language to another (e.g., Google Translate).
|
| 48 |
-
- **Named Entity Recognition (NER):** Identifying entities like names, dates, or locations in text.
|
| 49 |
-
- **Question Answering:** Answering questions based on input text or documents.
|
| 50 |
-
"""
|
| 51 |
-
)
|
| 52 |
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
st.
|
| 56 |
-
st.write(
|
| 57 |
-
"""
|
| 58 |
-
Tokenization is the process of breaking a text into smaller units, called tokens. These tokens can be words,
|
| 59 |
-
phrases, or characters.
|
| 60 |
-
"""
|
| 61 |
-
)
|
| 62 |
-
text_input = st.text_area("Enter some text to tokenize:", "Natural Language Processing is fascinating!")
|
| 63 |
-
if text_input:
|
| 64 |
-
tokens = text_input.split()
|
| 65 |
-
st.write("**Tokens:**", tokens)
|
| 66 |
-
|
| 67 |
-
# Word Frequency Analysis
|
| 68 |
-
elif menu == "Word Frequency Analysis":
|
| 69 |
-
st.header("π Word Frequency Analysis")
|
| 70 |
-
st.write(
|
| 71 |
-
"""
|
| 72 |
-
Word frequency analysis involves counting the occurrences of each word in a text. This is useful for
|
| 73 |
-
understanding the most common words in a document or dataset.
|
| 74 |
-
"""
|
| 75 |
-
)
|
| 76 |
-
text_input = st.text_area("Enter some text to analyze:", "NLP is fun. NLP can help with many applications.")
|
| 77 |
-
if text_input:
|
| 78 |
-
words = re.findall(r'\w+', text_input.lower())
|
| 79 |
-
word_counts = Counter(words)
|
| 80 |
-
st.write("**Word Frequency:**")
|
| 81 |
-
st.write(word_counts)
|
| 82 |
-
|
| 83 |
-
# Stop Words
|
| 84 |
-
elif menu == "Stop Words":
|
| 85 |
-
st.header("π« Stop Words")
|
| 86 |
-
st.write(
|
| 87 |
"""
|
| 88 |
-
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
"""
|
| 91 |
)
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
def show_home_page():
    """Render the landing page: an NLP overview plus topic-navigation buttons.

    Each button stores the chosen topic slug in the URL query string
    (``?page=<slug>``) and forces an immediate rerun so the dispatcher at the
    bottom of the module shows the selected page right away.  Without the
    explicit rerun, the dispatcher has already drawn the home page for the
    current script run, so a click would appear to do nothing until the next
    interaction.
    """
    st.title("Comprehensive Guide to Natural Language Processing (NLP)")
    st.markdown(
        """
### Welcome to NLP Guide
Natural Language Processing (NLP) is a branch of artificial intelligence that focuses on the interaction between
computers and humans through natural language. It enables machines to read, understand, and respond to human
language in a way that is both meaningful and useful. NLP powers a wide range of applications like chatbots,
translation tools, sentiment analysis, and search engines.

Use the buttons below to explore each topic in detail.
"""
    )

    # (button label, query-param slug) pairs drive both the button text and the
    # routing; a single loop replaces eight near-identical if-blocks.
    topics = [
        ("NLP Terminologies", "terminologies"),
        ("One-Hot Vectorization", "one_hot"),
        ("Bag of Words", "bow"),
        ("TF-IDF Vectorizer", "tfidf"),
        ("Word2Vec", "word2vec"),
        ("FastText", "fasttext"),
        ("Tokenization", "tokenization"),
        ("Stop Words", "stop_words"),
    ]
    for label, slug in topics:
        if st.button(label):
            st.experimental_set_query_params(page=slug)
            # Navigate immediately; setting the query params alone only takes
            # effect on the *next* rerun, leaving the click visually inert.
            st.experimental_rerun()
|
| 33 |
+
|
| 34 |
+
def show_page(page):
    """Render the detail page identified by *page* (a URL query-param slug).

    The slug-to-content mapping is a plain dict, replacing a long if/elif
    chain.  Content strings are *raw* triple-quoted strings so LaTeX-style
    backslash sequences (e.g. the TF-IDF formula's ``\\text``) are not mangled
    by Python escape processing — in the original, ``\\t`` was silently turned
    into a TAB character.  Unknown slugs get a visible warning instead of a
    blank page.
    """
    # Each entry: slug -> (page title, markdown body).
    pages = {
        "terminologies": (
            "NLP Terminologies",
            r"""
### NLP Terminologies (Detailed Explanation)

- **Tokenization**: Tokenization is the process of breaking text into smaller units like words or sentences.
  For example, the sentence "I love NLP" can be tokenized into words: ["I", "love", "NLP"].

- **Stop Words**: These are common words in a language (e.g., "the", "is", "and") that are often removed
  during preprocessing because they carry little unique information.

- **Stemming**: Stemming reduces words to their root form by removing suffixes. For example, "running" -> "run".
  It may produce non-lexical words (e.g., "better" -> "bett").

- **Lemmatization**: Unlike stemming, lemmatization converts a word to its dictionary base form (e.g., "running" -> "run").

- **Corpus**: A large collection of text used for NLP training and analysis.

- **Vocabulary**: The set of all unique words present in the corpus.

- **n-grams**: Continuous sequences of n items (words or characters) from a text. For example, bigrams from "NLP is fun" are ["NLP is", "is fun"].

- **POS Tagging**: Assigning parts of speech to words, like noun, verb, etc.

- **Named Entity Recognition (NER)**: Identifying entities like names, locations, and organizations in text.

- **Parsing**: Analyzing grammatical structure and relationships between words.
""",
        ),
        "one_hot": (
            "One-Hot Vectorization",
            r"""
### One-Hot Vectorization

One-hot vectorization is a simple representation where each word in the vocabulary is represented as a binary vector.

#### How It Works:
- Each unique word in the corpus is assigned an index.
- The vector for a word is all zeros except for a 1 at the index corresponding to that word.

#### Example:
For a vocabulary ["cat", "dog", "bird"]:
- "cat" -> [1, 0, 0]
- "dog" -> [0, 1, 0]
- "bird" -> [0, 0, 1]

#### Advantages:
- Simple to implement.

#### Limitations:
- High dimensionality for large vocabularies.
- Does not capture semantic relationships (e.g., "cat" and "kitten" are unrelated).

#### Applications:
- Useful for small datasets and when computational simplicity is prioritized.
""",
        ),
        "bow": (
            "Bag of Words (BoW)",
            r"""
### Bag of Words (BoW)

Bag of Words is a method of representing text data as word frequency counts without considering word order.

#### How It Works:
1. Create a vocabulary of all unique words in the text.
2. Count the frequency of each word in a document.

#### Example:
Given two sentences:
- "I love NLP."
- "I love programming."

Vocabulary: ["I", "love", "NLP", "programming"]
- Sentence 1: [1, 1, 1, 0]
- Sentence 2: [1, 1, 0, 1]

#### Advantages:
- Simple to implement.

#### Limitations:
- High dimensionality for large vocabularies.
- Does not consider word order or semantic meaning.
- Sensitive to noise and frequent terms.

#### Applications:
- Text classification and clustering.
""",
        ),
        "tfidf": (
            "TF-IDF Vectorizer",
            # Raw string: keeps \[ \text{...} \] intact (non-raw \t is a TAB).
            r"""
### TF-IDF Vectorizer

Term Frequency-Inverse Document Frequency (TF-IDF) is a statistical measure that evaluates the importance of a word in a document relative to a collection of documents (corpus).

#### Formula:
\[ \text{TF-IDF} = \text{TF} \times \text{IDF} \]

- **Term Frequency (TF)**: Number of times a term appears in a document divided by total terms in the document.
- **Inverse Document Frequency (IDF)**: Logarithm of total documents divided by the number of documents containing the term.

#### Advantages:
- Reduces the weight of common words.
- Highlights unique and important words.

#### Example:
For the corpus:
- Doc1: "NLP is amazing."
- Doc2: "NLP is fun and amazing."

TF-IDF highlights words like "fun" and "amazing" over commonly occurring words like "is".

#### Applications:
- Search engines, information retrieval, and document classification.
""",
        ),
        "word2vec": (
            "Word2Vec",
            r"""
### Word2Vec

Word2Vec is a neural network-based technique for creating dense vector representations of words, capturing their semantic relationships.

#### Key Concepts:
- **CBOW (Continuous Bag of Words)**: Predicts the target word from its context.
- **Skip-gram**: Predicts the context from the target word.

#### Advantages:
- Captures semantic meaning (e.g., "king" - "man" + "woman" ≈ "queen").
- Efficient for large datasets.

#### Training Process:
- Uses shallow neural networks.
- Optimized using techniques like negative sampling.

#### Applications:
- Text classification, sentiment analysis, and recommendation systems.

#### Limitations:
- Requires significant computational resources.
""",
        ),
        "fasttext": (
            "FastText",
            r"""
### FastText

FastText is an extension of Word2Vec that represents words as a combination of character n-grams.

#### Advantages:
- Handles rare and out-of-vocabulary words.
- Captures subword information (e.g., prefixes and suffixes).

#### Example:
The word "playing" might be represented by n-grams like "pla", "lay", "ayi", "ing".

#### Applications:
- Multilingual text processing.
- Handling noisy and incomplete data.

#### Limitations:
- Higher computational cost compared to Word2Vec.
""",
        ),
        "tokenization": (
            "Tokenization",
            r"""
### Tokenization

Tokenization is the process of breaking text into smaller units (tokens) such as words, phrases, or sentences.

#### Types of Tokenization:
- **Word Tokenization**: Splits text into words.
- **Sentence Tokenization**: Splits text into sentences.

#### Libraries for Tokenization:
- NLTK, SpaCy, and Hugging Face Transformers.

#### Example:
Sentence: "NLP is exciting."
- Word Tokens: ["NLP", "is", "exciting", "."]

#### Applications:
- Preprocessing for machine learning models.

#### Challenges:
- Handling complex text like abbreviations and multilingual data.
""",
        ),
        "stop_words": (
            "Stop Words",
            r"""
### Stop Words

Stop words are commonly used words in a language that are often removed during text preprocessing.

#### Examples of Stop Words:
- English: "is", "the", "and", "in".
- Spanish: "es", "el", "y", "en".

#### Why Remove Stop Words?
- To reduce noise in text data.

#### Applications:
- Sentiment analysis, text classification, and search engines.

#### Challenges:
- Some stop words might carry context-specific importance.
""",
        ),
    }

    entry = pages.get(page)
    if entry is None:
        # Previously an unrecognized slug rendered nothing at all.
        st.warning(f"Unknown page: {page}")
        return
    title, body = entry
    st.title(title)
    st.markdown(body)
|
| 255 |
+
|
| 256 |
+
# --- Router: pick the page from the URL query string and render it. ---
# experimental_get_query_params() returns {name: [values]}; take the first
# "page" value, defaulting to the home page when the param is absent.
current_page = st.experimental_get_query_params().get("page", ["home"])[0]

if current_page == "home":
    show_home_page()
else:
    show_page(current_page)
|