Update app.py
Browse files
app.py
CHANGED
|
@@ -1,100 +1,262 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
-
import re
|
| 3 |
-
from collections import Counter
|
| 4 |
-
|
| 5 |
-
# Custom CSS for styling
|
| 6 |
-
st.markdown(
|
| 7 |
-
"""
|
| 8 |
-
<style>
|
| 9 |
-
.stApp {
|
| 10 |
-
background-color: #f4f4f9;
|
| 11 |
-
color: #333333;
|
| 12 |
-
}
|
| 13 |
-
.stTitle {
|
| 14 |
-
text-align: center;
|
| 15 |
-
color: #4CAF50;
|
| 16 |
-
}
|
| 17 |
-
.section-title {
|
| 18 |
-
font-size: 24px;
|
| 19 |
-
color: #FF5722;
|
| 20 |
-
text-decoration: underline;
|
| 21 |
-
}
|
| 22 |
-
</style>
|
| 23 |
-
""",
|
| 24 |
-
unsafe_allow_html=True,
|
| 25 |
-
)
|
| 26 |
-
|
| 27 |
-
st.title("π Explore Natural Language Processing")
|
| 28 |
-
|
| 29 |
-
# Sidebar navigation
|
| 30 |
-
menu = st.sidebar.radio(
|
| 31 |
-
"Explore Topics",
|
| 32 |
-
["Introduction to NLP", "Tokenization", "Word Frequency Analysis", "Stop Words"]
|
| 33 |
-
)
|
| 34 |
-
|
| 35 |
-
# Introduction to NLP
|
| 36 |
-
if menu == "Introduction to NLP":
|
| 37 |
-
st.header("π What is NLP?")
|
| 38 |
-
st.write(
|
| 39 |
-
"""
|
| 40 |
-
Natural Language Processing (NLP) is a field of artificial intelligence that focuses on the interaction
|
| 41 |
-
between computers and human language. The goal is to enable machines to understand, interpret, and generate
|
| 42 |
-
human language in a way that is meaningful.
|
| 43 |
-
|
| 44 |
-
NLP has a wide range of applications, including:
|
| 45 |
-
- **Text Classification:** Categorizing text into predefined categories (e.g., spam detection).
|
| 46 |
-
- **Sentiment Analysis:** Determining the sentiment (positive, negative, neutral) of a piece of text.
|
| 47 |
-
- **Machine Translation:** Translating text from one language to another (e.g., Google Translate).
|
| 48 |
-
- **Named Entity Recognition (NER):** Identifying entities like names, dates, or locations in text.
|
| 49 |
-
- **Question Answering:** Answering questions based on input text or documents.
|
| 50 |
-
"""
|
| 51 |
-
)
|
| 52 |
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
st.
|
| 56 |
-
st.write(
|
| 57 |
-
"""
|
| 58 |
-
Tokenization is the process of breaking a text into smaller units, called tokens. These tokens can be words,
|
| 59 |
-
phrases, or characters.
|
| 60 |
-
"""
|
| 61 |
-
)
|
| 62 |
-
text_input = st.text_area("Enter some text to tokenize:", "Natural Language Processing is fascinating!")
|
| 63 |
-
if text_input:
|
| 64 |
-
tokens = text_input.split()
|
| 65 |
-
st.write("**Tokens:**", tokens)
|
| 66 |
-
|
| 67 |
-
# Word Frequency Analysis
|
| 68 |
-
elif menu == "Word Frequency Analysis":
|
| 69 |
-
st.header("π Word Frequency Analysis")
|
| 70 |
-
st.write(
|
| 71 |
-
"""
|
| 72 |
-
Word frequency analysis involves counting the occurrences of each word in a text. This is useful for
|
| 73 |
-
understanding the most common words in a document or dataset.
|
| 74 |
-
"""
|
| 75 |
-
)
|
| 76 |
-
text_input = st.text_area("Enter some text to analyze:", "NLP is fun. NLP can help with many applications.")
|
| 77 |
-
if text_input:
|
| 78 |
-
words = re.findall(r'\w+', text_input.lower())
|
| 79 |
-
word_counts = Counter(words)
|
| 80 |
-
st.write("**Word Frequency:**")
|
| 81 |
-
st.write(word_counts)
|
| 82 |
-
|
| 83 |
-
# Stop Words
|
| 84 |
-
elif menu == "Stop Words":
|
| 85 |
-
st.header("π« Stop Words")
|
| 86 |
-
st.write(
|
| 87 |
"""
|
| 88 |
-
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
"""
|
| 91 |
)
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
def show_home_page():
    """Render the landing page: an NLP overview plus topic-navigation buttons.

    Each button stores the chosen topic slug in the URL query string
    (``?page=<slug>``) and forces an immediate rerun so the dispatcher at the
    bottom of the module shows the selected page right away.  Without the
    explicit rerun, the dispatcher has already drawn the home page for the
    current script run, so a click would appear to do nothing until the next
    interaction.
    """
    st.title("Comprehensive Guide to Natural Language Processing (NLP)")
    st.markdown(
        """
### Welcome to NLP Guide
Natural Language Processing (NLP) is a branch of artificial intelligence that focuses on the interaction between
computers and humans through natural language. It enables machines to read, understand, and respond to human
language in a way that is both meaningful and useful. NLP powers a wide range of applications like chatbots,
translation tools, sentiment analysis, and search engines.

Use the buttons below to explore each topic in detail.
"""
    )

    # (button label, query-param slug) pairs drive both the button text and the
    # routing; a single loop replaces eight near-identical if-blocks.
    topics = [
        ("NLP Terminologies", "terminologies"),
        ("One-Hot Vectorization", "one_hot"),
        ("Bag of Words", "bow"),
        ("TF-IDF Vectorizer", "tfidf"),
        ("Word2Vec", "word2vec"),
        ("FastText", "fasttext"),
        ("Tokenization", "tokenization"),
        ("Stop Words", "stop_words"),
    ]
    for label, slug in topics:
        if st.button(label):
            st.experimental_set_query_params(page=slug)
            # Navigate immediately; setting the query params alone only takes
            # effect on the *next* rerun, leaving the click visually inert.
            st.experimental_rerun()
|
| 33 |
+
|
| 34 |
+
def show_page(page):
    """Render the detail page identified by *page* (a URL query-param slug).

    The slug-to-content mapping is a plain dict, replacing a long if/elif
    chain.  Content strings are *raw* triple-quoted strings so LaTeX-style
    backslash sequences (e.g. the TF-IDF formula's ``\\text``) are not mangled
    by Python escape processing — in the original, ``\\t`` was silently turned
    into a TAB character.  Unknown slugs get a visible warning instead of a
    blank page.
    """
    # Each entry: slug -> (page title, markdown body).
    pages = {
        "terminologies": (
            "NLP Terminologies",
            r"""
### NLP Terminologies (Detailed Explanation)

- **Tokenization**: Tokenization is the process of breaking text into smaller units like words or sentences.
  For example, the sentence "I love NLP" can be tokenized into words: ["I", "love", "NLP"].

- **Stop Words**: These are common words in a language (e.g., "the", "is", "and") that are often removed
  during preprocessing because they carry little unique information.

- **Stemming**: Stemming reduces words to their root form by removing suffixes. For example, "running" -> "run".
  It may produce non-lexical words (e.g., "better" -> "bett").

- **Lemmatization**: Unlike stemming, lemmatization converts a word to its dictionary base form (e.g., "running" -> "run").

- **Corpus**: A large collection of text used for NLP training and analysis.

- **Vocabulary**: The set of all unique words present in the corpus.

- **n-grams**: Continuous sequences of n items (words or characters) from a text. For example, bigrams from "NLP is fun" are ["NLP is", "is fun"].

- **POS Tagging**: Assigning parts of speech to words, like noun, verb, etc.

- **Named Entity Recognition (NER)**: Identifying entities like names, locations, and organizations in text.

- **Parsing**: Analyzing grammatical structure and relationships between words.
""",
        ),
        "one_hot": (
            "One-Hot Vectorization",
            r"""
### One-Hot Vectorization

One-hot vectorization is a simple representation where each word in the vocabulary is represented as a binary vector.

#### How It Works:
- Each unique word in the corpus is assigned an index.
- The vector for a word is all zeros except for a 1 at the index corresponding to that word.

#### Example:
For a vocabulary ["cat", "dog", "bird"]:
- "cat" -> [1, 0, 0]
- "dog" -> [0, 1, 0]
- "bird" -> [0, 0, 1]

#### Advantages:
- Simple to implement.

#### Limitations:
- High dimensionality for large vocabularies.
- Does not capture semantic relationships (e.g., "cat" and "kitten" are unrelated).

#### Applications:
- Useful for small datasets and when computational simplicity is prioritized.
""",
        ),
        "bow": (
            "Bag of Words (BoW)",
            r"""
### Bag of Words (BoW)

Bag of Words is a method of representing text data as word frequency counts without considering word order.

#### How It Works:
1. Create a vocabulary of all unique words in the text.
2. Count the frequency of each word in a document.

#### Example:
Given two sentences:
- "I love NLP."
- "I love programming."

Vocabulary: ["I", "love", "NLP", "programming"]
- Sentence 1: [1, 1, 1, 0]
- Sentence 2: [1, 1, 0, 1]

#### Advantages:
- Simple to implement.

#### Limitations:
- High dimensionality for large vocabularies.
- Does not consider word order or semantic meaning.
- Sensitive to noise and frequent terms.

#### Applications:
- Text classification and clustering.
""",
        ),
        "tfidf": (
            "TF-IDF Vectorizer",
            # Raw string: keeps \[ \text{...} \] intact (non-raw \t is a TAB).
            r"""
### TF-IDF Vectorizer

Term Frequency-Inverse Document Frequency (TF-IDF) is a statistical measure that evaluates the importance of a word in a document relative to a collection of documents (corpus).

#### Formula:
\[ \text{TF-IDF} = \text{TF} \times \text{IDF} \]

- **Term Frequency (TF)**: Number of times a term appears in a document divided by total terms in the document.
- **Inverse Document Frequency (IDF)**: Logarithm of total documents divided by the number of documents containing the term.

#### Advantages:
- Reduces the weight of common words.
- Highlights unique and important words.

#### Example:
For the corpus:
- Doc1: "NLP is amazing."
- Doc2: "NLP is fun and amazing."

TF-IDF highlights words like "fun" and "amazing" over commonly occurring words like "is".

#### Applications:
- Search engines, information retrieval, and document classification.
""",
        ),
        "word2vec": (
            "Word2Vec",
            r"""
### Word2Vec

Word2Vec is a neural network-based technique for creating dense vector representations of words, capturing their semantic relationships.

#### Key Concepts:
- **CBOW (Continuous Bag of Words)**: Predicts the target word from its context.
- **Skip-gram**: Predicts the context from the target word.

#### Advantages:
- Captures semantic meaning (e.g., "king" - "man" + "woman" ≈ "queen").
- Efficient for large datasets.

#### Training Process:
- Uses shallow neural networks.
- Optimized using techniques like negative sampling.

#### Applications:
- Text classification, sentiment analysis, and recommendation systems.

#### Limitations:
- Requires significant computational resources.
""",
        ),
        "fasttext": (
            "FastText",
            r"""
### FastText

FastText is an extension of Word2Vec that represents words as a combination of character n-grams.

#### Advantages:
- Handles rare and out-of-vocabulary words.
- Captures subword information (e.g., prefixes and suffixes).

#### Example:
The word "playing" might be represented by n-grams like "pla", "lay", "ayi", "ing".

#### Applications:
- Multilingual text processing.
- Handling noisy and incomplete data.

#### Limitations:
- Higher computational cost compared to Word2Vec.
""",
        ),
        "tokenization": (
            "Tokenization",
            r"""
### Tokenization

Tokenization is the process of breaking text into smaller units (tokens) such as words, phrases, or sentences.

#### Types of Tokenization:
- **Word Tokenization**: Splits text into words.
- **Sentence Tokenization**: Splits text into sentences.

#### Libraries for Tokenization:
- NLTK, SpaCy, and Hugging Face Transformers.

#### Example:
Sentence: "NLP is exciting."
- Word Tokens: ["NLP", "is", "exciting", "."]

#### Applications:
- Preprocessing for machine learning models.

#### Challenges:
- Handling complex text like abbreviations and multilingual data.
""",
        ),
        "stop_words": (
            "Stop Words",
            r"""
### Stop Words

Stop words are commonly used words in a language that are often removed during text preprocessing.

#### Examples of Stop Words:
- English: "is", "the", "and", "in".
- Spanish: "es", "el", "y", "en".

#### Why Remove Stop Words?
- To reduce noise in text data.

#### Applications:
- Sentiment analysis, text classification, and search engines.

#### Challenges:
- Some stop words might carry context-specific importance.
""",
        ),
    }

    entry = pages.get(page)
    if entry is None:
        # Previously an unrecognized slug rendered nothing at all.
        st.warning(f"Unknown page: {page}")
        return
    title, body = entry
    st.title(title)
    st.markdown(body)
|
| 255 |
+
|
| 256 |
+
# --- Router: pick the page from the URL query string and render it. ---
# experimental_get_query_params() returns {name: [values]}; take the first
# "page" value, defaulting to the home page when the param is absent.
current_page = st.experimental_get_query_params().get("page", ["home"])[0]

if current_page == "home":
    show_home_page()
else:
    show_page(current_page)
|