Spaces: Sleeping
Update app.py
#1
by Mpavan45 - opened
app.py
CHANGED
|
@@ -306,255 +306,256 @@ elif selected_page == "🔄 Lifecycle of NLP":
|
|
| 306 |
|
| 307 |
elif selected_page == "⚙️ NLP Techniques":
|
| 308 |
st.header("⚙️ NLP Techniques")
|
| 309 |
-
if selected_subpoint
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
""")
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
**Example:**
|
| 323 |
-
- Input: `["running", "runner", "runs"]`
|
| 324 |
-
- Output: `["run", "runner", "run"]` (Porter Stemmer)
|
| 325 |
-
|
| 326 |
-
**Key Points:**
|
| 327 |
-
- **Fast** and **simple**, but can lead to over-stemming or under-stemming.
|
| 328 |
-
- Example of over-stemming: `"generous"` → `"gener"`
|
| 329 |
-
|
| 330 |
-
**Code Example:**
|
| 331 |
-
```python
|
| 332 |
-
from nltk.stem import PorterStemmer
|
| 333 |
-
|
| 334 |
-
stemmer = PorterStemmer()
|
| 335 |
-
words = ["running", "runner", "runs"]
|
| 336 |
-
print([stemmer.stem(word) for word in words])
|
| 337 |
-
# Output: ['run', 'runner', 'run']
|
| 338 |
-
```
|
| 339 |
-
""")
|
| 340 |
-
|
| 341 |
-
elif selected_subpoint == " Lemmatization":
|
| 342 |
-
st.write("### ๐ฟ Lemmatization")
|
| 343 |
-
st.write("""
|
| 344 |
-
Lemmatization reduces words to their dictionary base form (lemma), ensuring grammatical correctness.
|
| 345 |
-
|
| 346 |
-
**Example:**
|
| 347 |
-
- Input: `["running", "ran", "better"]`
|
| 348 |
-
- Output: `["run", "run", "good"]`
|
| 349 |
-
|
| 350 |
-
**Key Points:**
|
| 351 |
-
- Context-aware and accurate.
|
| 352 |
-
- More computationally intensive than stemming.
|
| 353 |
-
|
| 354 |
-
**Code Example:**
|
| 355 |
-
```python
|
| 356 |
-
from nltk.stem import WordNetLemmatizer
|
| 357 |
-
|
| 358 |
-
lemmatizer = WordNetLemmatizer()
|
| 359 |
-
words = ["running", "ran", "better"]
|
| 360 |
-
print([lemmatizer.lemmatize(word, pos="v") for word in words])  # ['run', 'run', 'better']; "better" needs pos="a" to yield "good"
|
| 361 |
-
```
|
| 362 |
-
""")
|
| 363 |
-
elif selected_subpoint == " stop Words":
|
| 364 |
-
st.write("### ๐ซ Stop Words")
|
| 365 |
-
st.write("""
|
| 366 |
-
Stop words are common words (e.g., *the*, *is*) that are removed during text processing as they don't add much meaning.
|
| 367 |
-
|
| 368 |
-
**Example:**
|
| 369 |
-
- Input: `"This is a simple sentence."`
|
| 370 |
-
- Output: `"simple sentence"`
|
| 371 |
-
|
| 372 |
-
**Code Example:**
|
| 373 |
-
```python
|
| 374 |
-
from nltk.corpus import stopwords
|
| 375 |
-
|
| 376 |
-
stop_words = set(stopwords.words("english"))
|
| 377 |
-
sentence = "This is a simple sentence."
|
| 378 |
-
filtered_sentence = [word for word in sentence.split() if word.lower() not in stop_words]
|
| 379 |
-
print(filtered_sentence)  # Output: ['simple', 'sentence.'] (split() keeps the trailing period)
|
| 380 |
-
```
|
| 381 |
-
""")
|
| 382 |
-
|
| 383 |
-
elif selected_subpoint == " One Hot Encoding":
|
| 384 |
-
st.write("### ๐ฅ One-Hot Encoding")
|
| 385 |
-
st.write("""
|
| 386 |
-
Representing categorical data as binary vectors to make it suitable for machine learning models.
|
| 387 |
-
|
| 388 |
-
**How it works:**
|
| 389 |
-
- Each unique category is assigned a unique binary vector.
|
| 390 |
-
- A binary vector has all values as `0` except for the position representing the category, which is `1`.
|
| 391 |
-
|
| 392 |
-
**Example:**
|
| 393 |
-
- Categories: `["Apple", "Banana", "Cherry"]`
|
| 394 |
-
- Encoding:
|
| 395 |
-
- `Apple`: `[1, 0, 0]`
|
| 396 |
-
- `Banana`: `[0, 1, 0]`
|
| 397 |
-
- `Cherry`: `[0, 0, 1]`
|
| 398 |
-
""")
|
| 399 |
-
|
| 400 |
-
st.write("### ๐ Example with Fruits")
|
| 401 |
-
st.write("""
|
| 402 |
-
**Input Categories:** `["Apple", "Banana", "Cherry", "Banana", "Apple"]`
|
| 403 |
-
**Output (One-Hot Encoding):**
|
| 404 |
-
- `Apple`: `[1, 0, 0]`
|
| 405 |
-
- `Banana`: `[0, 1, 0]`
|
| 406 |
-
- `Cherry`: `[0, 0, 1]`
|
| 407 |
-
- `Banana`: `[0, 1, 0]`
|
| 408 |
-
- `Apple`: `[1, 0, 0]`
|
| 409 |
-
""")
|
| 410 |
-
|
| 411 |
-
elif selected_subpoint == " Bag Of Words":
|
| 412 |
-
st.write("### ๐ Bag of Words (BoW)")
|
| 413 |
-
st.write("""
|
| 414 |
-
Bag of Words converts text into a matrix of word frequencies, where each word is represented by a unique index in the vocabulary.
|
| 415 |
-
|
| 416 |
-
**Example:**
|
| 417 |
-
- Input: `["I love NLP", "NLP is fun"]`
|
| 418 |
-
- Vocabulary: `["I", "love", "NLP", "is", "fun"]`
|
| 419 |
-
- BoW Matrix:
|
| 420 |
-
- `"I love NLP"`: `[1, 1, 1, 0, 0]`
|
| 421 |
-
- `"NLP is fun"`: `[0, 0, 1, 1, 1]`
|
| 422 |
-
|
| 423 |
-
**Code Example:**
|
| 424 |
-
```python
|
| 425 |
-
from sklearn.feature_extraction.text import CountVectorizer
|
| 426 |
-
|
| 427 |
-
documents = ["I love NLP", "NLP is fun"]
|
| 428 |
-
vectorizer = CountVectorizer()
|
| 429 |
-
bow_matrix = vectorizer.fit_transform(documents)
|
| 430 |
-
|
| 431 |
-
print(bow_matrix.toarray())  # Output: [[0 0 1 1], [1 1 0 1]] for vocabulary ['fun', 'is', 'love', 'nlp']; the default tokenizer drops one-letter words like "I"
|
| 432 |
-
```
|
| 433 |
-
""")
|
| 434 |
-
|
| 435 |
-
elif selected_subpoint == " Binary Bag Of Words":
|
| 436 |
-
st.write("### ๐ฒ Binary Bag of Words")
|
| 437 |
-
st.write("""
|
| 438 |
-
Binary Bag of Words is a variation of the BoW model where each word is represented by `1` if present in the document and `0` if absent, ignoring word frequencies.
|
| 439 |
-
|
| 440 |
-
**Example:**
|
| 441 |
-
- Input: `["I love NLP", "NLP is fun"]`
|
| 442 |
-
- Vocabulary: `["I", "love", "NLP", "is", "fun"]`
|
| 443 |
-
- Binary BoW Matrix:
|
| 444 |
-
- `"I love NLP"`: `[1, 1, 1, 0, 0]`
|
| 445 |
-
- `"NLP is fun"`: `[0, 0, 1, 1, 1]`
|
| 446 |
-
|
| 447 |
-
**Code Example:**
|
| 448 |
-
```python
|
| 449 |
-
from sklearn.feature_extraction.text import CountVectorizer
|
| 450 |
-
|
| 451 |
-
documents = ["I love NLP", "NLP is fun"]
|
| 452 |
-
vectorizer = CountVectorizer(binary=True)
|
| 453 |
-
binary_bow_matrix = vectorizer.fit_transform(documents)
|
| 454 |
-
|
| 455 |
-
print(binary_bow_matrix.toarray())  # Output: [[0 0 1 1], [1 1 0 1]] for vocabulary ['fun', 'is', 'love', 'nlp']; "I" is dropped by the default tokenizer
|
| 456 |
-
```
|
| 457 |
-
""")
|
| 458 |
-
|
| 459 |
-
elif selected_subpoint == " TF-IDF":
|
| 460 |
-
st.write("### ๐งฎ TF-IDF (Term Frequency - Inverse Document Frequency)")
|
| 461 |
-
st.write("""
|
| 462 |
-
TF-IDF is a statistical measure used to evaluate how important a word is to a document in a collection or corpus. It considers two factors:
|
| 463 |
-
- **Term Frequency (TF)**: The frequency of a word in a document.
|
| 464 |
-
- **Inverse Document Frequency (IDF)**: The importance of the word across all documents in the corpus. Words that appear in many documents are less important.
|
| 465 |
-
|
| 466 |
-
The formula for TF-IDF is:
|
| 467 |
-
- **TF-IDF = TF * IDF**
|
| 468 |
-
|
| 469 |
-
**Example:**
|
| 470 |
-
Consider three documents:
|
| 471 |
-
1. `"I love programming"`
|
| 472 |
-
2. `"Programming is fun"`
|
| 473 |
-
3. `"I love Python programming"`
|
| 474 |
-
|
| 475 |
-
- **TF (for "programming")**:
|
| 476 |
-
- Document 1: `1/3`
|
| 477 |
-
- Document 2: `1/3`
|
| 478 |
-
- Document 3: `1/4` (1 occurrence out of 4 words)
|
| 479 |
-
|
| 480 |
-
- **IDF (for "programming")**:
|
| 481 |
-
- IDF = log(3/3) = 0 (common word, less informative)
|
| 482 |
-
|
| 483 |
-
**Code Example:**
|
| 484 |
-
```python
|
| 485 |
-
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 486 |
-
|
| 487 |
-
documents = ["I love programming", "Programming is fun", "I love Python programming"]
|
| 488 |
-
vectorizer = TfidfVectorizer()
|
| 489 |
-
tfidf_matrix = vectorizer.fit_transform(documents)
|
| 490 |
-
|
| 491 |
-
print(tfidf_matrix.toarray())  # TF-IDF scores per word per document; sklearn's smoothed IDF keeps common words at small nonzero weights
|
| 492 |
-
```
|
| 493 |
-
""")
|
| 494 |
-
|
| 495 |
-
elif selected_subpoint == " Word Embeddings":
|
| 496 |
-
st.write("### ๐ค Word Embeddings")
|
| 497 |
-
st.write("""
|
| 498 |
-
Word embeddings are dense vector representations of words in a continuous vector space, capturing semantic meanings and relationships between words.
|
| 499 |
-
|
| 500 |
-
**Types of Word Embeddings:**
|
| 501 |
-
|
| 502 |
-
1. **Word2Vec**: Transforms words into dense vectors in a continuous space, capturing semantic relationships. It learns these representations by predicting words from their context, using two approaches:
|
| 503 |
-
|
| 504 |
-
- **Skip-gram model**: Predicts context words from a target word. It is good at capturing word relationships and works well with smaller datasets.
|
| 505 |
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
3. **FastText**: Extends Word2Vec by breaking words into subword units, which helps capture morphology and represent rare or unseen words.
|
| 511 |
| 306 |
|
| 307 |
elif selected_page == "⚙️ NLP Techniques":
|
| 308 |
st.header("⚙️ NLP Techniques")
|
| 309 |
+
if selected_subpoint:
|
| 310 |
+
if selected_subpoint == "Tokenization":
|
| 311 |
+
st.write("""
|
| 312 |
+
Breaking down text into smaller units such as words or sentences to make it manageable for analysis.
|
| 313 |
+
**Example:**
|
| 314 |
+
- Input: `"Artificial Intelligence is fascinating."`
|
| 315 |
+
- Word Tokens: `["Artificial", "Intelligence", "is", "fascinating", "."]`
|
| 316 |
+
- Sentence Tokens: `["Artificial Intelligence is fascinating."]`
|
| 317 |
+
""")
|
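For readers who want to try this step, a minimal sketch with NLTK (an assumed tool choice; it expects the `punkt` tokenizer models to be downloadable):

```python
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

nltk.download("punkt")  # one-time download of the tokenizer models

text = "Artificial Intelligence is fascinating."
print(word_tokenize(text))  # ['Artificial', 'Intelligence', 'is', 'fascinating', '.']
print(sent_tokenize(text))  # ['Artificial Intelligence is fascinating.']
```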
| 318 |
+
elif selected_subpoint == "Stemming":
|
| 319 |
+
st.write("### ๐ฑ Stemming")
|
| 320 |
+
st.write("""
|
| 321 |
+
Stemming reduces words to their root form by removing prefixes or suffixes, often resulting in a non-grammatical base.
|
| 322 |
+
|
| 323 |
+
**Example:**
|
| 324 |
+
- Input: `["running", "runner", "runs"]`
|
| 325 |
+
- Output: `["run", "runner", "run"]` (Porter Stemmer)
|
| 326 |
+
|
| 327 |
+
**Key Points:**
|
| 328 |
+
- **Fast** and **simple**, but can lead to over-stemming or under-stemming.
|
| 329 |
+
- Example of over-stemming: `"generous"` → `"gener"`
|
| 330 |
+
|
| 331 |
+
**Code Example:**
|
| 332 |
+
```python
|
| 333 |
+
from nltk.stem import PorterStemmer
|
| 334 |
+
|
| 335 |
+
stemmer = PorterStemmer()
|
| 336 |
+
words = ["running", "runner", "runs"]
|
| 337 |
+
print([stemmer.stem(word) for word in words])
|
| 338 |
+
# Output: ['run', 'runner', 'run']
|
| 339 |
+
```
|
| 340 |
""")
|
| 341 |
+
|
| 342 |
+
elif selected_subpoint == " Lemmatization":
|
| 343 |
+
st.write("### ๐ฟ Lemmatization")
|
| 344 |
+
st.write("""
|
| 345 |
+
Lemmatization reduces words to their dictionary base form (lemma), ensuring grammatical correctness.
|
| 346 |
|
| 347 |
+
**Example:**
|
| 348 |
+
- Input: `["running", "ran", "better"]`
|
| 349 |
+
- Output: `["run", "run", "good"]`
|
| 350 |
|
| 351 |
+
**Key Points:**
|
| 352 |
+
- Context-aware and accurate.
|
| 353 |
+
- More computationally intensive than stemming.
|
| 354 |
+
|
| 355 |
+
**Code Example:**
|
| 356 |
+
```python
|
| 357 |
+
from nltk.stem import WordNetLemmatizer
|
| 358 |
+
|
| 359 |
+
lemmatizer = WordNetLemmatizer()
|
| 360 |
+
words = ["running", "ran", "better"]
|
| 361 |
+
print([lemmatizer.lemmatize(word, pos="v") for word in words])  # ['run', 'run', 'better']; "better" needs pos="a" to yield "good"
|
| 362 |
+
```
|
| 363 |
+
""")
|
| 364 |
+
elif selected_subpoint == " stop Words":
|
| 365 |
+
st.write("### ๐ซ Stop Words")
|
| 366 |
+
st.write("""
|
| 367 |
+
Stop words are common words (e.g., *the*, *is*) that are removed during text processing as they don't add much meaning.
|
| 368 |
+
|
| 369 |
+
**Example:**
|
| 370 |
+
- Input: `"This is a simple sentence."`
|
| 371 |
+
- Output: `"simple sentence"`
|
| 372 |
+
|
| 373 |
+
**Code Example:**
|
| 374 |
+
```python
|
| 375 |
+
from nltk.corpus import stopwords
|
| 376 |
+
|
| 377 |
+
stop_words = set(stopwords.words("english"))
|
| 378 |
+
sentence = "This is a simple sentence."
|
| 379 |
+
filtered_sentence = [word for word in sentence.split() if word.lower() not in stop_words]
|
| 380 |
+
print(filtered_sentence)  # Output: ['simple', 'sentence.'] (split() keeps the trailing period)
|
| 381 |
+
```
|
| 382 |
+
""")
|
| 383 |
+
|
| 384 |
+
elif selected_subpoint == " One Hot Encoding":
|
| 385 |
+
st.write("### ๐ฅ One-Hot Encoding")
|
| 386 |
+
st.write("""
|
| 387 |
+
Representing categorical data as binary vectors to make it suitable for machine learning models.
|
| 388 |
+
|
| 389 |
+
**How it works:**
|
| 390 |
+
- Each unique category is assigned a unique binary vector.
|
| 391 |
+
- A binary vector has all values as `0` except for the position representing the category, which is `1`.
|
| 392 |
+
|
| 393 |
+
**Example:**
|
| 394 |
+
- Categories: `["Apple", "Banana", "Cherry"]`
|
| 395 |
+
- Encoding:
|
| 396 |
+
- `Apple`: `[1, 0, 0]`
|
| 397 |
+
- `Banana`: `[0, 1, 0]`
|
| 398 |
+
- `Cherry`: `[0, 0, 1]`
|
| 399 |
+
""")
|
| 400 |
+
|
| 401 |
+
st.write("### ๐ Example with Fruits")
|
| 402 |
+
st.write("""
|
| 403 |
+
**Input Categories:** `["Apple", "Banana", "Cherry", "Banana", "Apple"]`
|
| 404 |
+
**Output (One-Hot Encoding):**
|
| 405 |
+
- `Apple`: `[1, 0, 0]`
|
| 406 |
+
- `Banana`: `[0, 1, 0]`
|
| 407 |
+
- `Cherry`: `[0, 0, 1]`
|
| 408 |
+
- `Banana`: `[0, 1, 0]`
|
| 409 |
+
- `Apple`: `[1, 0, 0]`
|
| 410 |
+
""")
|
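A short sketch of the fruits example with scikit-learn's `OneHotEncoder` (assumes scikit-learn ≥ 1.2, where the dense-output flag is named `sparse_output`; older releases use `sparse`):

```python
from sklearn.preprocessing import OneHotEncoder

# scikit-learn expects a 2-D array: one row per sample, one column per feature
fruits = [["Apple"], ["Banana"], ["Cherry"], ["Banana"], ["Apple"]]

encoder = OneHotEncoder(sparse_output=False)  # dense array for readability
one_hot = encoder.fit_transform(fruits)

print(encoder.categories_)  # [array(['Apple', 'Banana', 'Cherry'], dtype=object)]
print(one_hot)
# [[1. 0. 0.]
#  [0. 1. 0.]
#  [0. 0. 1.]
#  [0. 1. 0.]
#  [1. 0. 0.]]
```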
| 411 |
+
|
| 412 |
+
elif selected_subpoint == " Bag Of Words":
|
| 413 |
+
st.write("### ๐ Bag of Words (BoW)")
|
| 414 |
+
st.write("""
|
| 415 |
+
Bag of Words converts text into a matrix of word frequencies, where each word is represented by a unique index in the vocabulary.
|
| 416 |
+
|
| 417 |
+
**Example:**
|
| 418 |
+
- Input: `["I love NLP", "NLP is fun"]`
|
| 419 |
+
- Vocabulary: `["I", "love", "NLP", "is", "fun"]`
|
| 420 |
+
- BoW Matrix:
|
| 421 |
+
- `"I love NLP"`: `[1, 1, 1, 0, 0]`
|
| 422 |
+
- `"NLP is fun"`: `[0, 0, 1, 1, 1]`
|
| 423 |
+
|
| 424 |
+
**Code Example:**
|
| 425 |
+
```python
|
| 426 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
| 427 |
+
|
| 428 |
+
documents = ["I love NLP", "NLP is fun"]
|
| 429 |
+
vectorizer = CountVectorizer()
|
| 430 |
+
bow_matrix = vectorizer.fit_transform(documents)
|
| 431 |
+
|
| 432 |
+
print(bow_matrix.toarray())  # Output: [[0 0 1 1], [1 1 0 1]] for vocabulary ['fun', 'is', 'love', 'nlp']; the default tokenizer drops one-letter words like "I"
|
| 433 |
+
```
|
| 434 |
+
""")
|
| 435 |
+
|
| 436 |
+
elif selected_subpoint == " Binary Bag Of Words":
|
| 437 |
+
st.write("### ๐ฒ Binary Bag of Words")
|
| 438 |
+
st.write("""
|
| 439 |
+
Binary Bag of Words is a variation of the BoW model where each word is represented by `1` if present in the document and `0` if absent, ignoring word frequencies.
|
| 440 |
+
|
| 441 |
+
**Example:**
|
| 442 |
+
- Input: `["I love NLP", "NLP is fun"]`
|
| 443 |
+
- Vocabulary: `["I", "love", "NLP", "is", "fun"]`
|
| 444 |
+
- Binary BoW Matrix:
|
| 445 |
+
- `"I love NLP"`: `[1, 1, 1, 0, 0]`
|
| 446 |
+
- `"NLP is fun"`: `[0, 0, 1, 1, 1]`
|
| 447 |
+
|
| 448 |
+
**Code Example:**
|
| 449 |
+
```python
|
| 450 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
| 451 |
+
|
| 452 |
+
documents = ["I love NLP", "NLP is fun"]
|
| 453 |
+
vectorizer = CountVectorizer(binary=True)
|
| 454 |
+
binary_bow_matrix = vectorizer.fit_transform(documents)
|
| 455 |
+
|
| 456 |
+
print(binary_bow_matrix.toarray())  # Output: [[0 0 1 1], [1 1 0 1]] for vocabulary ['fun', 'is', 'love', 'nlp']; "I" is dropped by the default tokenizer
|
| 457 |
+
```
|
| 458 |
+
""")
|
| 459 |
+
|
| 460 |
+
elif selected_subpoint == " TF-IDF":
|
| 461 |
+
st.write("### ๐งฎ TF-IDF (Term Frequency - Inverse Document Frequency)")
|
| 462 |
+
st.write("""
|
| 463 |
+
TF-IDF is a statistical measure used to evaluate how important a word is to a document in a collection or corpus. It considers two factors:
|
| 464 |
+
- **Term Frequency (TF)**: The frequency of a word in a document.
|
| 465 |
+
- **Inverse Document Frequency (IDF)**: The importance of the word across all documents in the corpus. Words that appear in many documents are less important.
|
| 466 |
+
|
| 467 |
+
The formula for TF-IDF is:
|
| 468 |
+
- **TF-IDF = TF * IDF**
|
| 469 |
+
|
| 470 |
+
**Example:**
|
| 471 |
+
Consider three documents:
|
| 472 |
+
1. `"I love programming"`
|
| 473 |
+
2. `"Programming is fun"`
|
| 474 |
+
3. `"I love Python programming"`
|
| 475 |
+
|
| 476 |
+
- **TF (for "programming")**:
|
| 477 |
+
- Document 1: `1/3`
|
| 478 |
+
- Document 2: `1/3`
|
| 479 |
+
- Document 3: `1/4` (1 occurrence out of 4 words)
|
| 480 |
+
|
| 481 |
+
- **IDF (for "programming")**:
|
| 482 |
+
- IDF = log(3/3) = 0 (common word, less informative)
|
| 483 |
+
|
| 484 |
+
**Code Example:**
|
| 485 |
+
```python
|
| 486 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 487 |
+
|
| 488 |
+
documents = ["I love programming", "Programming is fun", "I love Python programming"]
|
| 489 |
+
vectorizer = TfidfVectorizer()
|
| 490 |
+
tfidf_matrix = vectorizer.fit_transform(documents)
|
| 491 |
+
|
| 492 |
+
print(tfidf_matrix.toarray())  # TF-IDF scores per word per document; sklearn's smoothed IDF keeps common words at small nonzero weights
|
| 493 |
+
```
|
| 494 |
+
""")
|
| 495 |
+
|
| 496 |
+
elif selected_subpoint == " Word Embeddings":
|
| 497 |
+
st.write("### ๐ค Word Embeddings")
|
| 498 |
+
st.write("""
|
| 499 |
+
Word embeddings are dense vector representations of words in a continuous vector space, capturing semantic meanings and relationships between words.
|
| 500 |
+
|
| 501 |
+
**Types of Word Embeddings:**
|
| 502 |
+
|
| 503 |
+
1. **Word2Vec**: Transforms words into dense vectors in a continuous space, capturing semantic relationships. It learns these representations by predicting words from their context, using two approaches:
|
| 504 |
+
|
| 505 |
+
- **Skip-gram model**: Predicts context words from a target word. It is good at capturing word relationships and works well with smaller datasets.
|
| 506 |
+
|
| 507 |
+
- **CBOW (Continuous Bag of Words) model**: Predicts a target word from its surrounding context window. It trains faster and works well on larger datasets where words occur frequently.
|
| 508 |
|
| 509 |
+
2. **GloVe (Global Vectors for Word Representation)**: Uses a co-occurrence matrix to capture the relationships between words. It factors the matrix to produce low-dimensional vectors.
|
| 510 |
+
|
| 511 |
+
3. **FastText**: Extends Word2Vec by breaking words into subword units, which helps capture morphology and represent rare or unseen words.
|
| 512 |
+
|
| 513 |
+
|
| 514 |
+
**Example:**
|
| 515 |
+
- Words like `"king"` and `"queen"` will have similar vector representations in embedding space, reflecting their semantic relationship.
|
| 516 |
+
|
| 517 |
+
**Code Example (using Word2Vec):**
|
| 518 |
+
```python
|
| 519 |
+
from gensim.models import Word2Vec
|
| 520 |
+
|
| 521 |
+
# Sample sentences
|
| 522 |
+
sentences = [["I", "love", "programming"], ["Word", "embeddings", "are", "cool"]]
|
| 523 |
+
|
| 524 |
+
# Train Word2Vec model
|
| 525 |
+
model = Word2Vec(sentences, min_count=1)
|
| 526 |
+
|
| 527 |
+
# Get the vector for the word 'programming'
|
| 528 |
+
vector = model.wv['programming']
|
| 529 |
+
print(vector)
|
| 530 |
+
```
|
| 531 |
+
""")
|
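The code above covers Word2Vec; as a companion, a minimal FastText sketch on the same toy corpus (assumes gensim 4.x; the word `programmer` is a hypothetical out-of-vocabulary query, not part of the original example), illustrating the subword behavior described in point 3:

```python
from gensim.models import FastText

sentences = [["I", "love", "programming"], ["Word", "embeddings", "are", "cool"]]

# Train a small FastText model; character n-grams are learned alongside words
model = FastText(sentences, vector_size=50, min_count=1)

# Unlike plain Word2Vec, FastText can assemble a vector for an unseen word
# from the n-grams it shares with training words (here, with "programming"):
print(model.wv["programmer"][:5])  # "programmer" never appeared in training
```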
| 532 |
+
|
| 533 |
+
elif selected_subpoint == "Part-of-Speech (POS) Tagging":
|
| 534 |
+
st.write("### ๐๏ธ Part-of-Speech (POS) Tagging")
|
| 535 |
+
st.write("""
|
| 536 |
+
Assigning grammatical labels to each word in a sentence, indicating its role in context.
|
| 537 |
+
**Example:**
|
| 538 |
+
- Input: `"Birds fly high"`
|
| 539 |
+
- Output: `["Birds (NOUN)", "fly (VERB)", "high (ADJ)"]`
|
| 540 |
+
""")
|
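A minimal sketch with NLTK's perceptron tagger (one possible tool; it prints Penn Treebank codes rather than the coarse NOUN/VERB labels above, and exact tags can vary):

```python
import nltk

nltk.download("punkt")                        # tokenizer models
nltk.download("averaged_perceptron_tagger")   # POS tagger model

tokens = nltk.word_tokenize("Birds fly high")
print(nltk.pos_tag(tokens))
# e.g. [('Birds', 'NNS'), ('fly', 'VBP'), ('high', 'RB')]
```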
| 541 |
+
|
| 542 |
+
elif selected_subpoint == "Named Entity Recognition (NER)":
|
| 543 |
+
st.write("### ๐ Named Entity Recognition (NER)")
|
| 544 |
+
st.write("""
|
| 545 |
+
Detecting and categorizing entities like names, dates, and locations from text.
|
| 546 |
+
**Example:**
|
| 547 |
+
- Input: `"Tesla, founded by Elon Musk, is based in California."`
|
| 548 |
+
- Output: `["Tesla (ORGANIZATION)", "Elon Musk (PERSON)", "California (LOCATION)"]`
|
| 549 |
+
""")
|
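A minimal sketch using spaCy (an assumed library choice; the `en_core_web_sm` model must be installed separately with `python -m spacy download en_core_web_sm`):

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # small English pipeline with an NER component
doc = nlp("Tesla, founded by Elon Musk, is based in California.")

for ent in doc.ents:
    print(ent.text, ent.label_)
# e.g. Tesla ORG / Elon Musk PERSON / California GPE
```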
| 550 |
+
|
| 551 |
+
elif selected_subpoint == "Sentiment Analysis":
|
| 552 |
+
st.write("### ๐ญ Sentiment Analysis")
|
| 553 |
+
st.write("""
|
| 554 |
+
Classifying the emotional tone of a text into categories such as positive, negative, or neutral.
|
| 555 |
+
**Example:**
|
| 556 |
+
- Input: `"The service was exceptional!"`
|
| 557 |
+
- Output: `Positive`
|
| 558 |
+
""")
|
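A minimal sketch with NLTK's VADER analyzer (an assumed choice; any sentiment classifier would do):

```python
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download("vader_lexicon")  # lexicon used by the VADER analyzer

sia = SentimentIntensityAnalyzer()
scores = sia.polarity_scores("The service was exceptional!")
print(scores)
# The compound score runs from -1 (most negative) to +1 (most positive)
print("Positive" if scores["compound"] > 0 else "Negative or Neutral")
```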
| 559 |
|
| 560 |
|
| 561 |
|