Spaces:

sree4411
/

NLP

Sleeping

App Files Files Community

sree4411 committed on Feb 5, 2025

Commit

381b8fe

verified ·

1 Parent(s): c8f69b7

Update app.py

Browse files

Files changed (1) hide show

app.py +97 -96

app.py CHANGED Viewed

@@ -1,6 +1,4 @@
 import streamlit as st
-from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
-import numpy as np
 from gensim.models import Word2Vec
 # Title
@@ -9,100 +7,119 @@ st.title("Introduction to NLP")
 # Section: What is NLP?
 st.header("What is NLP?")
 st.write("""
-Natural Language Processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans through natural language. The ultimate goal of NLP is to enable computers to understand, interpret, and generate human language in a way that is valuable.
-Key components of NLP include:
-- **Tokenization**: Breaking down text into individual words or phrases.
-- **Sentiment Analysis**: Determining the sentiment expressed in a piece of text.
-- **Named Entity Recognition (NER)**: Identifying entities like names, dates, and locations in text.
-- **Machine Translation**: Automatically translating text from one language to another.
 """)
-# Section: Life Cycle of NLP
-st.header("Life Cycle of NLP")
-steps = [
-    "Text Collection",
-    "Text Preprocessing",
-    "Text Representation",
-    "Modeling and Analysis",
-    "Evaluation",
-    "Deployment",
-    "Maintenance and Updates"
-]
-selected_step = st.radio("Select a step to learn more:", steps)
-if selected_step == "Text Collection":
-    st.write("""
-    **Text Collection**: Gathering text data from various sources like websites, social media, etc.
-    """)
-elif selected_step == "Text Preprocessing":
-    st.write("""
-    **Text Preprocessing**: Cleaning and preparing text data, including tokenization, stop word removal, and stemming/lemmatization.
-    """)
-elif selected_step == "Text Representation":
-    st.write("""
-    **Text Representation**: Converting text into numerical representations using methods like Bag of Words, TF-IDF, or word embeddings.
-    """)
-elif selected_step == "Modeling and Analysis":
-    st.write("""
-    **Modeling and Analysis**: Training NLP models for tasks such as sentiment analysis, named entity recognition, and text classification.
-    """)
-elif selected_step == "Evaluation":
-    st.write("""
-    **Evaluation**: Assessing model performance using metrics like accuracy, precision, recall, and F1-score.
-    """)
-elif selected_step == "Deployment":
-    st.write("""
-    **Deployment**: Integrating models into applications and monitoring their performance in real-world scenarios.
-    """)
-elif selected_step == "Maintenance and Updates":
-    st.write("""
-    **Maintenance and Updates**: Continuously improving models with new data and feedback from users.
-    """)
 # Section: Text Representation Methods
 st.header("Text Representation Methods")
 methods = [
     "Bag of Words",
     "TF-IDF",
-    "Word Embeddings (Word2Vec)",
-    "n-grams",
-    "One-Hot Encoding"
 ]
 selected_method = st.radio("Select a text representation method:", methods)
-# Sample Texts
-texts = [
-    "Natural Language Processing is fascinating.",
-    "Natural Language Processing involves understanding human language.",
-    "The field of NLP is growing rapidly."
-]
 if selected_method == "Bag of Words":
     st.subheader("Bag of Words (BoW)")
-    st.write("Represents text as a collection of word counts, ignoring grammar and word order.")
-    st.write("Example: The sentence 'I love NLP' becomes {'I': 1, 'love': 1, 'NLP': 1}.")
-    vectorizer = CountVectorizer()
-    X_bow = vectorizer.fit_transform(texts)
-    st.write("Feature Names:", vectorizer.get_feature_names_out())
-    st.write("Bag of Words Representation:")
-    st.write(X_bow.toarray())
 elif selected_method == "TF-IDF":
     st.subheader("Term Frequency-Inverse Document Frequency (TF-IDF)")
-    st.write("Enhances BoW by weighing words based on their frequency in a document and across all documents.")
-    st.write("Example: 'Machine learning is fun' might give different weights to 'machine,' 'learning,' and 'fun' based on their importance.")
-    tfidf_vectorizer = TfidfVectorizer()
-    X_tfidf = tfidf_vectorizer.fit_transform(texts)
-    st.write("Feature Names:", tfidf_vectorizer.get_feature_names_out())
-    st.write("TF-IDF Representation:")
-    st.write(X_tfidf.toarray())
 elif selected_method == "Word Embeddings (Word2Vec)":
-    st.subheader("Word Embeddings")
-    st.write("Converts words into dense vectors capturing semantic relationships.")
-    st.write("Common models include Word2Vec, GloVe, and FastText.")
-    st.write("Example: 'king' - 'man' + 'woman' ≈ 'queen'.")
     model = Word2Vec(sentences=[text.split() for text in texts], vector_size=100, window=5, min_count=1, workers=4)
     word_vectors = model.wv
     word = 'natural'
@@ -112,22 +129,6 @@ elif selected_method == "Word Embeddings (Word2Vec)":
     else:
         st.write(f"Word '{word}' not found in the vocabulary.")
-elif selected_method == "n-grams":
-    st.subheader("n-grams")
-    st.write("Captures sequences of n words to consider word order.")
-    st.write("Example: For n=2 (bigrams), 'I love NLP' becomes ['I love', 'love NLP'].")
-    bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
-    X_bigrams = bigram_vectorizer.fit_transform(texts)
-    st.write("Feature Names:", bigram_vectorizer.get_feature_names_out())
-    st.write("n-grams Representation:")
-    st.write(X_bigrams.toarray())
-elif selected_method == "One-Hot Encoding":
-    st.subheader("One-Hot Encoding")
-    st.write("Represents words as binary vectors where each word is a unique vector with a single high (1) value.")
-    st.write("Example: 'I love NLP' with a vocabulary of 4 words could be ['I': [1, 0, 0, 0], 'love': [0, 1, 0, 0], 'NLP': [0, 0, 1, 0]].")
-    one_hot_vectorizer = CountVectorizer(binary=True)
-    X_one_hot = one_hot_vectorizer.fit_transform(texts)
-    st.write("Feature Names:", one_hot_vectorizer.get_feature_names_out())
-    st.write("One-Hot Encoding Representation:")
-    st.write(X_one_hot.toarray())

 import streamlit as st
 from gensim.models import Word2Vec
 # Title
 # Section: What is NLP?
 st.header("What is NLP?")
 st.write("""
+Natural Language Processing (NLP) is a subfield of artificial intelligence that enables computers to process, understand, and generate human language.
+### Applications of NLP:
+- **Chatbots & Virtual Assistants** (e.g., Siri, Alexa)
+- **Sentiment Analysis** (e.g., Product reviews, Social Media monitoring)
+- **Machine Translation** (e.g., Google Translate)
+- **Text Summarization** (e.g., News article summaries)
+- **Speech Recognition** (e.g., Voice commands)
 """)
+# Section: NLP Terminologies
+st.header("NLP Terminologies")
+st.write("""
+- **Corpus**: A collection of text documents used for NLP tasks.
+- **Tokenization**: Splitting text into individual words or phrases.
+- **Stop Words**: Common words (e.g., "the", "is") that are often removed.
+- **Stemming**: Reducing words to their base form (e.g., "running" → "run").
+- **Lemmatization**: More advanced than stemming; it converts words to their dictionary form.
+- **Named Entity Recognition (NER)**: Identifies entities like names, dates, and locations.
+- **Sentiment Analysis**: Determines the sentiment (positive, negative, neutral) of a text.
+- **n-grams**: Sequences of 'n' consecutive words (e.g., "New York" is a bi-gram).
+""")
 # Section: Text Representation Methods
 st.header("Text Representation Methods")
 methods = [
     "Bag of Words",
     "TF-IDF",
+    "One-Hot Encoding",
+    "Word Embeddings (Word2Vec)"
 ]
 selected_method = st.radio("Select a text representation method:", methods)
 if selected_method == "Bag of Words":
     st.subheader("Bag of Words (BoW)")
+    st.write("""
+    **Definition**: Represents text as a collection of word counts, ignoring grammar and word order.
+    **Uses**:
+    - Sentiment analysis
+    - Document classification
+    - Information retrieval
+    **Advantages**:
+    ✅ Simple and easy to implement
+    ✅ Works well with traditional ML models
+    **Disadvantages**:
+    ❌ Ignores word order and context
+    ❌ High-dimensionality for large vocabularies
+    """)
 elif selected_method == "TF-IDF":
     st.subheader("Term Frequency-Inverse Document Frequency (TF-IDF)")
+    st.write("""
+    **Definition**: Weighs words based on their frequency in a document and across all documents.
+    **Uses**:
+    - Information retrieval (e.g., search engines)
+    - Text classification
+    - Keyword extraction
+    **Advantages**:
+    ✅ Reduces the impact of common words
+    ✅ Highlights important words
+    **Disadvantages**:
+    ❌ Still ignores word order
+    ❌ Does not capture deep semantics
+    """)
+elif selected_method == "One-Hot Encoding":
+    st.subheader("One-Hot Encoding")
+    st.write("""
+    **Definition**: Represents words as binary vectors where each word has a unique position in a vocabulary.
+    **Uses**:
+    - Simple NLP tasks
+    - Word-level feature engineering
+    **Advantages**:
+    ✅ Simple to understand
+    ✅ Works well with small vocabulary sizes
+    **Disadvantages**:
+    ❌ Inefficient for large vocabularies
+    ❌ No information on word meaning
+    """)
 elif selected_method == "Word Embeddings (Word2Vec)":
+    st.subheader("Word Embeddings (Word2Vec)")
+    st.write("""
+    **Definition**: Converts words into dense numerical vectors capturing semantic relationships.
+    **Uses**:
+    - Machine translation
+    - Speech recognition
+    - Sentiment analysis
+    **Advantages**:
+    ✅ Captures semantic relationships
+    ✅ Works well for deep learning models
+    **Disadvantages**:
+    ❌ Requires large datasets to train
+    ❌ Computationally expensive
+    """)
+    # Sample texts
+    texts = [
+        "Natural Language Processing is fascinating.",
+        "Natural Language Processing involves understanding human language.",
+        "The field of NLP is growing rapidly."
+    ]
     model = Word2Vec(sentences=[text.split() for text in texts], vector_size=100, window=5, min_count=1, workers=4)
     word_vectors = model.wv
     word = 'natural'
     else:
         st.write(f"Word '{word}' not found in the vocabulary.")
+# Footer
+st.write("---")
+st.write("Developed with ❤️ using Streamlit for NLP enthusiasts.")