sree4411 committed on
Commit
381b8fe
·
verified ·
1 Parent(s): c8f69b7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -96
app.py CHANGED
@@ -1,6 +1,4 @@
1
  import streamlit as st
2
- from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
3
- import numpy as np
4
  from gensim.models import Word2Vec
5
 
6
  # Title
@@ -9,100 +7,119 @@ st.title("Introduction to NLP")
9
  # Section: What is NLP?
10
  st.header("What is NLP?")
11
  st.write("""
12
- Natural Language Processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans through natural language. The ultimate goal of NLP is to enable computers to understand, interpret, and generate human language in a way that is valuable.
13
 
14
- Key components of NLP include:
15
- - **Tokenization**: Breaking down text into individual words or phrases.
16
- - **Sentiment Analysis**: Determining the sentiment expressed in a piece of text.
17
- - **Named Entity Recognition (NER)**: Identifying entities like names, dates, and locations in text.
18
- - **Machine Translation**: Automatically translating text from one language to another.
 
19
  """)
20
 
21
- # Section: Life Cycle of NLP
22
- st.header("Life Cycle of NLP")
23
- steps = [
24
- "Text Collection",
25
- "Text Preprocessing",
26
- "Text Representation",
27
- "Modeling and Analysis",
28
- "Evaluation",
29
- "Deployment",
30
- "Maintenance and Updates"
31
- ]
32
- selected_step = st.radio("Select a step to learn more:", steps)
33
-
34
- if selected_step == "Text Collection":
35
- st.write("""
36
- **Text Collection**: Gathering text data from various sources like websites, social media, etc.
37
- """)
38
- elif selected_step == "Text Preprocessing":
39
- st.write("""
40
- **Text Preprocessing**: Cleaning and preparing text data, including tokenization, stop word removal, and stemming/lemmatization.
41
- """)
42
- elif selected_step == "Text Representation":
43
- st.write("""
44
- **Text Representation**: Converting text into numerical representations using methods like Bag of Words, TF-IDF, or word embeddings.
45
- """)
46
- elif selected_step == "Modeling and Analysis":
47
- st.write("""
48
- **Modeling and Analysis**: Training NLP models for tasks such as sentiment analysis, named entity recognition, and text classification.
49
- """)
50
- elif selected_step == "Evaluation":
51
- st.write("""
52
- **Evaluation**: Assessing model performance using metrics like accuracy, precision, recall, and F1-score.
53
- """)
54
- elif selected_step == "Deployment":
55
- st.write("""
56
- **Deployment**: Integrating models into applications and monitoring their performance in real-world scenarios.
57
- """)
58
- elif selected_step == "Maintenance and Updates":
59
- st.write("""
60
- **Maintenance and Updates**: Continuously improving models with new data and feedback from users.
61
- """)
62
 
63
  # Section: Text Representation Methods
64
  st.header("Text Representation Methods")
65
  methods = [
66
  "Bag of Words",
67
  "TF-IDF",
68
- "Word Embeddings (Word2Vec)",
69
- "n-grams",
70
- "One-Hot Encoding"
71
  ]
72
  selected_method = st.radio("Select a text representation method:", methods)
73
 
74
- # Sample Texts
75
- texts = [
76
- "Natural Language Processing is fascinating.",
77
- "Natural Language Processing involves understanding human language.",
78
- "The field of NLP is growing rapidly."
79
- ]
80
-
81
  if selected_method == "Bag of Words":
82
  st.subheader("Bag of Words (BoW)")
83
- st.write("Represents text as a collection of word counts, ignoring grammar and word order.")
84
- st.write("Example: The sentence 'I love NLP' becomes {'I': 1, 'love': 1, 'NLP': 1}.")
85
- vectorizer = CountVectorizer()
86
- X_bow = vectorizer.fit_transform(texts)
87
- st.write("Feature Names:", vectorizer.get_feature_names_out())
88
- st.write("Bag of Words Representation:")
89
- st.write(X_bow.toarray())
 
 
 
 
 
 
 
 
 
90
 
91
  elif selected_method == "TF-IDF":
92
  st.subheader("Term Frequency-Inverse Document Frequency (TF-IDF)")
93
- st.write("Enhances BoW by weighing words based on their frequency in a document and across all documents.")
94
- st.write("Example: 'Machine learning is fun' might give different weights to 'machine,' 'learning,' and 'fun' based on their importance.")
95
- tfidf_vectorizer = TfidfVectorizer()
96
- X_tfidf = tfidf_vectorizer.fit_transform(texts)
97
- st.write("Feature Names:", tfidf_vectorizer.get_feature_names_out())
98
- st.write("TF-IDF Representation:")
99
- st.write(X_tfidf.toarray())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
  elif selected_method == "Word Embeddings (Word2Vec)":
102
- st.subheader("Word Embeddings")
103
- st.write("Converts words into dense vectors capturing semantic relationships.")
104
- st.write("Common models include Word2Vec, GloVe, and FastText.")
105
- st.write("Example: 'king' - 'man' + 'woman' ≈ 'queen'.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  model = Word2Vec(sentences=[text.split() for text in texts], vector_size=100, window=5, min_count=1, workers=4)
107
  word_vectors = model.wv
108
  word = 'natural'
@@ -112,22 +129,6 @@ elif selected_method == "Word Embeddings (Word2Vec)":
112
  else:
113
  st.write(f"Word '{word}' not found in the vocabulary.")
114
 
115
- elif selected_method == "n-grams":
116
- st.subheader("n-grams")
117
- st.write("Captures sequences of n words to consider word order.")
118
- st.write("Example: For n=2 (bigrams), 'I love NLP' becomes ['I love', 'love NLP'].")
119
- bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
120
- X_bigrams = bigram_vectorizer.fit_transform(texts)
121
- st.write("Feature Names:", bigram_vectorizer.get_feature_names_out())
122
- st.write("n-grams Representation:")
123
- st.write(X_bigrams.toarray())
124
-
125
- elif selected_method == "One-Hot Encoding":
126
- st.subheader("One-Hot Encoding")
127
- st.write("Represents words as binary vectors where each word is a unique vector with a single high (1) value.")
128
- st.write("Example: 'I love NLP' with a vocabulary of 4 words could be ['I': [1, 0, 0, 0], 'love': [0, 1, 0, 0], 'NLP': [0, 0, 1, 0]].")
129
- one_hot_vectorizer = CountVectorizer(binary=True)
130
- X_one_hot = one_hot_vectorizer.fit_transform(texts)
131
- st.write("Feature Names:", one_hot_vectorizer.get_feature_names_out())
132
- st.write("One-Hot Encoding Representation:")
133
- st.write(X_one_hot.toarray())
 
1
  import streamlit as st
 
 
2
  from gensim.models import Word2Vec
3
 
4
  # Title
 
7
  # Section: What is NLP?
8
  st.header("What is NLP?")
9
  st.write("""
10
+ Natural Language Processing (NLP) is a subfield of artificial intelligence that enables computers to process, understand, and generate human language.
11
 
12
+ ### Applications of NLP:
13
+ - **Chatbots & Virtual Assistants** (e.g., Siri, Alexa)
14
+ - **Sentiment Analysis** (e.g., Product reviews, Social Media monitoring)
15
+ - **Machine Translation** (e.g., Google Translate)
16
+ - **Text Summarization** (e.g., News article summaries)
17
+ - **Speech Recognition** (e.g., Voice commands)
18
  """)
19
 
20
+ # Section: NLP Terminologies
21
+ st.header("NLP Terminologies")
22
+ st.write("""
23
+ - **Corpus**: A collection of text documents used for NLP tasks.
24
+ - **Tokenization**: Splitting text into individual words or phrases.
25
+ - **Stop Words**: Common words (e.g., "the", "is") that are often removed.
26
+ - **Stemming**: Reducing words to their base form (e.g., "running" → "run").
27
+ - **Lemmatization**: More advanced than stemming; it converts words to their dictionary form.
28
+ - **Named Entity Recognition (NER)**: Identifies entities like names, dates, and locations.
29
+ - **Sentiment Analysis**: Determines the sentiment (positive, negative, neutral) of a text.
30
+ - **n-grams**: Sequences of 'n' consecutive words (e.g., "New York" is a bi-gram).
31
+ """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
  # Section: Text Representation Methods
34
  st.header("Text Representation Methods")
35
  methods = [
36
  "Bag of Words",
37
  "TF-IDF",
38
+ "One-Hot Encoding",
39
+ "Word Embeddings (Word2Vec)"
 
40
  ]
41
  selected_method = st.radio("Select a text representation method:", methods)
42
 
 
 
 
 
 
 
 
43
  if selected_method == "Bag of Words":
44
  st.subheader("Bag of Words (BoW)")
45
+ st.write("""
46
+ **Definition**: Represents text as a collection of word counts, ignoring grammar and word order.
47
+
48
+ **Uses**:
49
+ - Sentiment analysis
50
+ - Document classification
51
+ - Information retrieval
52
+
53
+ **Advantages**:
54
+ ✅ Simple and easy to implement
55
+ ✅ Works well with traditional ML models
56
+
57
+ **Disadvantages**:
58
+ ❌ Ignores word order and context
59
+ ❌ High-dimensionality for large vocabularies
60
+ """)
61
 
62
  elif selected_method == "TF-IDF":
63
  st.subheader("Term Frequency-Inverse Document Frequency (TF-IDF)")
64
+ st.write("""
65
+ **Definition**: Weighs words based on their frequency in a document and across all documents.
66
+
67
+ **Uses**:
68
+ - Information retrieval (e.g., search engines)
69
+ - Text classification
70
+ - Keyword extraction
71
+
72
+ **Advantages**:
73
+ ✅ Reduces the impact of common words
74
+ ✅ Highlights important words
75
+
76
+ **Disadvantages**:
77
+ ❌ Still ignores word order
78
+ ❌ Does not capture deep semantics
79
+ """)
80
+
81
+ elif selected_method == "One-Hot Encoding":
82
+ st.subheader("One-Hot Encoding")
83
+ st.write("""
84
+ **Definition**: Represents words as binary vectors where each word has a unique position in a vocabulary.
85
+
86
+ **Uses**:
87
+ - Simple NLP tasks
88
+ - Word-level feature engineering
89
+
90
+ **Advantages**:
91
+ ✅ Simple to understand
92
+ ✅ Works well with small vocabulary sizes
93
+
94
+ **Disadvantages**:
95
+ ❌ Inefficient for large vocabularies
96
+ ❌ No information on word meaning
97
+ """)
98
 
99
  elif selected_method == "Word Embeddings (Word2Vec)":
100
+ st.subheader("Word Embeddings (Word2Vec)")
101
+ st.write("""
102
+ **Definition**: Converts words into dense numerical vectors capturing semantic relationships.
103
+
104
+ **Uses**:
105
+ - Machine translation
106
+ - Speech recognition
107
+ - Sentiment analysis
108
+
109
+ **Advantages**:
110
+ ✅ Captures semantic relationships
111
+ ✅ Works well for deep learning models
112
+
113
+ **Disadvantages**:
114
+ ❌ Requires large datasets to train
115
+ ❌ Computationally expensive
116
+ """)
117
+ # Sample texts
118
+ texts = [
119
+ "Natural Language Processing is fascinating.",
120
+ "Natural Language Processing involves understanding human language.",
121
+ "The field of NLP is growing rapidly."
122
+ ]
123
  model = Word2Vec(sentences=[text.split() for text in texts], vector_size=100, window=5, min_count=1, workers=4)
124
  word_vectors = model.wv
125
  word = 'natural'
 
129
  else:
130
  st.write(f"Word '{word}' not found in the vocabulary.")
131
 
132
+ # Footer
133
+ st.write("---")
134
+ st.write("Developed with ❤️ using Streamlit for NLP enthusiasts.")