sree4411 committed on
Commit
ac40de5
·
verified ·
1 Parent(s): 54941d4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -45
app.py CHANGED
@@ -62,6 +62,14 @@ elif selected_step == "Maintenance and Updates":
62
 
63
  # Section: Text Representation Methods
64
  st.header("Text Representation Methods")
 
 
 
 
 
 
 
 
65
 
66
  # Sample Texts
67
  texts = [
@@ -70,55 +78,59 @@ texts = [
70
  "The field of NLP is growing rapidly."
71
  ]
72
 
73
- # Bag of Words
74
- st.subheader("Bag of Words (BoW)")
75
- st.write("Represents text as a collection of word counts, ignoring grammar and word order.")
76
- st.write("Example: The sentence 'I love NLP' becomes {'I': 1, 'love': 1, 'NLP': 1}.")
77
- vectorizer = CountVectorizer()
78
- X_bow = vectorizer.fit_transform(texts)
79
- st.write("Feature Names:", vectorizer.get_feature_names_out())
80
- st.write("Bag of Words Representation:")
81
- st.write(X_bow.toarray())
82
 
83
- # TF-IDF
84
- st.subheader("Term Frequency-Inverse Document Frequency (TF-IDF)")
85
- st.write("Enhances BoW by weighing words based on their frequency in a document and across all documents.")
86
- st.write("Example: 'Machine learning is fun' might give different weights to 'machine,' 'learning,' and 'fun' based on their importance.")
87
- tfidf_vectorizer = TfidfVectorizer()
88
- X_tfidf = tfidf_vectorizer.fit_transform(texts)
89
- st.write("Feature Names:", tfidf_vectorizer.get_feature_names_out())
90
- st.write("TF-IDF Representation:")
91
- st.write(X_tfidf.toarray())
92
 
93
- # Word Embeddings
94
- st.subheader("Word Embeddings")
95
- st.write("Converts words into dense vectors capturing semantic relationships.")
96
- st.write("Common models include Word2Vec, GloVe, and FastText.")
97
- st.write("Example: 'king' - 'man' + 'woman' ≈ 'queen'.")
98
- model = Word2Vec(sentences=[text.split() for text in texts], vector_size=100, window=5, min_count=1, workers=4)
99
- word_vectors = model.wv
100
- st.write("Word2Vec Representation of 'Natural':")
101
- st.write(word_vectors['natural'])
 
 
 
 
102
 
103
- # n-grams
104
- st.subheader("n-grams")
105
- st.write("Captures sequences of n words to consider word order.")
106
- st.write("Example: For n=2 (bigrams), 'I love NLP' becomes ['I love', 'love NLP'].")
107
- bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
108
- X_bigrams = bigram_vectorizer.fit_transform(texts)
109
- st.write("Feature Names:", bigram_vectorizer.get_feature_names_out())
110
- st.write("n-grams Representation:")
111
- st.write(X_bigrams.toarray())
112
 
113
- # One-Hot Encoding
114
- st.subheader("One-Hot Encoding")
115
- st.write("Represents words as binary vectors where each word is a unique vector with a single high (1) value.")
116
- st.write("Example: 'I love NLP' with a vocabulary of 4 words could be ['I': [1, 0, 0, 0], 'love': [0, 1, 0, 0], 'NLP': [0, 0, 1, 0]].")
117
- one_hot_vectorizer = CountVectorizer(binary=True)
118
- X_one_hot = one_hot_vectorizer.fit_transform(texts)
119
- st.write("Feature Names:", one_hot_vectorizer.get_feature_names_out())
120
- st.write("One-Hot Encoding Representation:")
121
- st.write(X_one_hot.toarray())
122
 
123
  # Running the Streamlit app
124
  if __name__ == "__main__":
 
62
 
63
  # Section: Text Representation Methods
64
  st.header("Text Representation Methods")
65
+ methods = [
66
+ "Bag of Words",
67
+ "TF-IDF",
68
+ "Word Embeddings (Word2Vec)",
69
+ "n-grams",
70
+ "One-Hot Encoding"
71
+ ]
72
+ selected_method = st.radio("Select a text representation method:", methods)
73
 
74
  # Sample Texts
75
  texts = [
 
78
  "The field of NLP is growing rapidly."
79
  ]
80
 
81
+ if selected_method == "Bag of Words":
82
+ st.subheader("Bag of Words (BoW)")
83
+ st.write("Represents text as a collection of word counts, ignoring grammar and word order.")
84
+ st.write("Example: The sentence 'I love NLP' becomes {'I': 1, 'love': 1, 'NLP': 1}.")
85
+ vectorizer = CountVectorizer()
86
+ X_bow = vectorizer.fit_transform(texts)
87
+ st.write("Feature Names:", vectorizer.get_feature_names_out())
88
+ st.write("Bag of Words Representation:")
89
+ st.write(X_bow.toarray())
90
 
91
+ elif selected_method == "TF-IDF":
92
+ st.subheader("Term Frequency-Inverse Document Frequency (TF-IDF)")
93
+ st.write("Enhances BoW by weighing words based on their frequency in a document and across all documents.")
94
+ st.write("Example: 'Machine learning is fun' might give different weights to 'machine,' 'learning,' and 'fun' based on their importance.")
95
+ tfidf_vectorizer = TfidfVectorizer()
96
+ X_tfidf = tfidf_vectorizer.fit_transform(texts)
97
+ st.write("Feature Names:", tfidf_vectorizer.get_feature_names_out())
98
+ st.write("TF-IDF Representation:")
99
+ st.write(X_tfidf.toarray())
100
 
101
+ elif selected_method == "Word Embeddings (Word2Vec)":
102
+ st.subheader("Word Embeddings")
103
+ st.write("Converts words into dense vectors capturing semantic relationships.")
104
+ st.write("Common models include Word2Vec, GloVe, and FastText.")
105
+ st.write("Example: 'king' - 'man' + 'woman' ≈ 'queen'.")
106
+ model = Word2Vec(sentences=[text.split() for text in texts], vector_size=100, window=5, min_count=1, workers=4)
107
+ word_vectors = model.wv
108
+ word = 'natural'
109
+ if word in word_vectors:
110
+ st.write(f"Word2Vec Representation of '{word}':")
111
+ st.write(word_vectors[word])
112
+ else:
113
+ st.write(f"Word '{word}' not found in the vocabulary.")
114
 
115
+ elif selected_method == "n-grams":
116
+ st.subheader("n-grams")
117
+ st.write("Captures sequences of n words to consider word order.")
118
+ st.write("Example: For n=2 (bigrams), 'I love NLP' becomes ['I love', 'love NLP'].")
119
+ bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
120
+ X_bigrams = bigram_vectorizer.fit_transform(texts)
121
+ st.write("Feature Names:", bigram_vectorizer.get_feature_names_out())
122
+ st.write("n-grams Representation:")
123
+ st.write(X_bigrams.toarray())
124
 
125
+ elif selected_method == "One-Hot Encoding":
126
+ st.subheader("One-Hot Encoding")
127
+ st.write("Represents words as binary vectors where each word is a unique vector with a single high (1) value.")
128
+ st.write("Example: 'I love NLP' with a vocabulary of 4 words could be ['I': [1, 0, 0, 0], 'love': [0, 1, 0, 0], 'NLP': [0, 0, 1, 0]].")
129
+ one_hot_vectorizer = CountVectorizer(binary=True)
130
+ X_one_hot = one_hot_vectorizer.fit_transform(texts)
131
+ st.write("Feature Names:", one_hot_vectorizer.get_feature_names_out())
132
+ st.write("One-Hot Encoding Representation:")
133
+ st.write(X_one_hot.toarray())
134
 
135
  # Running the Streamlit app
136
  if __name__ == "__main__":