Spaces:

sree4411
/

NLP

Sleeping

App Files Files Community

sree4411 committed on Jan 27, 2025

Commit

ac40de5

verified ·

1 Parent(s): 54941d4

Update app.py

Browse files

Files changed (1) hide show

app.py +57 -45

app.py CHANGED Viewed

@@ -62,6 +62,14 @@ elif selected_step == "Maintenance and Updates":
 # Section: Text Representation Methods
 st.header("Text Representation Methods")
 # Sample Texts
 texts = [
@@ -70,55 +78,59 @@ texts = [
     "The field of NLP is growing rapidly."
 ]
-# Bag of Words
-st.subheader("Bag of Words (BoW)")
-st.write("Represents text as a collection of word counts, ignoring grammar and word order.")
-st.write("Example: The sentence 'I love NLP' becomes {'I': 1, 'love': 1, 'NLP': 1}.")
-vectorizer = CountVectorizer()
-X_bow = vectorizer.fit_transform(texts)
-st.write("Feature Names:", vectorizer.get_feature_names_out())
-st.write("Bag of Words Representation:")
-st.write(X_bow.toarray())
-# TF-IDF
-st.subheader("Term Frequency-Inverse Document Frequency (TF-IDF)")
-st.write("Enhances BoW by weighing words based on their frequency in a document and across all documents.")
-st.write("Example: 'Machine learning is fun' might give different weights to 'machine,' 'learning,' and 'fun' based on their importance.")
-tfidf_vectorizer = TfidfVectorizer()
-X_tfidf = tfidf_vectorizer.fit_transform(texts)
-st.write("Feature Names:", tfidf_vectorizer.get_feature_names_out())
-st.write("TF-IDF Representation:")
-st.write(X_tfidf.toarray())
-# Word Embeddings
-st.subheader("Word Embeddings")
-st.write("Converts words into dense vectors capturing semantic relationships.")
-st.write("Common models include Word2Vec, GloVe, and FastText.")
-st.write("Example: 'king' - 'man' + 'woman' ≈ 'queen'.")
-model = Word2Vec(sentences=[text.split() for text in texts], vector_size=100, window=5, min_count=1, workers=4)
-word_vectors = model.wv
-st.write("Word2Vec Representation of 'Natural':")
-st.write(word_vectors['natural'])
-# n-grams
-st.subheader("n-grams")
-st.write("Captures sequences of n words to consider word order.")
-st.write("Example: For n=2 (bigrams), 'I love NLP' becomes ['I love', 'love NLP'].")
-bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
-X_bigrams = bigram_vectorizer.fit_transform(texts)
-st.write("Feature Names:", bigram_vectorizer.get_feature_names_out())
-st.write("n-grams Representation:")
-st.write(X_bigrams.toarray())
-# One-Hot Encoding
-st.subheader("One-Hot Encoding")
-st.write("Represents words as binary vectors where each word is a unique vector with a single high (1) value.")
-st.write("Example: 'I love NLP' with a vocabulary of 4 words could be ['I': [1, 0, 0, 0], 'love': [0, 1, 0, 0], 'NLP': [0, 0, 1, 0]].")
-one_hot_vectorizer = CountVectorizer(binary=True)
-X_one_hot = one_hot_vectorizer.fit_transform(texts)
-st.write("Feature Names:", one_hot_vectorizer.get_feature_names_out())
-st.write("One-Hot Encoding Representation:")
-st.write(X_one_hot.toarray())
 # Running the Streamlit app
 if __name__ == "__main__":

 # Section: Text Representation Methods
 st.header("Text Representation Methods")
+methods = [
+    "Bag of Words",
+    "TF-IDF",
+    "Word Embeddings (Word2Vec)",
+    "n-grams",
+    "One-Hot Encoding"
+]
+selected_method = st.radio("Select a text representation method:", methods)
 # Sample Texts
 texts = [
     "The field of NLP is growing rapidly."
 ]
+if selected_method == "Bag of Words":  # only the branch matching the radio selection renders on a given run
+    st.subheader("Bag of Words (BoW)")
+    st.write("Represents text as a collection of word counts, ignoring grammar and word order.")
+    st.write("Example: The sentence 'I love NLP' becomes {'I': 1, 'love': 1, 'NLP': 1}.")
+    vectorizer = CountVectorizer()  # default settings: one count column per vocabulary term
+    X_bow = vectorizer.fit_transform(texts)
+    st.write("Feature Names:", vectorizer.get_feature_names_out())
+    st.write("Bag of Words Representation:")
+    st.write(X_bow.toarray())  # densify the sparse matrix so it displays as a plain array
+elif selected_method == "TF-IDF":
+    st.subheader("Term Frequency-Inverse Document Frequency (TF-IDF)")
+    st.write("Enhances BoW by weighing words based on their frequency in a document and across all documents.")
+    st.write("Example: 'Machine learning is fun' might give different weights to 'machine,' 'learning,' and 'fun' based on their importance.")
+    tfidf_vectorizer = TfidfVectorizer()
+    X_tfidf = tfidf_vectorizer.fit_transform(texts)
+    st.write("Feature Names:", tfidf_vectorizer.get_feature_names_out())
+    st.write("TF-IDF Representation:")
+    st.write(X_tfidf.toarray())
+elif selected_method == "Word Embeddings (Word2Vec)":
+    st.subheader("Word Embeddings")
+    st.write("Converts words into dense vectors capturing semantic relationships.")
+    st.write("Common models include Word2Vec, GloVe, and FastText.")
+    st.write("Example: 'king' - 'man' + 'woman' ≈ 'queen'.")
+    model = Word2Vec(sentences=[text.split() for text in texts], vector_size=100, window=5, min_count=1, workers=4)  # tokens are whitespace splits; case and punctuation are kept as-is
+    word_vectors = model.wv
+    word = 'natural'  # NOTE(review): lowercase lookup against case-preserving tokens - confirm a sample text contains 'natural' exactly
+    if word in word_vectors:  # avoid indexing a word that is absent from the trained vocabulary
+        st.write(f"Word2Vec Representation of '{word}':")
+        st.write(word_vectors[word])
+    else:
+        st.write(f"Word '{word}' not found in the vocabulary.")
+elif selected_method == "n-grams":
+    st.subheader("n-grams")
+    st.write("Captures sequences of n words to consider word order.")
+    st.write("Example: For n=2 (bigrams), 'I love NLP' becomes ['I love', 'love NLP'].")
+    bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))  # (2, 2) keeps bigrams only, no unigrams
+    X_bigrams = bigram_vectorizer.fit_transform(texts)
+    st.write("Feature Names:", bigram_vectorizer.get_feature_names_out())
+    st.write("n-grams Representation:")
+    st.write(X_bigrams.toarray())
+elif selected_method == "One-Hot Encoding":
+    st.subheader("One-Hot Encoding")
+    st.write("Represents words as binary vectors where each word is a unique vector with a single high (1) value.")
+    st.write("Example: 'I love NLP' with a vocabulary of 4 words could be ['I': [1, 0, 0, 0], 'love': [0, 1, 0, 0], 'NLP': [0, 0, 1, 0]].")
+    one_hot_vectorizer = CountVectorizer(binary=True)  # binary=True sets every non-zero count to 1; rows are per document, not per word
+    X_one_hot = one_hot_vectorizer.fit_transform(texts)
+    st.write("Feature Names:", one_hot_vectorizer.get_feature_names_out())
+    st.write("One-Hot Encoding Representation:")
+    st.write(X_one_hot.toarray())
 # Running the Streamlit app
 if __name__ == "__main__":