Spaces:

sree4411
/

NLP

Sleeping

App Files Files Community

sree4411 committed on Jan 27, 2025

Commit

e2d36f6

verified ·

1 Parent(s): a069442

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -0

app.py CHANGED Viewed

@@ -1,4 +1,7 @@
 import streamlit as st
 # Title
 st.title("Introduction to NLP")
@@ -57,6 +60,66 @@ elif selected_step == "Maintenance and Updates":
     **Maintenance and Updates**: Continuously improving models with new data and feedback from users.
     """)
 # Running the Streamlit app
 # Streamlit exposes no `st.run()` function; calling it raised AttributeError
 # at the end of the script. The app is launched from the shell instead:
 #     streamlit run app.py

 import streamlit as st
+from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
+import numpy as np
+from gensim.models import Word2Vec
 # Title
 st.title("Introduction to NLP")
     **Maintenance and Updates**: Continuously improving models with new data and feedback from users.
     """)
+# Section: Text Representation Methods
+st.header("Text Representation Methods")
+# Sample Texts
+texts = [
+    "Natural Language Processing is fascinating.",
+    "Natural Language Processing involves understanding human language.",
+    "The field of NLP is growing rapidly."
+]
+# Bag of Words
+st.subheader("Bag of Words (BoW)")
+st.write("Represents text as a collection of word counts, ignoring grammar and word order.")
+st.write("Example: The sentence 'I love NLP' becomes {'I': 1, 'love': 1, 'NLP': 1}.")
+vectorizer = CountVectorizer()
+X_bow = vectorizer.fit_transform(texts)
+st.write("Feature Names:", vectorizer.get_feature_names_out())
+st.write("Bag of Words Representation:")
+st.write(X_bow.toarray())
+# TF-IDF
+st.subheader("Term Frequency-Inverse Document Frequency (TF-IDF)")
+st.write("Enhances BoW by weighing words based on their frequency in a document and across all documents.")
+st.write("Example: 'Machine learning is fun' might give different weights to 'machine,' 'learning,' and 'fun' based on their importance.")
+tfidf_vectorizer = TfidfVectorizer()
+X_tfidf = tfidf_vectorizer.fit_transform(texts)
+st.write("Feature Names:", tfidf_vectorizer.get_feature_names_out())
+st.write("TF-IDF Representation:")
+st.write(X_tfidf.toarray())
+# Word Embeddings
+st.subheader("Word Embeddings")
+st.write("Converts words into dense vectors capturing semantic relationships.")
+st.write("Common models include Word2Vec, GloVe, and FastText.")
+st.write("Example: 'king' - 'man' + 'woman' ≈ 'queen'.")
+model = Word2Vec(sentences=[text.split() for text in texts], vector_size=100, window=5, min_count=1, workers=4)
+word_vectors = model.wv
+st.write("Word2Vec Representation of 'Natural':")
+st.write(word_vectors['natural'])
+# n-grams
+st.subheader("n-grams")
+st.write("Captures sequences of n words to consider word order.")
+st.write("Example: For n=2 (bigrams), 'I love NLP' becomes ['I love', 'love NLP'].")
+bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
+X_bigrams = bigram_vectorizer.fit_transform(texts)
+st.write("Feature Names:", bigram_vectorizer.get_feature_names_out())
+st.write("n-grams Representation:")
+st.write(X_bigrams.toarray())
+# One-Hot Encoding
+st.subheader("One-Hot Encoding")
+st.write("Represents words as binary vectors where each word is a unique vector with a single high (1) value.")
+st.write("Example: 'I love NLP' with a vocabulary of 4 words could be ['I': [1, 0, 0, 0], 'love': [0, 1, 0, 0], 'NLP': [0, 0, 1, 0]].")
+one_hot_vectorizer = CountVectorizer(binary=True)
+X_one_hot = one_hot_vectorizer.fit_transform(texts)
+st.write("Feature Names:", one_hot_vectorizer.get_feature_names_out())
+st.write("One-Hot Encoding Representation:")
+st.write(X_one_hot.toarray())
 # Running the Streamlit app
 # Streamlit exposes no `st.run()` function; calling it raised AttributeError
 # at the end of the script. The app is launched from the shell instead:
 #     streamlit run app.py