sree4411 commited on
Commit
e2d36f6
·
verified ·
1 Parent(s): a069442

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -0
app.py CHANGED
@@ -1,4 +1,7 @@
1
  import streamlit as st
 
 
 
2
 
3
  # Title
4
  st.title("Introduction to NLP")
@@ -57,6 +60,66 @@ elif selected_step == "Maintenance and Updates":
57
  **Maintenance and Updates**: Continuously improving models with new data and feedback from users.
58
  """)
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  # Running the Streamlit app
61
  if __name__ == "__main__":
62
  st.run()
 
1
# Third-party dependencies: Streamlit UI, sklearn vectorizers, gensim embeddings.
import streamlit as st
import numpy as np
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Page title
st.title("Introduction to NLP")
 
60
  **Maintenance and Updates**: Continuously improving models with new data and feedback from users.
61
  """)
62
 
63
# --- Section: Text Representation Methods ---
st.header("Text Representation Methods")

# Small corpus shared by every representation demo below.
texts = [
    "Natural Language Processing is fascinating.",
    "Natural Language Processing involves understanding human language.",
    "The field of NLP is growing rapidly.",
]

# --- Bag of Words ---
st.subheader("Bag of Words (BoW)")
st.write("Represents text as a collection of word counts, ignoring grammar and word order.")
st.write("Example: The sentence 'I love NLP' becomes {'I': 1, 'love': 1, 'NLP': 1}.")
bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform(texts)
st.write("Feature Names:", bow_vectorizer.get_feature_names_out())
st.write("Bag of Words Representation:")
st.write(bow_matrix.toarray())
82
+
83
# --- TF-IDF ---
st.subheader("Term Frequency-Inverse Document Frequency (TF-IDF)")
st.write("Enhances BoW by weighing words based on their frequency in a document and across all documents.")
st.write("Example: 'Machine learning is fun' might give different weights to 'machine,' 'learning,' and 'fun' based on their importance.")
# Same corpus as the BoW demo, but counts are re-weighted by inverse document frequency.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(texts)
st.write("Feature Names:", tfidf.get_feature_names_out())
st.write("TF-IDF Representation:")
st.write(tfidf_matrix.toarray())
92
+
93
# --- Word Embeddings ---
st.subheader("Word Embeddings")
st.write("Converts words into dense vectors capturing semantic relationships.")
st.write("Common models include Word2Vec, GloVe, and FastText.")
st.write("Example: 'king' - 'man' + 'woman' ≈ 'queen'.")
# Train on lower-cased tokens: gensim KeyedVectors lookups are case-sensitive,
# and the original case-preserving split stored the token 'Natural', so the
# word_vectors['natural'] lookup below raised a KeyError on every run.
model = Word2Vec(
    sentences=[text.lower().split() for text in texts],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
word_vectors = model.wv
st.write("Word2Vec Representation of 'Natural':")
st.write(word_vectors['natural'])
102
+
103
# --- n-grams ---
st.subheader("n-grams")
st.write("Captures sequences of n words to consider word order.")
st.write("Example: For n=2 (bigrams), 'I love NLP' becomes ['I love', 'love NLP'].")
# ngram_range=(2, 2) restricts the vocabulary to bigrams only.
bigram_counts = CountVectorizer(ngram_range=(2, 2))
bigram_matrix = bigram_counts.fit_transform(texts)
st.write("Feature Names:", bigram_counts.get_feature_names_out())
st.write("n-grams Representation:")
st.write(bigram_matrix.toarray())
112
+
113
# --- One-Hot Encoding ---
st.subheader("One-Hot Encoding")
st.write("Represents words as binary vectors where each word is a unique vector with a single high (1) value.")
st.write("Example: 'I love NLP' with a vocabulary of 4 words could be ['I': [1, 0, 0, 0], 'love': [0, 1, 0, 0], 'NLP': [0, 0, 1, 0]].")
# binary=True turns word counts into presence/absence indicators.
onehot_vectorizer = CountVectorizer(binary=True)
onehot_matrix = onehot_vectorizer.fit_transform(texts)
st.write("Feature Names:", onehot_vectorizer.get_feature_names_out())
st.write("One-Hot Encoding Representation:")
st.write(onehot_matrix.toarray())
122
+
123
# Running the Streamlit app.
# Streamlit has no st.run() API — the original call raised AttributeError on
# every launch, since `streamlit run app.py` executes this script top-to-bottom
# with __name__ == "__main__". No explicit entry point is needed.
if __name__ == "__main__":
    pass