UmaKumpatla committed on
Commit
0d2080f
·
verified ·
1 Parent(s): d82093c

Update pages/2.Terminology.py

Browse files
Files changed (1) hide show
  1. pages/2.Terminology.py +45 -0
pages/2.Terminology.py CHANGED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ # Title
4
+ st.title(":blue[🔍 NLP Terminologies]")
5
+
6
+ # Helper function to display sections
7
+ def display_section(title, description, example=None, extra=None):
8
+ st.subheader(f":green[{title}]")
9
+ st.write(description)
10
+ if example:
11
+ st.write(":red[Example]")
12
+ st.write(example)
13
+ if extra:
14
+ st.write(extra)
15
+
16
+ # NLP Terminologies
17
+ display_section("Corpus", "A collection of documents grouped together.",
18
+ "A corpus of English literature might include works by Shakespeare, Dickens, and Austen.")
19
+
20
+ display_section("Document", "A collection of sentences, paragraphs, single words, or single characters.",
21
+ "An article, a book, or an email can be considered a document.")
22
+
23
+ display_section("Paragraph", "A collection of sentences.",
24
+ "The quick brown fox jumps over the lazy dog. It was a sunny day. The fox was happy.")
25
+
26
+ display_section("Sentence", "A collection of words.",
27
+ "The quick brown fox jumps over the lazy dog.")
28
+
29
+ display_section("Word", "A collection of characters.",
30
+ "Fox is a word made up of the characters 'F', 'o', and 'x'.")
31
+
32
+ display_section("Characters", "Can be numbers, alphabets, or special symbols.",
33
+ "'A', '1', and '@' are all characters.")
34
+
35
+ display_section("Tokenization", "Tokenization is the process of breaking down text into smaller units called tokens.",
36
+ "Sentence tokenization splits text into sentences, while word tokenization splits text into words.",
37
+ ":blue[Types of Tokenization]\n- **Sentence Tokenization**: Splits text into individual sentences.\n- **Word Tokenization**: Splits text into individual words.\n- **Character Tokenization**: Splits text into individual characters.")
38
+
39
+ display_section("Stop Words", "Stop words are common words that do not contribute much to the meaning of a sentence and are often removed during text processing.",
40
+ "Words like the, we, in, am, she, and he are considered stop words.")
41
+
42
+ display_section("Vectorization", "Vectorization converts text data into a numerical format for machine learning models.",
43
+ None,
44
+ ":blue[Types of Vectorization]\n- **One-Hot Encoding**: Represents words as binary vectors.\n- **Bag-of-Words**: Counts word occurrences, disregarding grammar.\n- **TF-IDF**: Balances word frequency in a document vs. the entire corpus.\n- **Word2Vec**: Deep learning-based word embeddings.\n- **GloVe**: Uses word co-occurrence matrices.\n- **FastText**: Considers subwords for rare/misspelled words.")
45
+