Update pages/4_Simple_EDA.py
Browse files- pages/4_Simple_EDA.py +23 -20
pages/4_Simple_EDA.py
CHANGED
|
@@ -77,27 +77,30 @@ st.markdown("""
|
|
| 77 |
</style>
|
| 78 |
""", unsafe_allow_html=True)
|
| 79 |
|
| 80 |
-
st.header(":red[Simple EDAπ¬]")
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
''
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
-
st.subheader(":violet[Major Simple EDAπ]")
|
| 88 |
-
st.markdown('''
|
| 89 |
-
- Whether all the alphabets are in
|
| 90 |
-
- lower case
|
| 91 |
-
- upper case
|
| 92 |
-
- combination of lower and upper case
|
| 93 |
-
- Whether the collected text data contains any html / url tags
|
| 94 |
-
- Whether the collected text data contains any urls
|
| 95 |
-
- Whether the collected text data contains any mentions / hashtags
|
| 96 |
-
- Whether the collected text data contains any digits
|
| 97 |
-
- Whether the collected text data contains any punctuations
|
| 98 |
-
- Whether the collected text data contains any emojis
|
| 99 |
-
- Whether the collected text data contains any data /time
|
| 100 |
-
''')
|
| 101 |
|
| 102 |
st.code('''
|
| 103 |
import pandas as pd
|
|
|
|
| 77 |
</style>
|
| 78 |
""", unsafe_allow_html=True)
|
| 79 |
|
| 80 |
+
st.header(":red[π Simple EDA π¬]")
|
| 81 |
+
|
| 82 |
+
# Introduction to Simple EDA
|
| 83 |
+
st.markdown("<div class='section'>", unsafe_allow_html=True)
|
| 84 |
+
st.markdown("<h2 class='title'>π Understanding Simple EDA</h2>", unsafe_allow_html=True)
|
| 85 |
+
st.markdown("<p class='subtitle'>Evaluating raw text data quality before processing</p>", unsafe_allow_html=True)
|
| 86 |
+
|
| 87 |
+
st.info("π **Simple EDA is a crucial step in the NLP lifecycle:**\n\n✅ Ensures raw data quality\n\n✅ Not dependent on problem statement\n\n✅ Helps in better data exploration")
|
| 88 |
+
|
| 89 |
+
st.markdown("</div>", unsafe_allow_html=True)
|
| 90 |
+
|
| 91 |
+
st.subheader(":violet[π Major Simple EDA Steps]")
|
| 92 |
+
|
| 93 |
+
st.markdown("✅ **Check Text Case** – Identify if text is in **lowercase, uppercase, or mixed case**.")
|
| 94 |
+
st.markdown("✅ **Detect HTML & URL Tags** – Analyze if text contains unwanted elements.")
|
| 95 |
+
st.markdown("✅ **Identify URLs** – Ensure URLs are either preserved or removed based on problem statement.")
|
| 96 |
+
st.markdown("✅ **Detect Mentions & Hashtags** – Find occurrences of `@mentions` or `#hashtags`.")
|
| 97 |
+
st.markdown("✅ **Identify Numeric Data** – Detect if text includes **digits or numerical data**.")
|
| 98 |
+
st.markdown("✅ **Analyze Punctuation Usage** – Check whether punctuation marks affect text clarity.")
|
| 99 |
+
st.markdown("✅ **Detect Emojis** – Ensure **emoji-based sentiments** are not lost.")
|
| 100 |
+
st.markdown("✅ **Analyze Date/Time Formats** – Identify the presence of date/time-related text.")
|
| 101 |
+
|
| 102 |
+
st.success("π Performing **Simple EDA** ensures structured and high-quality text data, leading to better NLP model performance!")
|
| 103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
st.code('''
|
| 106 |
import pandas as pd
|