Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| #import re | |
| #import emoji | |
# Inject the page-wide stylesheet. Streamlit only passes a raw <style> element
# through when unsafe_allow_html=True is set.
# NOTE(review): `.sidebar .sidebar-content` and `.streamlit-button` are not
# selectors this file defines anywhere; confirm they match the DOM that the
# deployed Streamlit version actually renders.
st.markdown(
    """
<style>
/* Set a soft background color */
body {
background-color: #eef2f7;
}
/* Style for main title */
h1 {
color: black;
font-family: 'Roboto', sans-serif;
font-weight: 700;
text-align: center;
margin-bottom: 25px;
}
/* Style for headers */
h2 {
color: black;
font-family: 'Roboto', sans-serif;
font-weight: 600;
margin-top: 30px;
}
/* Style for subheaders */
h3 {
color: red;
font-family: 'Roboto', sans-serif;
font-weight: 500;
margin-top: 20px;
}
.custom-subheader {
color: black;
font-family: 'Roboto', sans-serif;
font-weight: 600;
margin-bottom: 15px;
}
/* Paragraph styling */
p {
font-family: 'Georgia', serif;
line-height: 1.8;
color: black;
margin-bottom: 20px;
}
/* List styling with checkmark bullets */
.icon-bullet {
list-style-type: none;
padding-left: 20px;
}
.icon-bullet li {
font-family: 'Georgia', serif;
font-size: 1.1em;
margin-bottom: 10px;
color: black;
}
.icon-bullet li::before {
content: "✔";
padding-right: 10px;
color: black;
}
/* Sidebar styling */
.sidebar .sidebar-content {
background-color: #ffffff;
border-radius: 10px;
padding: 15px;
}
.sidebar h2 {
color: #495057;
}
/* Custom button style */
.streamlit-button {
background-color: #00FFFF;
color: #000000;
font-weight: bold;
}
</style>
""",
    unsafe_allow_html=True,
)
# Page header, introduction panel, and the checklist of Simple EDA steps.
#
# NOTE(review): the emoji and dash glyphs in these strings were mis-decoded in
# the copy under review. The microscope and the check marks were restored from
# their surviving bytes and the nearby "checkmark bullets" CSS comment; the
# remaining emoji and the en-dash separators were chosen from context. Confirm
# them against the original file.
st.header(":red[📊 Simple EDA 🔬]")

# Introduction to Simple EDA
# NOTE(review): each st.markdown call is rendered as its own element, so this
# opening <div> presumably does not wrap the calls that follow it. The classes
# 'section', 'title' and 'subtitle' are also not defined in the visible CSS.
st.markdown("<div class='section'>", unsafe_allow_html=True)
st.markdown("<h2 class='title'>🔍 Understanding Simple EDA</h2>", unsafe_allow_html=True)
st.markdown(
    "<p class='subtitle'>Evaluating raw text data quality before processing</p>",
    unsafe_allow_html=True,
)
st.info(
    "📌 **Simple EDA is a crucial step in the NLP lifecycle:**\n\n"
    "✔ Ensures raw data quality\n\n"
    "✔ Not dependent on problem statement\n\n"
    "✔ Helps in better data exploration"
)
st.markdown("</div>", unsafe_allow_html=True)

st.subheader(":violet[📌 Major Simple EDA Steps]")

# One Markdown paragraph per step, each prefixed with a check mark.
EDA_STEPS = (
    "**Check Text Case** – Identify if text is in **lowercase, uppercase, or mixed case**.",
    "**Detect HTML & URL Tags** – Analyze if text contains unwanted elements.",
    "**Identify URLs** – Ensure URLs are either preserved or removed based on problem statement.",
    "**Detect Mentions & Hashtags** – Find occurrences of `@mentions` or `#hashtags`.",
    "**Identify Numeric Data** – Detect if text includes **digits or numerical data**.",
    "**Analyze Punctuation Usage** – Check whether punctuation marks affect text clarity.",
    "**Detect Emojis** – Ensure **emoji-based sentiments** are not lost.",
    "**Analyze Date/Time Formats** – Identify the presence of date/time-related text.",
)
for eda_step in EDA_STEPS:
    st.markdown(f"✔ {eda_step}")

st.success(
    "🚀 Performing **Simple EDA** ensures structured and high-quality text data, "
    "leading to better NLP model performance!"
)
# Show the reference implementation of simple_eda to the reader.
#
# The snippet is a *raw* string so the regex backslashes and quotes are
# displayed exactly as written; with a non-raw string the displayed code
# differed from the source (e.g. an escaped quote became a bare quote and broke
# the punctuation pattern for anyone copying it).
st.code(r'''
import re
import string

import emoji
import pandas as pd


def simple_eda(data, column):
    """Print which kinds of noise occur in the text column `column` of `data`."""
    # Drop missing values and coerce the rest to str so every check gets a string.
    text = data[column].dropna().astype(str)

    def count_rows(pattern):
        """Return how many rows contain at least one match of `pattern`."""
        return text.apply(lambda x: bool(re.search(pattern, x))).sum()

    # Mixed case: the row is neither entirely lowercase nor entirely uppercase.
    mixed_case = text.apply(lambda x: x != x.lower() and x != x.upper()).sum()
    emojis = text.apply(lambda x: emoji.emoji_count(x) > 0).sum()
    # Build the character class from string.punctuation instead of hand-escaping it.
    punctuation = "[" + re.escape(string.punctuation) + "]"

    findings = [
        (mixed_case, "Text has a combination of cases."),
        (count_rows(r"<.*?>"), "Text contains HTML tags."),
        (count_rows(r"https?://\S+"), "Text contains URLs."),
        (count_rows(r"\S+@\S+"), "Text contains email addresses."),
        (count_rows(r"\B[@#]\S+"), "Text contains mentions or hashtags."),
        (emojis, "Text contains emojis."),
        (count_rows(r"\d"), "Text contains digits."),
        (count_rows(punctuation), "Text contains punctuation marks."),
        # Not anchored with ^...$: a date anywhere inside the text should count.
        (count_rows(r"\b\d{1,2}/\d{1,2}/\d{4}\b"), "Text contains dates."),
    ]
    for count, message in findings:
        if count > 0:
            print(message)
''')
# Closing remarks shown beneath the code sample, rendered as a Markdown bullet
# list (one "- " item per entry, with a leading blank line).
CLOSING_NOTES = (
    "By following this code, we will check the exploration of the data.",
    "It essentially gives the quality of the collected text data.",
    "After the simple EDA, we will perform pre-processing on text based on the problem statement after knowing the quality of the data.",
)
st.markdown("\n" + "".join(f"- {note}\n" for note in CLOSING_NOTES))