Spaces:
Runtime error
Runtime error
| import streamlit as st | |
| import pandas as pd | |
# Inject the page-wide stylesheet. Streamlit has no dedicated CSS hook here, so
# the rules are sent as a raw <style> element through st.markdown;
# unsafe_allow_html=True is required or the tag would be escaped and shown as text.
# The <style> tag is kept at column 0 so Markdown treats it as an HTML block
# rather than an indented code block.
# NOTE(review): `.sidebar .sidebar-content`, `.sidebar h2` and `.streamlit-button`
# may not match the class names Streamlit actually emits, and `.custom-subheader`
# / `.icon-bullet` are not used by the markup visible in this file — verify in the
# browser inspector before relying on these rules.
st.markdown("""
<style>
/* Set a soft background color */
body {
    background-color: #eef2f7;
}
/* Style for main title */
h1 {
    color: black;
    font-family: 'Roboto', sans-serif;
    font-weight: 700;
    text-align: center;
    margin-bottom: 25px;
}
/* Style for headers */
h2 {
    color: black;
    font-family: 'Roboto', sans-serif;
    font-weight: 600;
    margin-top: 30px;
}
/* Style for subheaders */
h3 {
    color: red;
    font-family: 'Roboto', sans-serif;
    font-weight: 500;
    margin-top: 20px;
}
.custom-subheader {
    color: black;
    font-family: 'Roboto', sans-serif;
    font-weight: 600;
    margin-bottom: 15px;
}
/* Paragraph styling */
p {
    font-family: 'Georgia', serif;
    line-height: 1.8;
    color: black;
    margin-bottom: 20px;
}
/* List styling with checkmark bullets */
.icon-bullet {
    list-style-type: none;
    padding-left: 20px;
}
.icon-bullet li {
    font-family: 'Georgia', serif;
    font-size: 1.1em;
    margin-bottom: 10px;
    color: black;
}
.icon-bullet li::before {
    content: "✔";
    padding-right: 10px;
    color: black;
}
/* Sidebar styling */
.sidebar .sidebar-content {
    background-color: #ffffff;
    border-radius: 10px;
    padding: 15px;
}
.sidebar h2 {
    color: #495057;
}
/* Custom button style */
.streamlit-button {
    background-color: #00FFFF;
    color: #000000;
    font-weight: bold;
}
</style>
""", unsafe_allow_html=True)
# Page body: title, introduction panel, the list of Simple EDA steps, and a
# closing success banner.
# NOTE(review): the "π" and "β" characters in the strings below look like
# mis-decoded UTF-8 emoji/dash characters; they are reproduced as found —
# restore the intended characters from the original source file.
st.header(":red[π Simple EDA π¬]")

# Introduction to Simple EDA
# NOTE(review): the opening <div> here and the closing </div> further down are
# sent in separate st.markdown calls; presumably Streamlit renders each call as
# its own element, so the div may not actually wrap the content in between —
# confirm. The 'section', 'title' and 'subtitle' classes have no matching rule
# in the stylesheet visible in this file.
intro_fragments = (
    "<div class='section'>",
    "<h2 class='title'>π Understanding Simple EDA</h2>",
    "<p class='subtitle'>Evaluating raw text data quality before processing</p>",
)
for fragment in intro_fragments:
    st.markdown(fragment, unsafe_allow_html=True)

st.info(
    "π **Simple EDA is a crucial step in the NLP lifecycle:**\n\n"
    "β Ensures raw data quality\n\n"
    "β Not dependent on problem statement\n\n"
    "β Helps in better data exploration"
)
st.markdown("</div>", unsafe_allow_html=True)

st.subheader(":violet[π Major Simple EDA Steps]")

# One Markdown paragraph per EDA step, rendered in this order.
eda_steps = (
    "β **Check Text Case** β Identify if text is in **lowercase, uppercase, or mixed case**.",
    "β **Detect HTML & URL Tags** β Analyze if text contains unwanted elements.",
    "β **Identify URLs** β Ensure URLs are either preserved or removed based on problem statement.",
    "β **Detect Mentions & Hashtags** β Find occurrences of `@mentions` or `#hashtags`.",
    "β **Identify Numeric Data** β Detect if text includes **digits or numerical data**.",
    "β **Analyze Punctuation Usage** β Check whether punctuation marks affect text clarity.",
    "β **Detect Emojis** β Ensure **emoji-based sentiments** are not lost.",
    "β **Analyze Date/Time Formats** β Identify the presence of date/time-related text.",
)
for step in eda_steps:
    st.markdown(step)

st.success("π Performing **Simple EDA** ensures structured and high-quality text data, leading to better NLP model performance!")
# Show a reference implementation of the Simple EDA checks to the reader.
# The snippet is a raw string so the regex backslashes (\S, \d, \B, \', \\)
# reach the page exactly as written; in a normal string Python would either
# warn about them as invalid escapes or consume them, leaving the displayed
# punctuation pattern syntactically broken.
# Fixes inside the displayed snippet:
#   * indentation restored so the sample is valid Python;
#   * the case check now detects mixed case (the previous expression was
#     truthy for every non-empty string);
#   * the URL pattern accepts http:// as well as https://;
#   * the date pattern finds dates anywhere in the text instead of only
#     matching when the whole text is a date.
st.code(r'''
import pandas as pd
import numpy as np
import re
import emoji


def simple_eda(data, column):
    text = data[column]

    # Mixed case: neither entirely lowercase nor entirely uppercase
    lower_upper = text.apply(lambda x: x != x.lower() and x != x.upper()).sum()
    tags = text.apply(lambda x: bool(re.search(r"<.*?>", x))).sum()
    urls = text.apply(lambda x: bool(re.search(r"https?://\S+", x))).sum()
    mails = text.apply(lambda x: bool(re.search(r"\S+@\S+", x))).sum()
    mentions = text.apply(lambda x: bool(re.search(r"\B[@#]\S+", x))).sum()
    emojis = text.apply(lambda x: emoji.emoji_count(x) > 0).sum()
    digit = text.apply(lambda x: bool(re.search(r"\d", x))).sum()
    punc = text.apply(lambda x: bool(re.search('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]', x))).sum()
    dates = text.apply(lambda x: bool(re.search(r"\b[0-9]{1,2}/[0-9]{1,2}/[0-9]{4}\b", x))).sum()

    if lower_upper > 0:
        print("text have combination")
    if tags > 0:
        print("text have tags")
    if urls > 0:
        print("text have urls")
    if mails > 0:
        print("text have mails")
    if mentions > 0:
        print("text have mentions")
    if emojis > 0:
        print("text have emojis")
    if digit > 0:
        print("text have digit")
    if punc > 0:
        print("text have punctuations")
    if dates > 0:
        print("text have dates")
''')
# Closing remarks rendered as a Markdown bullet list beneath the code sample.
# NOTE(review): the first bullet says "following code" although the sample is
# displayed above this list — consider rewording.
closing_notes = (
    "- By the following code we will check the exploration of the data",
    "- Basically it gives the quality of collected text data",
    "- After the simple eda we will perform pre-processing on text based on problem statement after knowing quality of the data",
)
# Leading and trailing newlines match the original triple-quoted literal.
st.markdown("\n" + "\n".join(closing_notes) + "\n")