Spaces:

satya11
/

Natural_Language_Processing

Sleeping

App Files Community

satya11 committed on Apr 7, 2025

Commit

a539a5e

verified ·

1 Parent(s): e51b653

Update pages/4.Simple EDA.py

Browse files

Files changed (1) hide show

pages/4.Simple EDA.py +139 -200

pages/4.Simple EDA.py CHANGED Viewed

@@ -1,262 +1,201 @@
 import streamlit as st
 import pandas as pd
-import re
-import emoji
-from io import StringIO
 st.markdown("""
     <style>
-    /* Set a soft background color */
     body {
-        background-color: #eef2f7;
     }
-    /* Style for main title */
     h1 {
-        color: black;
-        font-family: 'Roboto', sans-serif;
         font-weight: 700;
         text-align: center;
         margin-bottom: 25px;
     }
-    /* Style for headers */
     h2 {
-        color: black;
-        font-family: 'Roboto', sans-serif;
         font-weight: 600;
         margin-top: 30px;
     }
-    /* Style for subheaders */
-     h3 {
-        color: red;
-        font-family: 'Roboto', sans-serif;
         font-weight: 500;
         margin-top: 20px;
     }
-    .custom-subheader {
-        color: black;
-        font-family: 'Roboto', sans-serif;
-        font-weight: 600;
-        margin-bottom: 15px;
-    }
-    /* Paragraph styling */
-    p {
         font-family: 'Georgia', serif;
         line-height: 1.8;
-        color: black;
         margin-bottom: 20px;
     }
-    /* List styling with checkmark bullets */
-    .icon-bullet {
         list-style-type: none;
         padding-left: 20px;
     }
-    .icon-bullet li {
         font-family: 'Georgia', serif;
         font-size: 1.1em;
         margin-bottom: 10px;
-        color: black;
     }
-    .icon-bullet li::before {
-        content: "◆";
-        padding-right: 10px;
-        color: black;
     }
     /* Sidebar styling */
     .sidebar .sidebar-content {
         background-color: #ffffff;
-        border-radius: 10px;
         padding: 15px;
     }
-    .sidebar h2 {
-        color: #495057;
     }
-    /* Custom button style */
-    .streamlit-button {
-        background-color: #00FFFF;
-        color: #000000;
-        font-weight: bold;
     }
-    .eda-result {
-        background-color: #f8f9fa;
-        border-radius: 5px;
         padding: 15px;
-        margin: 10px 0;
-        border-left: 4px solid #6c757d;
     }
     </style>
     """, unsafe_allow_html=True)
-st.header(":red[📊 Advanced Text EDA Tool 💬]")
-# Introduction to Simple EDA
-st.markdown("<div class='section'>", unsafe_allow_html=True)
-st.markdown("<h2 class='title'>🔍 Comprehensive Text Analysis</h2>", unsafe_allow_html=True)
-st.markdown("<p class='subtitle'>Evaluate raw text data quality with detailed metrics</p>", unsafe_allow_html=True)
-st.info("""
-📌 **Key Benefits of Text EDA:**
-- Ensures raw data quality before processing
-- Identifies text patterns and special characters
-- Helps determine necessary preprocessing steps
-- Not dependent on specific problem statements
-""")
-st.markdown("</div>", unsafe_allow_html=True)
-# File upload section
-st.subheader(":violet[📂 Upload Your Data]")
-uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
-if uploaded_file is not None:
-    # Read the uploaded file
-    df = pd.read_csv(uploaded_file)
-    # Show dataframe
-    st.subheader("📊 Data Preview")
-    st.dataframe(df.head())
-    # Select text column
-    text_column = st.selectbox("Select the text column to analyze", df.columns)
-    # Analysis parameters
-    st.subheader("⚙️ Analysis Parameters")
-    sample_size = st.slider("Sample size (0 for full dataset)", 0, len(df), min(500, len(df)))
-    analyze_button = st.button("Run Text Analysis", type="primary")
-    if analyze_button:
-        st.subheader("📈 Analysis Results")
-        # Get sample if requested
-        if sample_size > 0:
-            df_sample = df.sample(min(sample_size, len(df)))
-        else:
-            df_sample = df.copy()
-        # Define analysis functions
-        def has_mixed_case(text):
-            return not (text.islower() or text.isupper())
-        def has_html_tags(text):
-            return bool(re.search("<.*?>", str(text)))
-        def has_urls(text):
-            return bool(re.search("https?://\S+|www\.\S+", str(text)))
-        def has_emails(text):
-            return bool(re.search("\S+@\S+", str(text)))
-        def has_mentions(text):
-            return bool(re.search("\B[@#]\S+", str(text)))
-        def has_emojis(text):
-            return emoji.emoji_count(str(text)) > 0
-        def has_digits(text):
-            return bool(re.search("\d", str(text)))
-        def has_punctuation(text):
-            return bool(re.search('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]', str(text)))
-        def has_dates(text):
-            return bool(re.search(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b", str(text)))
-        # Calculate metrics
-        results = {
-            "Mixed Case": df_sample[text_column].apply(has_mixed_case).sum(),
-            "HTML Tags": df_sample[text_column].apply(has_html_tags).sum(),
-            "URLs": df_sample[text_column].apply(has_urls).sum(),
-            "Email Addresses": df_sample[text_column].apply(has_emails).sum(),
-            "Mentions/Hashtags": df_sample[text_column].apply(has_mentions).sum(),
-            "Emojis": df_sample[text_column].apply(has_emojis).sum(),
-            "Digits": df_sample[text_column].apply(has_digits).sum(),
-            "Punctuation": df_sample[text_column].apply(has_punctuation).sum(),
-            "Date Formats": df_sample[text_column].apply(has_dates).sum()
-        }
-        # Display results
-        total_texts = len(df_sample)
-        for feature, count in results.items():
-            percentage = (count / total_texts) * 100
-            st.markdown(f"""
-            <div class="eda-result">
-                <h4>{feature}</h4>
-                <p><strong>{count}</strong> texts contain this feature ({percentage:.1f}% of sample)</p>
-            </div>
-            """, unsafe_allow_html=True)
-        # Show sample examples
-        st.subheader("🔍 Sample Examples")
-        for feature, count in results.items():
-            if count > 0:
-                st.write(f"**Examples with {feature}:**")
-                examples = df_sample[df_sample[text_column].apply(locals()[f"has_{feature.lower().replace(' ', '_').replace('/', '_')}"])][text_column].head(3).tolist()
-                for example in examples:
-                    st.code(example, language='text')
-                st.write("")
-else:
-    st.subheader(":violet[📃 Text Analysis Features]")
-    st.markdown("""
-    ✅ **Check Text Case** – Identify if text is in lowercase, uppercase, or mixed case
-    ✅ **Detect HTML & URL Tags** – Analyze if text contains unwanted elements
-    ✅ **Identify URLs** – Find web links in the text
-    ✅ **Detect Email Addresses** – Locate email patterns
-    ✅ **Find Mentions & Hashtags** – Identify @mentions or #hashtags
-    ✅ **Analyze Emoji Usage** – Count emoji occurrences
-    ✅ **Identify Numeric Data** – Detect digits or numerical data
-    ✅ **Check Punctuation** – Analyze punctuation usage
-    ✅ **Find Date Formats** – Identify date/time patterns
-    """)
-    st.success("🚀 Upload a CSV file to begin your text analysis!")
-# Code display section
-st.subheader(":violet[💻 Analysis Code]")
 st.code('''
-def text_analysis(data, text_column):
-    """Comprehensive text analysis function"""
     results = {}
-    # Case analysis
-    results['mixed_case'] = data[text_column].apply(
-        lambda x: not (str(x).islower() or str(x).isupper())
-    ).sum()
-    # Special patterns
-    patterns = {
-        'html_tags': r"<.*?>",
-        'urls': r"https?://\S+|www\.\S+",
-        'emails': r"\S+@\S+",
-        'mentions': r"\B[@#]\S+",
-        'digits': r"\d",
-        'punctuation': r'[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]',
-        'dates': r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b"
-    }
-    for name, pattern in patterns.items():
-        results[name] = data[text_column].apply(
-            lambda x: bool(re.search(pattern, str(x)))
-        ).sum()
-    # Emoji analysis
-    results['emojis'] = data[text_column].apply(
-        lambda x: emoji.emoji_count(str(x)) > 0
-    ).sum()
-    return results
 ''', language='python')
 st.markdown("""
-### How to Use This Analysis:
-1. **Upload** your CSV file containing text data
-2. **Select** the text column to analyze
-3. **Choose** a sample size (or use full dataset)
-4. **Run** the analysis to get detailed metrics
-5. **Review** the results to determine necessary preprocessing steps
-""")

 import streamlit as st
 import pandas as pd
 st.markdown("""
     <style>
+    /* Main background and font settings */
     body {
+        background-color: #f8f9fa;
+        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
     }
+    /* Main title styling */
     h1 {
+        color: #2c3e50;
+        font-family: 'Arial', sans-serif;
         font-weight: 700;
         text-align: center;
         margin-bottom: 25px;
+        border-bottom: 2px solid #3498db;
+        padding-bottom: 10px;
     }
+    /* Header styling */
     h2 {
+        color: #2c3e50;
+        font-family: 'Arial', sans-serif;
         font-weight: 600;
         margin-top: 30px;
+        border-left: 4px solid #3498db;
+        padding-left: 10px;
     }
+    /* Subheader styling */
+    h3 {
+        color: #2c3e50;
+        font-family: 'Arial', sans-serif;
         font-weight: 500;
         margin-top: 20px;
     }
+    /* Custom text styling */
+    .custom-text {
         font-family: 'Georgia', serif;
         line-height: 1.8;
+        color: #34495e;
         margin-bottom: 20px;
     }
+    /* List styling */
+    .custom-list {
         list-style-type: none;
         padding-left: 20px;
     }
+    .custom-list li {
         font-family: 'Georgia', serif;
         font-size: 1.1em;
         margin-bottom: 10px;
+        color: #34495e;
+        position: relative;
+        padding-left: 25px;
     }
+    .custom-list li::before {
+        content: "•";
+        color: #3498db;
+        font-weight: bold;
+        position: absolute;
+        left: 0;
     }
     /* Sidebar styling */
     .sidebar .sidebar-content {
         background-color: #ffffff;
+        border-radius: 8px;
         padding: 15px;
+        box-shadow: 0 2px 5px rgba(0,0,0,0.1);
     }
+    /* Info box styling */
+    .stInfo {
+        background-color: #e8f4fc;
+        border-left: 4px solid #3498db;
+        padding: 15px;
+        border-radius: 0 4px 4px 0;
     }
+    /* Success box styling */
+    .stSuccess {
+        background-color: #e8f8f0;
+        border-left: 4px solid #2ecc71;
+        padding: 15px;
+        border-radius: 0 4px 4px 0;
     }
+    /* Code block styling */
+    .stCodeBlock {
+        background-color: #f5f5f5;
+        border-radius: 4px;
         padding: 15px;
+        border-left: 4px solid #95a5a6;
     }
     </style>
     """, unsafe_allow_html=True)
+st.title("Text Data Quality Analysis")
+# Introduction section
+st.markdown("""
+    <div class='custom-text'>
+    <h2>Understanding Text Data Quality Analysis</h2>
+    <p>Evaluating raw text data quality before processing is a critical first step in any text analysis project.</p>
+    </div>
+    """, unsafe_allow_html=True)
+st.markdown("""
+    <div class='stInfo'>
+    <strong>Text Data Quality Analysis is crucial because:</strong><br><br>
+    • Ensures raw data quality before processing<br>
+    • Helps identify potential issues early in the pipeline<br>
+    • Provides insights for better data exploration<br>
+    • Is independent of the specific problem statement
+    </div>
+    """, unsafe_allow_html=True)
+# Main analysis steps
+st.markdown("""
+    <div class='custom-text'>
+    <h2>Key Text Data Quality Checks</h2>
+    </div>
+    """, unsafe_allow_html=True)
+st.markdown("""
+    <ul class='custom-list'>
+    <li><strong>Check Text Case</strong> – Identify if text is in lowercase, uppercase, or mixed case</li>
+    <li><strong>Detect HTML Tags</strong> – Analyze if text contains unwanted HTML elements</li>
+    <li><strong>Identify URLs</strong> – Check for web addresses that may need processing</li>
+    <li><strong>Detect Mentions & Hashtags</strong> – Find occurrences of @mentions or #hashtags</li>
+    <li><strong>Identify Numeric Data</strong> – Detect if text includes digits or numerical data</li>
+    <li><strong>Analyze Punctuation Usage</strong> – Check whether punctuation marks affect text clarity</li>
+    <li><strong>Analyze Date/Time Formats</strong> – Identify the presence of date/time-related text</li>
+    </ul>
+    """, unsafe_allow_html=True)
+st.markdown("""
+    <div class='stSuccess'>
+    Performing thorough text data quality analysis ensures structured and high-quality text data, leading to better analysis and model performance.
+    </div>
+    """, unsafe_allow_html=True)
+# Code example
+st.markdown("""
+    <div class='custom-text'>
+    <h2>Implementation Example</h2>
+    <p>Here's a Python function to perform basic text data quality checks:</p>
+    </div>
+    """, unsafe_allow_html=True)
 st.code('''
+import pandas as pd
+import re
+def text_quality_analysis(data, column):
+    # Initialize results dictionary
     results = {}
+    # Check for case variations
+    results['has_lowercase'] = data[column].str.contains('[a-z]').sum()
+    results['has_uppercase'] = data[column].str.contains('[A-Z]').sum()
+    # Check for HTML tags
+    results['has_html_tags'] = data[column].str.contains('<.*?>', regex=True).sum()
+    # Check for URLs
+    results['has_urls'] = data[column].str.contains('https?://\\S+', regex=True).sum()
+    # Check for email addresses
+    results['has_emails'] = data[column].str.contains('\\S+@\\S+', regex=True).sum()
+    # Check for mentions and hashtags
+    results['has_mentions'] = data[column].str.contains('@\\w+', regex=True).sum()
+    results['has_hashtags'] = data[column].str.contains('#\\w+', regex=True).sum()
+    # Check for digits
+    results['has_digits'] = data[column].str.contains('\\d', regex=True).sum()
+    # Check for punctuation
+    results['has_punctuation'] = data[column].str.contains('[!"#$%&\\\'()*+,-./:;<=>?@[\\\\]^_`{|}~]', regex=True).sum()
+    # Check for date formats (simple check)
+    results['has_dates'] = data[column].str.contains('\\d{1,2}/\\d{1,2}/\\d{2,4}', regex=True).sum()
+    return pd.DataFrame.from_dict(results, orient='index', columns=['Count'])
 ''', language='python')
 st.markdown("""
+    <div class='custom-text'>
+    <p>This function provides a comprehensive analysis of text data quality by checking for various common elements that might need special handling during preprocessing.</p>
+    <p>The results can help guide your data cleaning strategy based on the specific characteristics of your text data.</p>
+    </div>
+    """, unsafe_allow_html=True)