Spaces:

satya11
/

Natural_Language_Processing

Sleeping

App Files Files Community

satya11 committed on Apr 7, 2025

Commit

66617ac

verified ·

1 Parent(s): a6e7558

Create 4.Simple_EDA.py

Browse files

Files changed (1) hide show

pages/4.Simple_EDA.py +263 -0

pages/4.Simple_EDA.py ADDED Viewed

	@@ -0,0 +1,263 @@

+import streamlit as st
+import pandas as pd
+import re
+import emoji
+from io import StringIO
+st.markdown("""
+    <style>
+    /* Set a soft background color */
+    body {
+        background-color: #eef2f7;
+    }
+    /* Style for main title */
+    h1 {
+        color: black;
+        font-family: 'Roboto', sans-serif;
+        font-weight: 700;
+        text-align: center;
+        margin-bottom: 25px;
+    }
+    /* Style for headers */
+    h2 {
+        color: black;
+        font-family: 'Roboto', sans-serif;
+        font-weight: 600;
+        margin-top: 30px;
+    }
+    /* Style for subheaders */
+     h3 {
+        color: red;
+        font-family: 'Roboto', sans-serif;
+        font-weight: 500;
+        margin-top: 20px;
+    }
+    .custom-subheader {
+        color: black;
+        font-family: 'Roboto', sans-serif;
+        font-weight: 600;
+        margin-bottom: 15px;
+    }
+    /* Paragraph styling */
+    p {
+        font-family: 'Georgia', serif;
+        line-height: 1.8;
+        color: black;
+        margin-bottom: 20px;
+    }
+    /* List styling with checkmark bullets */
+    .icon-bullet {
+        list-style-type: none;
+        padding-left: 20px;
+    }
+    .icon-bullet li {
+        font-family: 'Georgia', serif;
+        font-size: 1.1em;
+        margin-bottom: 10px;
+        color: black;
+    }
+    .icon-bullet li::before {
+        content: "◆";
+        padding-right: 10px;
+        color: black;
+    }
+    /* Sidebar styling */
+    .sidebar .sidebar-content {
+        background-color: #ffffff;
+        border-radius: 10px;
+        padding: 15px;
+    }
+    .sidebar h2 {
+        color: #495057;
+    }
+    /* Custom button style */
+    .streamlit-button {
+        background-color: #00FFFF;
+        color: #000000;
+        font-weight: bold;
+    }
+    .eda-result {
+        background-color: #f8f9fa;
+        border-radius: 5px;
+        padding: 15px;
+        margin: 10px 0;
+        border-left: 4px solid #6c757d;
+    }
+    </style>
+    """, unsafe_allow_html=True)
+st.header(":red[📊 Advanced Text EDA Tool 💬]")
+# Introduction to Simple EDA
+st.markdown("<div class='section'>", unsafe_allow_html=True)
+st.markdown("<h2 class='title'>🔍 Comprehensive Text Analysis</h2>", unsafe_allow_html=True)
+st.markdown("<p class='subtitle'>Evaluate raw text data quality with detailed metrics</p>", unsafe_allow_html=True)
+st.info("""
+📌 **Key Benefits of Text EDA:**
+- Ensures raw data quality before processing
+- Identifies text patterns and special characters
+- Helps determine necessary preprocessing steps
+- Not dependent on specific problem statements
+""")
+st.markdown("</div>", unsafe_allow_html=True)
+# File upload section
+st.subheader(":violet[📂 Upload Your Data]")
+uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
+if uploaded_file is not None:
+    # Read the uploaded file
+    df = pd.read_csv(uploaded_file)
+    # Show dataframe
+    st.subheader("📊 Data Preview")
+    st.dataframe(df.head())
+    # Select text column
+    text_column = st.selectbox("Select the text column to analyze", df.columns)
+    # Analysis parameters
+    st.subheader("⚙️ Analysis Parameters")
+    sample_size = st.slider("Sample size (0 for full dataset)", 0, len(df), min(500, len(df)))
+    analyze_button = st.button("Run Text Analysis", type="primary")
+    if analyze_button:
+        st.subheader("📈 Analysis Results")
+        # Get sample if requested
+        if sample_size > 0:
+            df_sample = df.sample(min(sample_size, len(df)))
+        else:
+            df_sample = df.copy()
+        # Define analysis functions
+        def has_mixed_case(text):
+            return not (text.islower() or text.isupper())
+        def has_html_tags(text):
+            return bool(re.search("<.*?>", str(text)))
+        def has_urls(text):
+            return bool(re.search("https?://\S+|www\.\S+", str(text)))
+        def has_emails(text):
+            return bool(re.search("\S+@\S+", str(text)))
+        def has_mentions(text):
+            return bool(re.search("\B[@#]\S+", str(text)))
+        def has_emojis(text):
+            return emoji.emoji_count(str(text)) > 0
+        def has_digits(text):
+            return bool(re.search("\d", str(text)))
+        def has_punctuation(text):
+            return bool(re.search('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]', str(text)))
+        def has_dates(text):
+            return bool(re.search(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b", str(text)))
+        # Calculate metrics
+        results = {
+            "Mixed Case": df_sample[text_column].apply(has_mixed_case).sum(),
+            "HTML Tags": df_sample[text_column].apply(has_html_tags).sum(),
+            "URLs": df_sample[text_column].apply(has_urls).sum(),
+            "Email Addresses": df_sample[text_column].apply(has_emails).sum(),
+            "Mentions/Hashtags": df_sample[text_column].apply(has_mentions).sum(),
+            "Emojis": df_sample[text_column].apply(has_emojis).sum(),
+            "Digits": df_sample[text_column].apply(has_digits).sum(),
+            "Punctuation": df_sample[text_column].apply(has_punctuation).sum(),
+            "Date Formats": df_sample[text_column].apply(has_dates).sum()
+        }
+        # Display results
+        total_texts = len(df_sample)
+        for feature, count in results.items():
+            percentage = (count / total_texts) * 100
+            st.markdown(f"""
+            <div class="eda-result">
+                <h4>{feature}</h4>
+                <p><strong>{count}</strong> texts contain this feature ({percentage:.1f}% of sample)</p>
+            </div>
+            """, unsafe_allow_html=True)
+        # Show sample examples
+        st.subheader("🔍 Sample Examples")
+        for feature, count in results.items():
+            if count > 0:
+                st.write(f"**Examples with {feature}:**")
+                examples = df_sample[df_sample[text_column].apply(locals()[f"has_{feature.lower().replace(' ', '_').replace('/', '_')}"])][text_column].head(3).tolist()
+                for example in examples:
+                    st.code(example, language='text')
+                st.write("")
+else:
+    st.subheader(":violet[📃 Text Analysis Features]")
+    st.markdown("""
+    ✅ **Check Text Case** – Identify if text is in lowercase, uppercase, or mixed case
+    ✅ **Detect HTML & URL Tags** – Analyze if text contains unwanted elements
+    ✅ **Identify URLs** – Find web links in the text
+    ✅ **Detect Email Addresses** – Locate email patterns
+    ✅ **Find Mentions & Hashtags** – Identify @mentions or #hashtags
+    ✅ **Analyze Emoji Usage** – Count emoji occurrences
+    ✅ **Identify Numeric Data** – Detect digits or numerical data
+    ✅ **Check Punctuation** – Analyze punctuation usage
+    ✅ **Find Date Formats** – Identify date/time patterns
+    """)
+    st.success("🚀 Upload a CSV file to begin your text analysis!")
+# Code display section
+st.subheader(":violet[💻 Analysis Code]")
+st.code('''
+import pandas as pd
+import re
+import emoji
+def text_analysis(data, text_column):
+    """Comprehensive text analysis function"""
+    results = {}
+    # Case analysis
+    results['mixed_case'] = data[text_column].apply(
+        lambda x: not (str(x).islower() or str(x).isupper())
+    ).sum()
+    # Special patterns
+    patterns = {
+        'html_tags': r"<.*?>",
+        'urls': r"https?://\S+|www\.\S+",
+        'emails': r"\S+@\S+",
+        'mentions': r"\B[@#]\S+",
+        'digits': r"\d",
+        'punctuation': r'[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]',
+        'dates': r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b"
+    }
+    for name, pattern in patterns.items():
+        results[name] = data[text_column].apply(
+            lambda x: bool(re.search(pattern, str(x)))
+        ).sum()
+    # Emoji analysis
+    results['emojis'] = data[text_column].apply(
+        lambda x: emoji.emoji_count(str(x)) > 0
+    ).sum()
+    return results
+''', language='python')
+st.markdown("""
+### How to Use This Analysis:
+1. **Upload** your CSV file containing text data
+2. **Select** the text column to analyze
+3. **Choose** a sample size (or use full dataset)
+4. **Run** the analysis to get detailed metrics
+5. **Review** the results to determine necessary preprocessing steps
+""")