import streamlit as st
import pandas as pd
# Page title. (An empty st.markdown("""...""", unsafe_allow_html=True) call
# preceded this originally; it rendered nothing, so it has been removed.)
st.title("Text Data Quality Analysis")
# Introduction section.
# The heading is now a real markdown header (it previously rendered as a plain
# paragraph), and unsafe_allow_html is dropped: the content is pure text with
# no HTML, so enabling raw HTML was an unnecessary injection surface.
st.markdown("""
### Understanding Text Data Quality Analysis

Evaluating raw text data quality before processing is a critical first step in any text analysis project.
""")
st.markdown("""
Performing thorough text data quality analysis ensures structured and high-quality text data, leading to better analysis and model performance.
""")
# Code example intro.
# "Implementation Example" is now a real markdown header, and unsafe_allow_html
# is dropped because the content contains no HTML.
st.markdown("""
### Implementation Example

Here's a Python function to perform basic text data quality checks:
""")
# Displayed example code. Fixes over the original:
# - The function body is now indented, so the shown snippet is valid Python
#   (previously every line was at column 0 and the example would not run).
# - The outer literal is a raw string, so regex backslashes in the snippet no
#   longer need double-escaping and render exactly as written.
# - Regex patterns in the snippet are raw strings (avoids invalid-escape
#   warnings such as '\S' in non-raw strings).
# - The punctuation check uses re.escape(string.punctuation): the original
#   hand-written character class contained an unescaped ']' that closed the
#   class early, silently undercounting punctuation.
st.code(r'''
import re
import string

import pandas as pd


def text_quality_analysis(data, column):
    """Run basic quality checks on a text column.

    Returns a DataFrame of counts: how many rows contain each element
    (case variants, HTML tags, URLs, emails, mentions, hashtags, digits,
    punctuation, simple date formats).
    """
    results = {}

    # Check for case variations
    results['has_lowercase'] = data[column].str.contains('[a-z]').sum()
    results['has_uppercase'] = data[column].str.contains('[A-Z]').sum()

    # Check for HTML tags
    results['has_html_tags'] = data[column].str.contains('<.*?>', regex=True).sum()

    # Check for URLs
    results['has_urls'] = data[column].str.contains(r'https?://\S+', regex=True).sum()

    # Check for email addresses
    results['has_emails'] = data[column].str.contains(r'\S+@\S+', regex=True).sum()

    # Check for mentions and hashtags
    results['has_mentions'] = data[column].str.contains(r'@\w+', regex=True).sum()
    results['has_hashtags'] = data[column].str.contains(r'#\w+', regex=True).sum()

    # Check for digits
    results['has_digits'] = data[column].str.contains(r'\d', regex=True).sum()

    # Check for punctuation (re.escape builds a safe character class)
    punct_pattern = '[' + re.escape(string.punctuation) + ']'
    results['has_punctuation'] = data[column].str.contains(punct_pattern, regex=True).sum()

    # Check for date formats (simple check, e.g. 12/31/2024)
    results['has_dates'] = data[column].str.contains(r'\d{1,2}/\d{1,2}/\d{2,4}', regex=True).sum()

    return pd.DataFrame.from_dict(results, orient='index', columns=['Count'])
''', language='python')
# Closing explanation. unsafe_allow_html dropped: the text contains no HTML,
# so allowing raw HTML rendering here was unnecessary.
st.markdown("""
This function provides a comprehensive analysis of text data quality by checking for various common elements that might need special handling during preprocessing.
The results can help guide your data cleaning strategy based on the specific characteristics of your text data.
""")