# Captured from Spaces: guifav / data_analytics (status: Sleeping). File size: 17,490 Bytes
import base64
import html
import io
import re
from datetime import datetime

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st

# Page configuration: browser-tab title/icon, wide layout, sidebar open on load.
# NOTE(review): Streamlit has historically required set_page_config to be the
# first st.* call in a script — keep it ahead of any other Streamlit command.
st.set_page_config(
    page_title="Data Analysis Dashboard",
    page_icon="📊",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS injected as raw HTML (hence unsafe_allow_html=True).
# `.main-header` is the class applied to the <h1> rendered at the top of main();
# `.metric-container` and `.upload-section` are defined here but are not
# referenced by any markup in the visible part of this file.
st.markdown("""
<style>
    .main-header {
        font-size: 2.5rem;
        font-weight: bold;
        color: #1f77b4;
        text-align: center;
        margin-bottom: 2rem;
    }
    .metric-container {
        background-color: #f0f2f6;
        padding: 1rem;
        border-radius: 0.5rem;
        margin: 0.5rem 0;
    }
    .stSelectbox > div > div {
        background-color: white;
    }
    .upload-section {
        border: 2px dashed #cccccc;
        border-radius: 10px;
        padding: 2rem;
        text-align: center;
        margin: 1rem 0;
    }
</style>
""", unsafe_allow_html=True)

# Optional minus sign, then EITHER dot-grouped thousands ("1.234.567") OR a plain
# digit run of any length ("1234"), optionally followed by a comma decimal part.
# The plain-digit alternative is what lets ungrouped values like "1234,56" parse.
_BRAZILIAN_NUMBER_RE = re.compile(r'^-?(?:\d{1,3}(?:\.\d{3})+|\d+)(?:,\d+)?$')

def convert_brazilian_number(value):
    """Convert a Brazilian-formatted number (xx.xxx.xxx,xx) to a float.

    Args:
        value: Any scalar. Missing values (as judged by ``pd.isna``) and the
            empty string are treated as "no value".

    Returns:
        The parsed ``float``, or ``np.nan`` when ``value`` is missing or does
        not look like a number.

    Accepted text forms (after stripping surrounding whitespace):
        * anything Python's ``float()`` already understands ("1234", "3.14");
        * dot-grouped thousands with optional comma decimals
          ("1.234.567,89", "1.234,56");
        * ungrouped digits with comma decimals ("1234,56", "0,5").

    Note:
        ``float()`` is tried first, so an ambiguous string such as "1.234" is
        read as 1.234, not as one thousand two hundred thirty-four.
    """
    if pd.isna(value) or value == '':
        return np.nan

    text = str(value).strip()

    # Values that are already plain numbers need no locale handling.
    try:
        return float(text)
    except ValueError:
        pass

    if _BRAZILIAN_NUMBER_RE.match(text) is None:
        return np.nan

    # Drop thousand separators, then turn the decimal comma into a dot.
    normalized = text.replace('.', '').replace(',', '.')
    try:
        return float(normalized)
    except ValueError:
        return np.nan

def detect_and_convert_brazilian_numbers(df):
    """Find object-dtype columns holding Brazilian-formatted numbers and parse them.

    For each object column, up to ten non-null values are sampled. When more
    than 70% of the sample matches a Brazilian number pattern, the whole
    column is run through ``convert_brazilian_number``; the parsed column is
    kept only if it retains at least 80% of the original non-null count.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A ``(converted_df, converted_columns)`` tuple: a copy of ``df`` with
        the qualifying columns replaced by their numeric versions, and the
        list of column names that were replaced.
    """
    grouped_form = re.compile(r'^-?\d{1,3}(?:\.\d{3})*(?:,\d+)?$')
    plain_decimal_form = re.compile(r'^-?\d+,\d+$')

    result = df.copy()
    changed = []

    for name in df.columns:
        column = df[name]
        # Only string-like (object) columns are candidates.
        if column.dtype != 'object':
            continue

        # Inspect a small sample of the non-null values, ignoring blanks.
        stripped = [text.strip() for text in column.dropna().astype(str).head(10)]
        candidates = [text for text in stripped if text and text != 'nan']
        if not candidates:
            continue

        hits = sum(
            1 for text in candidates
            if grouped_form.match(text) or plain_decimal_form.match(text)
        )
        # Require a clear majority (> 70%) of number-looking values.
        if hits / len(candidates) <= 0.7:
            continue

        parsed = column.apply(convert_brazilian_number)

        # Keep the conversion only when at least 80% of the values survived it.
        if parsed.notna().sum() >= column.notna().sum() * 0.8:
            result[name] = parsed
            changed.append(name)

    return result, changed

def load_sample_data(n_samples=1000, seed=42):
    """Generate a reproducible synthetic sales dataset for demonstration.

    Args:
        n_samples: Number of rows (one per consecutive day starting
            2023-01-01). Defaults to 1000.
        seed: Seed for the private random generator. Defaults to 42.

    Returns:
        A DataFrame with columns ``Date``, ``Sales``, ``Profit``, ``Category``,
        ``Region``, ``Customer_Age``, ``Rating``, plus ``Vendas_BR`` and
        ``Lucro_BR``, which are ``Sales`` and ``Profit`` rendered as
        Brazilian-formatted strings (dot thousands, comma decimals).
    """
    # A private legacy RandomState produces the same stream that
    # np.random.seed(seed) + np.random.* would, but leaves the process-wide
    # NumPy generator untouched for the rest of the application.
    rng = np.random.RandomState(seed)

    # Draw order matters for reproducibility; keep the columns in this order.
    data = {
        'Date': pd.date_range('2023-01-01', periods=n_samples, freq='D'),
        'Sales': rng.normal(1000, 200, n_samples),
        'Profit': rng.normal(150, 50, n_samples),
        'Category': rng.choice(['Electronics', 'Clothing', 'Books', 'Home'], n_samples),
        'Region': rng.choice(['North', 'South', 'East', 'West'], n_samples),
        'Customer_Age': rng.randint(18, 80, n_samples),
        'Rating': rng.uniform(1, 5, n_samples)
    }

    df = pd.DataFrame(data)
    # Sales are never negative; Electronics gets a 50% profit boost.
    df['Sales'] = df['Sales'].abs()
    df['Profit'] = np.where(df['Category'] == 'Electronics', df['Profit'] * 1.5, df['Profit'])

    # "1,234.56" -> "1.234,56": swap the two separators in a single pass.
    swap_separators = str.maketrans({',': '.', '.': ','})

    def to_brazilian(number):
        return f"{number:,.2f}".translate(swap_separators)

    # Brazilian-formatted text copies, used to demonstrate auto-conversion.
    df['Vendas_BR'] = df['Sales'].apply(to_brazilian)
    df['Lucro_BR'] = df['Profit'].apply(to_brazilian)

    return df

def get_numeric_columns(df):
    """Return the names of every numeric-dtype column in ``df`` as a list."""
    numeric_part = df.select_dtypes(include=np.number)
    return numeric_part.columns.tolist()

def get_categorical_columns(df):
    """Return the names of every object- or category-dtype column in ``df`` as a list."""
    wanted_dtypes = ['object', 'category']
    text_like_part = df.select_dtypes(include=wanted_dtypes)
    return text_like_part.columns.tolist()

def create_download_link(df, filename="filtered_data.csv"):
    """Build an HTML anchor that downloads ``df`` as a CSV file.

    The CSV (written without the index) is embedded in the link itself as a
    base64 ``data:`` URL, so no server-side file is created.

    Args:
        df: DataFrame to export.
        filename: Name suggested to the browser for the downloaded file.

    Returns:
        An ``<a>`` element as a string, intended to be rendered as raw HTML.
    """
    csv_bytes = df.to_csv(index=False).encode("utf-8")
    b64 = base64.b64encode(csv_bytes).decode("ascii")
    # The filename lands inside a double-quoted attribute; escape it so a quote
    # or angle bracket in the name cannot break out of the attribute.
    safe_name = html.escape(filename, quote=True)
    # text/csv is the registered media type for CSV; declare the encoding used above.
    return (
        f'<a href="data:text/csv;charset=utf-8;base64,{b64}" '
        f'download="{safe_name}">Download CSV File</a>'
    )

def _apply_sidebar_filters(df, numeric_cols, cat_cols):
    """Render the sidebar filter widgets and return the rows of ``df`` that pass them."""
    st.sidebar.markdown("---")
    st.sidebar.subheader("🔍 Data Filters")

    filtered_df = df.copy()

    # Numeric filters: one range slider per int64/float64 column.
    for col in numeric_cols:
        if df[col].dtype not in ['int64', 'float64']:
            continue

        min_val = float(df[col].min())
        max_val = float(df[col].max())

        # An all-NaN or infinite column has no usable slider bounds, and a
        # constant column has nothing to filter.
        if not (np.isfinite(min_val) and np.isfinite(max_val)) or min_val == max_val:
            continue

        low, high = st.sidebar.slider(
            f"{col} Range",
            min_value=min_val,
            max_value=max_val,
            value=(min_val, max_val),
            help=f"Filter data by {col} values"
        )
        keep = filtered_df[col].between(low, high)
        # While the slider still spans the full range the user has not asked
        # for any filtering, so rows with a missing value must not be dropped
        # (NaN fails every comparison and would otherwise vanish silently).
        if low <= min_val and high >= max_val:
            keep |= filtered_df[col].isna()
        filtered_df = filtered_df[keep]

    # Categorical filters: a multiselect per column with at most 50 distinct values.
    for col in cat_cols:
        unique_values = df[col].unique().tolist()
        if len(unique_values) > 50:
            continue
        selected_values = st.sidebar.multiselect(
            f"Select {col}",
            options=unique_values,
            default=unique_values,
            help=f"Filter data by {col} categories"
        )
        # An empty selection is treated as "no filter" rather than "no rows".
        if selected_values:
            filtered_df = filtered_df[filtered_df[col].isin(selected_values)]

    if len(filtered_df) != len(df):
        st.sidebar.info(f"Filtered: {len(filtered_df)} of {len(df)} rows")

    return filtered_df

def _build_figure(chart_type, data, x_column, y_column, color_column, cat_cols):
    """Return the Plotly figure for ``chart_type``, or None when required columns are missing."""
    if not x_column:
        return None

    if chart_type == "Histogram":
        return px.histogram(
            data,
            x=x_column,
            title=f"Distribution of {x_column}",
            nbins=30
        )

    # Every remaining chart type needs a Y column as well.
    if not y_column:
        return None

    if chart_type == "Bar Chart":
        if x_column in cat_cols:
            # Aggregate data for categorical x-axis
            agg_df = data.groupby(x_column)[y_column].mean().reset_index()
            # The aggregate only holds x and y, so a colour column can be
            # honoured only when it is one of those.
            return px.bar(
                agg_df,
                x=x_column,
                y=y_column,
                title=f"Average {y_column} by {x_column}",
                color=color_column if color_column and color_column in agg_df.columns else None
            )
        return px.bar(
            data,
            x=x_column,
            y=y_column,
            title=f"{y_column} vs {x_column}",
            color=color_column
        )

    if chart_type == "Line Chart":
        return px.line(
            data,
            x=x_column,
            y=y_column,
            title=f"{y_column} vs {x_column}",
            color=color_column
        )

    if chart_type == "Scatter Plot":
        # Plotly rejects negative or missing marker sizes, so size the markers
        # by Y only when every Y value is a non-negative number.
        size_column = y_column if data[y_column].ge(0).all() else None
        return px.scatter(
            data,
            x=x_column,
            y=y_column,
            title=f"{y_column} vs {x_column}",
            color=color_column,
            size=size_column
        )

    if chart_type == "Box Plot":
        return px.box(
            data,
            x=x_column,
            y=y_column,
            title=f"{y_column} Distribution by {x_column}",
            color=color_column
        )

    return None

def main():
    """Render the dashboard: load data, preview, filter, chart, export, insights.

    Data comes from an uploaded CSV or, failing that, from the built-in sample
    set. Optionally, Brazilian-formatted numeric text columns are converted to
    numbers. The function returns early (after showing a message) when no data
    source is selected, when loading fails, or when the chart selectors do not
    yield a drawable chart.
    """
    # Header
    st.markdown('<h1 class="main-header">📊 Data Analysis Dashboard</h1>', unsafe_allow_html=True)

    # Sidebar
    st.sidebar.title("🔧 Controls")
    st.sidebar.markdown("---")

    # File upload section
    st.sidebar.subheader("📁 Data Upload")
    uploaded_file = st.sidebar.file_uploader(
        "Choose a CSV file",
        type="csv",
        help="Upload a CSV file to analyze your data"
    )

    use_sample = st.sidebar.checkbox(
        "Use Sample Data",
        value=uploaded_file is None,
        help="Check this to use built-in sample data for demonstration"
    )

    # Brazilian number conversion option
    convert_brazilian = st.sidebar.checkbox(
        "🇧🇷 Auto-convert Brazilian Numbers",
        value=True,
        help="Automatically detect and convert Brazilian number format (xx.xxx.xxx,xx) to numeric"
    )

    # Defined up front so later sections can rely on them whether or not
    # conversion ran.
    converted_cols = []
    df_original = None

    # Load data
    try:
        if uploaded_file is not None:
            df = pd.read_csv(uploaded_file)
            st.sidebar.success(f"✅ File uploaded successfully! ({len(df)} rows)")
        elif use_sample:
            df = load_sample_data()
            st.sidebar.info("📋 Using sample data")
        else:
            st.warning("Please upload a CSV file or use sample data to get started.")
            st.markdown("""
            ### 🚀 Welcome to the Data Analysis Dashboard!
            
            This app helps you analyze and visualize your data with:
            - **Interactive charts** (bar, line, scatter, histogram)
            - **Dynamic filtering** and data exploration
            - **Statistical summaries** and insights
            - **Export capabilities** for data and visualizations
            - **🇧🇷 Brazilian number format support** (xx.xxx.xxx,xx)
            
            **To get started:**
            1. Upload a CSV file using the sidebar, or
            2. Check "Use Sample Data" to explore with demo data
            """)
            return

        # Apply Brazilian number conversion if enabled
        if convert_brazilian:
            # The detector returns a new frame and leaves its input untouched,
            # so the pre-conversion data stays available for the details panel.
            df_original = df
            df, converted_cols = detect_and_convert_brazilian_numbers(df)

            if converted_cols:
                st.sidebar.success(f"🇧🇷 Converted {len(converted_cols)} columns from Brazilian format: {', '.join(converted_cols)}")

    except Exception as e:
        # Top-level UI boundary: report the problem instead of crashing the page.
        st.error(f"❌ Error loading file: {str(e)}")
        st.info("Please make sure your file is a valid CSV format.")
        return

    numeric_cols = get_numeric_columns(df)
    cat_cols = get_categorical_columns(df)

    # Data preview section
    st.subheader("📋 Data Preview")

    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Total Rows", len(df))
    with col2:
        st.metric("Total Columns", len(df.columns))
    with col3:
        st.metric("Numeric Columns", len(numeric_cols))
    with col4:
        st.metric("Text Columns", len(cat_cols))

    # Show data preview (first 100 rows only)
    with st.expander("🔍 View Raw Data", expanded=False):
        st.dataframe(df.head(100), use_container_width=True)

    # Data summary
    with st.expander("📊 Statistical Summary", expanded=False):
        col1, col2 = st.columns(2)

        with col1:
            st.subheader("Numeric Columns")
            if numeric_cols:
                st.dataframe(df[numeric_cols].describe())
            else:
                st.info("No numeric columns found")

        with col2:
            st.subheader("Categorical Columns")
            if cat_cols:
                for col in cat_cols[:5]:  # Show first 5 categorical columns
                    st.write(f"**{col}:** {df[col].nunique()} unique values")
                    if df[col].nunique() <= 10:
                        st.write(df[col].value_counts().head())
            else:
                st.info("No categorical columns found")

    # Show conversion info if Brazilian conversion was applied
    if converted_cols:
        with st.expander("🇧🇷 Brazilian Number Conversion Details", expanded=False):
            st.write("**Converted Columns:**")
            for col in converted_cols:
                original_sample = df_original[col].dropna().head(3).tolist()
                converted_sample = df[col].dropna().head(3).tolist()
                st.write(f"**{col}:**")
                st.write(f"  - Original: {original_sample}")
                st.write(f"  - Converted: {converted_sample}")

    # Filtering section
    filtered_df = _apply_sidebar_filters(df, numeric_cols, cat_cols)

    # Visualization section
    st.markdown("---")
    st.subheader("📈 Data Visualization")

    # Chart type selection
    chart_type = st.selectbox(
        "Select Chart Type",
        ["Bar Chart", "Line Chart", "Scatter Plot", "Histogram", "Box Plot"],
        help="Choose the type of visualization"
    )
    needs_xy = chart_type in ["Bar Chart", "Line Chart", "Scatter Plot", "Box Plot"]

    col1, col2, col3 = st.columns(3)

    with col1:
        if needs_xy:
            x_column = st.selectbox(
                "X-axis Column",
                options=df.columns.tolist(),
                help="Select column for X-axis"
            )
        else:
            x_column = st.selectbox(
                "Column to Analyze",
                options=numeric_cols,
                help="Select numeric column for histogram"
            )

    with col2:
        if needs_xy:
            y_column = st.selectbox(
                "Y-axis Column",
                options=numeric_cols,
                help="Select numeric column for Y-axis"
            )
        else:
            y_column = None

    with col3:
        if chart_type in ["Bar Chart", "Scatter Plot", "Box Plot"]:
            color_column = st.selectbox(
                "Color/Group By (Optional)",
                options=[None] + cat_cols,
                help="Select column to group/color data"
            )
        else:
            color_column = None

    # Create visualization
    fig = _build_figure(chart_type, filtered_df, x_column, y_column, color_column, cat_cols)
    if fig is None:
        st.warning("Please select appropriate columns for the chosen chart type.")
        return

    # Update layout for better appearance
    fig.update_layout(
        height=500,
        showlegend=True,
        title_x=0.5,
        font=dict(size=12)
    )

    # Display chart
    st.plotly_chart(fig, use_container_width=True)

    # Download section
    st.markdown("---")
    st.subheader("💾 Download Options")

    col1, col2 = st.columns(2)

    with col1:
        st.markdown("**Download Filtered Data**")
        if st.button("Generate CSV Download Link"):
            download_link = create_download_link(filtered_df, f"filtered_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
            st.markdown(download_link, unsafe_allow_html=True)

    with col2:
        st.markdown("**Download Chart**")
        if st.button("Download Chart as HTML"):
            # plotly.js is referenced from a CDN rather than embedded in the file.
            html_string = fig.to_html(include_plotlyjs='cdn')
            st.download_button(
                label="Download HTML",
                data=html_string,
                file_name=f"chart_{datetime.now().strftime('%Y%m%d_%H%M%S')}.html",
                mime="text/html"
            )

    # Additional insights (skipped when the filters removed every row)
    if len(filtered_df) > 0:
        st.markdown("---")
        st.subheader("🔍 Quick Insights")

        col1, col2 = st.columns(2)

        with col1:
            st.markdown("**Data Overview**")
            st.write(f"• Total records: {len(filtered_df):,}")
            st.write(f"• Columns: {len(filtered_df.columns)}")

            if numeric_cols:
                st.write(f"• Numeric columns: {len(numeric_cols)}")
                for col in numeric_cols[:3]:
                    mean_val = filtered_df[col].mean()
                    st.write(f"  - {col}: avg = {mean_val:.2f}")

        with col2:
            st.markdown("**Missing Data**")
            missing_data = filtered_df.isnull().sum()
            if missing_data.sum() > 0:
                for col, missing in missing_data.items():
                    if missing > 0:
                        pct = (missing / len(filtered_df)) * 100
                        st.write(f"• {col}: {missing} ({pct:.1f}%)")
            else:
                st.write("✅ No missing data found")

# Render the dashboard only when this file is run as the main script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()