import streamlit as st import pandas as pd import plotly.express as px import plotly.graph_objects as go import numpy as np import io import base64 import re from datetime import datetime # Page configuration st.set_page_config( page_title="Data Analysis Dashboard", page_icon="📊", layout="wide", initial_sidebar_state="expanded" ) # Custom CSS for better styling st.markdown(""" """, unsafe_allow_html=True) def convert_brazilian_number(value): """Convert Brazilian number format (xx.xxx.xxx,xx) to float""" if pd.isna(value) or value == '': return np.nan # Convert to string if not already str_value = str(value).strip() # Check if it's already a number try: return float(str_value) except ValueError: pass # Brazilian number pattern: can have dots as thousand separators and comma as decimal # Examples: "1.234.567,89", "1.234,56", "1234,56", "1234" brazilian_pattern = r'^-?\d{1,3}(?:\.\d{3})*(?:,\d+)?$' if re.match(brazilian_pattern, str_value): # Remove thousand separators (dots) and replace decimal comma with dot converted = str_value.replace('.', '').replace(',', '.') try: return float(converted) except ValueError: return np.nan return np.nan def detect_and_convert_brazilian_numbers(df): """Detect and convert Brazilian number format columns to numeric""" converted_columns = [] df_converted = df.copy() for col in df.columns: if df[col].dtype == 'object': # Only check string columns # Sample some non-null values to check if they look like Brazilian numbers sample_values = df[col].dropna().astype(str).head(10) if len(sample_values) > 0: # Check if most values match Brazilian number pattern brazilian_count = 0 total_count = 0 for value in sample_values: value = str(value).strip() if value and value != 'nan': total_count += 1 # Brazilian number pattern if re.match(r'^-?\d{1,3}(?:\.\d{3})*(?:,\d+)?$', value) or re.match(r'^-?\d+,\d+$', value): brazilian_count += 1 # If more than 70% of values look like Brazilian numbers, convert the column if total_count > 0 and (brazilian_count / total_count) > 0.7: converted_series = df[col].apply(convert_brazilian_number) # Only convert if we successfully converted most values non_null_original = df[col].notna().sum() non_null_converted = converted_series.notna().sum() if non_null_converted >= (non_null_original * 0.8): # At least 80% conversion success df_converted[col] = converted_series converted_columns.append(col) return df_converted, converted_columns def load_sample_data(): """Generate sample data for demonstration""" np.random.seed(42) n_samples = 1000 data = { 'Date': pd.date_range('2023-01-01', periods=n_samples, freq='D'), 'Sales': np.random.normal(1000, 200, n_samples), 'Profit': np.random.normal(150, 50, n_samples), 'Category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Home'], n_samples), 'Region': np.random.choice(['North', 'South', 'East', 'West'], n_samples), 'Customer_Age': np.random.randint(18, 80, n_samples), 'Rating': np.random.uniform(1, 5, n_samples) } df = pd.DataFrame(data) df['Sales'] = np.where(df['Sales'] < 0, abs(df['Sales']), df['Sales']) df['Profit'] = np.where(df['Category'] == 'Electronics', df['Profit'] * 1.5, df['Profit']) # Add some Brazilian formatted numbers for demonstration df['Vendas_BR'] = df['Sales'].apply(lambda x: f"{x:,.2f}".replace(',', 'X').replace('.', ',').replace('X', '.')) df['Lucro_BR'] = df['Profit'].apply(lambda x: f"{x:,.2f}".replace(',', 'X').replace('.', ',').replace('X', '.')) return df def get_numeric_columns(df): """Get numeric columns from dataframe""" return df.select_dtypes(include=[np.number]).columns.tolist() def get_categorical_columns(df): """Get categorical columns from dataframe""" return df.select_dtypes(include=['object', 'category']).columns.tolist() def create_download_link(df, filename="filtered_data.csv"): """Create download link for dataframe""" csv = df.to_csv(index=False) b64 = base64.b64encode(csv.encode()).decode() href = f'Download CSV File' return href def main(): # Header st.markdown('

📊 Data Analysis Dashboard

', unsafe_allow_html=True) # Sidebar st.sidebar.title("🔧 Controls") st.sidebar.markdown("---") # File upload section st.sidebar.subheader("📁 Data Upload") uploaded_file = st.sidebar.file_uploader( "Choose a CSV file", type="csv", help="Upload a CSV file to analyze your data" ) use_sample = st.sidebar.checkbox( "Use Sample Data", value=True if uploaded_file is None else False, help="Check this to use built-in sample data for demonstration" ) # Brazilian number conversion option convert_brazilian = st.sidebar.checkbox( "🇧🇷 Auto-convert Brazilian Numbers", value=True, help="Automatically detect and convert Brazilian number format (xx.xxx.xxx,xx) to numeric" ) # Load data try: if uploaded_file is not None: df = pd.read_csv(uploaded_file) st.sidebar.success(f"✅ File uploaded successfully! ({len(df)} rows)") elif use_sample: df = load_sample_data() st.sidebar.info("📋 Using sample data") else: st.warning("Please upload a CSV file or use sample data to get started.") st.markdown(""" ### 🚀 Welcome to the Data Analysis Dashboard! This app helps you analyze and visualize your data with: - **Interactive charts** (bar, line, scatter, histogram) - **Dynamic filtering** and data exploration - **Statistical summaries** and insights - **Export capabilities** for data and visualizations - **🇧🇷 Brazilian number format support** (xx.xxx.xxx,xx) **To get started:** 1. Upload a CSV file using the sidebar, or 2. Check "Use Sample Data" to explore with demo data """) return # Apply Brazilian number conversion if enabled if convert_brazilian: df_original = df.copy() df, converted_cols = detect_and_convert_brazilian_numbers(df) if converted_cols: st.sidebar.success(f"🇧🇷 Converted {len(converted_cols)} columns from Brazilian format: {', '.join(converted_cols)}") except Exception as e: st.error(f"❌ Error loading file: {str(e)}") st.info("Please make sure your file is a valid CSV format.") return # Data preview section st.subheader("📋 Data Preview") col1, col2, col3, col4 = st.columns(4) with col1: st.metric("Total Rows", len(df)) with col2: st.metric("Total Columns", len(df.columns)) with col3: st.metric("Numeric Columns", len(get_numeric_columns(df))) with col4: st.metric("Text Columns", len(get_categorical_columns(df))) # Show data preview with st.expander("🔍 View Raw Data", expanded=False): st.dataframe(df.head(100), use_container_width=True) # Data summary with st.expander("📊 Statistical Summary", expanded=False): col1, col2 = st.columns(2) with col1: st.subheader("Numeric Columns") numeric_cols = get_numeric_columns(df) if numeric_cols: st.dataframe(df[numeric_cols].describe()) else: st.info("No numeric columns found") with col2: st.subheader("Categorical Columns") cat_cols = get_categorical_columns(df) if cat_cols: for col in cat_cols[:5]: # Show first 5 categorical columns st.write(f"**{col}:** {df[col].nunique()} unique values") if df[col].nunique() <= 10: st.write(df[col].value_counts().head()) else: st.info("No categorical columns found") # Show conversion info if Brazilian conversion was applied if convert_brazilian and 'converted_cols' in locals() and converted_cols: with st.expander("🇧🇷 Brazilian Number Conversion Details", expanded=False): st.write("**Converted Columns:**") for col in converted_cols: original_sample = df_original[col].dropna().head(3).tolist() converted_sample = df[col].dropna().head(3).tolist() st.write(f"**{col}:**") st.write(f" - Original: {original_sample}") st.write(f" - Converted: {converted_sample}") # Filtering section st.sidebar.markdown("---") st.sidebar.subheader("🔍 Data Filters") # Create a copy for filtering filtered_df = df.copy() # Numeric filters numeric_cols = get_numeric_columns(df) for col in numeric_cols: if df[col].dtype in ['int64', 'float64']: min_val = float(df[col].min()) max_val = float(df[col].max()) if min_val != max_val: selected_range = st.sidebar.slider( f"{col} Range", min_value=min_val, max_value=max_val, value=(min_val, max_val), help=f"Filter data by {col} values" ) filtered_df = filtered_df[ (filtered_df[col] >= selected_range[0]) & (filtered_df[col] <= selected_range[1]) ] # Categorical filters cat_cols = get_categorical_columns(df) for col in cat_cols: unique_values = df[col].unique().tolist() if len(unique_values) <= 50: # Only show filter for columns with reasonable number of unique values selected_values = st.sidebar.multiselect( f"Select {col}", options=unique_values, default=unique_values, help=f"Filter data by {col} categories" ) if selected_values: filtered_df = filtered_df[filtered_df[col].isin(selected_values)] # Show filtered data info if len(filtered_df) != len(df): st.sidebar.info(f"Filtered: {len(filtered_df)} of {len(df)} rows") # Visualization section st.markdown("---") st.subheader("📈 Data Visualization") # Chart type selection chart_type = st.selectbox( "Select Chart Type", ["Bar Chart", "Line Chart", "Scatter Plot", "Histogram", "Box Plot"], help="Choose the type of visualization" ) col1, col2, col3 = st.columns(3) with col1: if chart_type in ["Bar Chart", "Line Chart", "Scatter Plot", "Box Plot"]: x_column = st.selectbox( "X-axis Column", options=df.columns.tolist(), help="Select column for X-axis" ) else: x_column = st.selectbox( "Column to Analyze", options=numeric_cols, help="Select numeric column for histogram" ) with col2: if chart_type in ["Bar Chart", "Line Chart", "Scatter Plot", "Box Plot"]: y_column = st.selectbox( "Y-axis Column", options=numeric_cols, help="Select numeric column for Y-axis" ) else: y_column = None with col3: if chart_type in ["Bar Chart", "Scatter Plot", "Box Plot"]: color_column = st.selectbox( "Color/Group By (Optional)", options=[None] + cat_cols, help="Select column to group/color data" ) else: color_column = None # Create visualization if chart_type == "Bar Chart" and x_column and y_column: if x_column in cat_cols: # Aggregate data for categorical x-axis agg_df = filtered_df.groupby(x_column)[y_column].mean().reset_index() fig = px.bar( agg_df, x=x_column, y=y_column, title=f"Average {y_column} by {x_column}", color=color_column if color_column and color_column in agg_df.columns else None ) else: fig = px.bar( filtered_df, x=x_column, y=y_column, title=f"{y_column} vs {x_column}", color=color_column ) elif chart_type == "Line Chart" and x_column and y_column: fig = px.line( filtered_df, x=x_column, y=y_column, title=f"{y_column} vs {x_column}", color=color_column ) elif chart_type == "Scatter Plot" and x_column and y_column: fig = px.scatter( filtered_df, x=x_column, y=y_column, title=f"{y_column} vs {x_column}", color=color_column, size=y_column if y_column in numeric_cols else None ) elif chart_type == "Histogram" and x_column: fig = px.histogram( filtered_df, x=x_column, title=f"Distribution of {x_column}", nbins=30 ) elif chart_type == "Box Plot" and x_column and y_column: fig = px.box( filtered_df, x=x_column, y=y_column, title=f"{y_column} Distribution by {x_column}", color=color_column ) else: st.warning("Please select appropriate columns for the chosen chart type.") return # Update layout for better appearance fig.update_layout( height=500, showlegend=True, title_x=0.5, font=dict(size=12) ) # Display chart st.plotly_chart(fig, use_container_width=True) # Download section st.markdown("---") st.subheader("💾 Download Options") col1, col2 = st.columns(2) with col1: st.markdown("**Download Filtered Data**") if st.button("Generate CSV Download Link"): download_link = create_download_link(filtered_df, f"filtered_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv") st.markdown(download_link, unsafe_allow_html=True) with col2: st.markdown("**Download Chart**") if st.button("Download Chart as HTML"): html_string = fig.to_html(include_plotlyjs='cdn') st.download_button( label="Download HTML", data=html_string, file_name=f"chart_{datetime.now().strftime('%Y%m%d_%H%M%S')}.html", mime="text/html" ) # Additional insights if len(filtered_df) > 0: st.markdown("---") st.subheader("🔍 Quick Insights") col1, col2 = st.columns(2) with col1: st.markdown("**Data Overview**") st.write(f"• Total records: {len(filtered_df):,}") st.write(f"• Columns: {len(filtered_df.columns)}") if numeric_cols: st.write(f"• Numeric columns: {len(numeric_cols)}") for col in numeric_cols[:3]: mean_val = filtered_df[col].mean() st.write(f" - {col}: avg = {mean_val:.2f}") with col2: st.markdown("**Missing Data**") missing_data = filtered_df.isnull().sum() if missing_data.sum() > 0: for col, missing in missing_data.items(): if missing > 0: pct = (missing / len(filtered_df)) * 100 st.write(f"• {col}: {missing} ({pct:.1f}%)") else: st.write("✅ No missing data found") if __name__ == "__main__": main()