import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import io
import base64
import re
from datetime import datetime
# Page configuration
st.set_page_config(
page_title="Data Analysis Dashboard",
page_icon="📊",
layout="wide",
initial_sidebar_state="expanded"
)
# Custom CSS for better styling
st.markdown("""
""", unsafe_allow_html=True)
def convert_brazilian_number(value):
"""Convert Brazilian number format (xx.xxx.xxx,xx) to float"""
if pd.isna(value) or value == '':
return np.nan
# Convert to string if not already
str_value = str(value).strip()
# Check if it's already a number
try:
return float(str_value)
except ValueError:
pass
# Brazilian number pattern: can have dots as thousand separators and comma as decimal
# Examples: "1.234.567,89", "1.234,56", "1234,56", "1234"
brazilian_pattern = r'^-?\d{1,3}(?:\.\d{3})*(?:,\d+)?$'
if re.match(brazilian_pattern, str_value):
# Remove thousand separators (dots) and replace decimal comma with dot
converted = str_value.replace('.', '').replace(',', '.')
try:
return float(converted)
except ValueError:
return np.nan
return np.nan
def detect_and_convert_brazilian_numbers(df):
"""Detect and convert Brazilian number format columns to numeric"""
converted_columns = []
df_converted = df.copy()
for col in df.columns:
if df[col].dtype == 'object': # Only check string columns
# Sample some non-null values to check if they look like Brazilian numbers
sample_values = df[col].dropna().astype(str).head(10)
if len(sample_values) > 0:
# Check if most values match Brazilian number pattern
brazilian_count = 0
total_count = 0
for value in sample_values:
value = str(value).strip()
if value and value != 'nan':
total_count += 1
# Brazilian number pattern
if re.match(r'^-?\d{1,3}(?:\.\d{3})*(?:,\d+)?$', value) or re.match(r'^-?\d+,\d+$', value):
brazilian_count += 1
# If more than 70% of values look like Brazilian numbers, convert the column
if total_count > 0 and (brazilian_count / total_count) > 0.7:
converted_series = df[col].apply(convert_brazilian_number)
# Only convert if we successfully converted most values
non_null_original = df[col].notna().sum()
non_null_converted = converted_series.notna().sum()
if non_null_converted >= (non_null_original * 0.8): # At least 80% conversion success
df_converted[col] = converted_series
converted_columns.append(col)
return df_converted, converted_columns
def load_sample_data():
"""Generate sample data for demonstration"""
np.random.seed(42)
n_samples = 1000
data = {
'Date': pd.date_range('2023-01-01', periods=n_samples, freq='D'),
'Sales': np.random.normal(1000, 200, n_samples),
'Profit': np.random.normal(150, 50, n_samples),
'Category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Home'], n_samples),
'Region': np.random.choice(['North', 'South', 'East', 'West'], n_samples),
'Customer_Age': np.random.randint(18, 80, n_samples),
'Rating': np.random.uniform(1, 5, n_samples)
}
df = pd.DataFrame(data)
df['Sales'] = np.where(df['Sales'] < 0, abs(df['Sales']), df['Sales'])
df['Profit'] = np.where(df['Category'] == 'Electronics', df['Profit'] * 1.5, df['Profit'])
# Add some Brazilian formatted numbers for demonstration
df['Vendas_BR'] = df['Sales'].apply(lambda x: f"{x:,.2f}".replace(',', 'X').replace('.', ',').replace('X', '.'))
df['Lucro_BR'] = df['Profit'].apply(lambda x: f"{x:,.2f}".replace(',', 'X').replace('.', ',').replace('X', '.'))
return df
def get_numeric_columns(df):
"""Get numeric columns from dataframe"""
return df.select_dtypes(include=[np.number]).columns.tolist()
def get_categorical_columns(df):
"""Get categorical columns from dataframe"""
return df.select_dtypes(include=['object', 'category']).columns.tolist()
def create_download_link(df, filename="filtered_data.csv"):
"""Create download link for dataframe"""
csv = df.to_csv(index=False)
b64 = base64.b64encode(csv.encode()).decode()
href = f'Download CSV File'
return href
def main():
# Header
st.markdown('
📊 Data Analysis Dashboard
', unsafe_allow_html=True)
# Sidebar
st.sidebar.title("🔧 Controls")
st.sidebar.markdown("---")
# File upload section
st.sidebar.subheader("📁 Data Upload")
uploaded_file = st.sidebar.file_uploader(
"Choose a CSV file",
type="csv",
help="Upload a CSV file to analyze your data"
)
use_sample = st.sidebar.checkbox(
"Use Sample Data",
value=True if uploaded_file is None else False,
help="Check this to use built-in sample data for demonstration"
)
# Brazilian number conversion option
convert_brazilian = st.sidebar.checkbox(
"🇧🇷 Auto-convert Brazilian Numbers",
value=True,
help="Automatically detect and convert Brazilian number format (xx.xxx.xxx,xx) to numeric"
)
# Load data
try:
if uploaded_file is not None:
df = pd.read_csv(uploaded_file)
st.sidebar.success(f"✅ File uploaded successfully! ({len(df)} rows)")
elif use_sample:
df = load_sample_data()
st.sidebar.info("📋 Using sample data")
else:
st.warning("Please upload a CSV file or use sample data to get started.")
st.markdown("""
### 🚀 Welcome to the Data Analysis Dashboard!
This app helps you analyze and visualize your data with:
- **Interactive charts** (bar, line, scatter, histogram)
- **Dynamic filtering** and data exploration
- **Statistical summaries** and insights
- **Export capabilities** for data and visualizations
- **🇧🇷 Brazilian number format support** (xx.xxx.xxx,xx)
**To get started:**
1. Upload a CSV file using the sidebar, or
2. Check "Use Sample Data" to explore with demo data
""")
return
# Apply Brazilian number conversion if enabled
if convert_brazilian:
df_original = df.copy()
df, converted_cols = detect_and_convert_brazilian_numbers(df)
if converted_cols:
st.sidebar.success(f"🇧🇷 Converted {len(converted_cols)} columns from Brazilian format: {', '.join(converted_cols)}")
except Exception as e:
st.error(f"❌ Error loading file: {str(e)}")
st.info("Please make sure your file is a valid CSV format.")
return
# Data preview section
st.subheader("📋 Data Preview")
col1, col2, col3, col4 = st.columns(4)
with col1:
st.metric("Total Rows", len(df))
with col2:
st.metric("Total Columns", len(df.columns))
with col3:
st.metric("Numeric Columns", len(get_numeric_columns(df)))
with col4:
st.metric("Text Columns", len(get_categorical_columns(df)))
# Show data preview
with st.expander("🔍 View Raw Data", expanded=False):
st.dataframe(df.head(100), use_container_width=True)
# Data summary
with st.expander("📊 Statistical Summary", expanded=False):
col1, col2 = st.columns(2)
with col1:
st.subheader("Numeric Columns")
numeric_cols = get_numeric_columns(df)
if numeric_cols:
st.dataframe(df[numeric_cols].describe())
else:
st.info("No numeric columns found")
with col2:
st.subheader("Categorical Columns")
cat_cols = get_categorical_columns(df)
if cat_cols:
for col in cat_cols[:5]: # Show first 5 categorical columns
st.write(f"**{col}:** {df[col].nunique()} unique values")
if df[col].nunique() <= 10:
st.write(df[col].value_counts().head())
else:
st.info("No categorical columns found")
# Show conversion info if Brazilian conversion was applied
if convert_brazilian and 'converted_cols' in locals() and converted_cols:
with st.expander("🇧🇷 Brazilian Number Conversion Details", expanded=False):
st.write("**Converted Columns:**")
for col in converted_cols:
original_sample = df_original[col].dropna().head(3).tolist()
converted_sample = df[col].dropna().head(3).tolist()
st.write(f"**{col}:**")
st.write(f" - Original: {original_sample}")
st.write(f" - Converted: {converted_sample}")
# Filtering section
st.sidebar.markdown("---")
st.sidebar.subheader("🔍 Data Filters")
# Create a copy for filtering
filtered_df = df.copy()
# Numeric filters
numeric_cols = get_numeric_columns(df)
for col in numeric_cols:
if df[col].dtype in ['int64', 'float64']:
min_val = float(df[col].min())
max_val = float(df[col].max())
if min_val != max_val:
selected_range = st.sidebar.slider(
f"{col} Range",
min_value=min_val,
max_value=max_val,
value=(min_val, max_val),
help=f"Filter data by {col} values"
)
filtered_df = filtered_df[
(filtered_df[col] >= selected_range[0]) &
(filtered_df[col] <= selected_range[1])
]
# Categorical filters
cat_cols = get_categorical_columns(df)
for col in cat_cols:
unique_values = df[col].unique().tolist()
if len(unique_values) <= 50: # Only show filter for columns with reasonable number of unique values
selected_values = st.sidebar.multiselect(
f"Select {col}",
options=unique_values,
default=unique_values,
help=f"Filter data by {col} categories"
)
if selected_values:
filtered_df = filtered_df[filtered_df[col].isin(selected_values)]
# Show filtered data info
if len(filtered_df) != len(df):
st.sidebar.info(f"Filtered: {len(filtered_df)} of {len(df)} rows")
# Visualization section
st.markdown("---")
st.subheader("📈 Data Visualization")
# Chart type selection
chart_type = st.selectbox(
"Select Chart Type",
["Bar Chart", "Line Chart", "Scatter Plot", "Histogram", "Box Plot"],
help="Choose the type of visualization"
)
col1, col2, col3 = st.columns(3)
with col1:
if chart_type in ["Bar Chart", "Line Chart", "Scatter Plot", "Box Plot"]:
x_column = st.selectbox(
"X-axis Column",
options=df.columns.tolist(),
help="Select column for X-axis"
)
else:
x_column = st.selectbox(
"Column to Analyze",
options=numeric_cols,
help="Select numeric column for histogram"
)
with col2:
if chart_type in ["Bar Chart", "Line Chart", "Scatter Plot", "Box Plot"]:
y_column = st.selectbox(
"Y-axis Column",
options=numeric_cols,
help="Select numeric column for Y-axis"
)
else:
y_column = None
with col3:
if chart_type in ["Bar Chart", "Scatter Plot", "Box Plot"]:
color_column = st.selectbox(
"Color/Group By (Optional)",
options=[None] + cat_cols,
help="Select column to group/color data"
)
else:
color_column = None
# Create visualization
if chart_type == "Bar Chart" and x_column and y_column:
if x_column in cat_cols:
# Aggregate data for categorical x-axis
agg_df = filtered_df.groupby(x_column)[y_column].mean().reset_index()
fig = px.bar(
agg_df,
x=x_column,
y=y_column,
title=f"Average {y_column} by {x_column}",
color=color_column if color_column and color_column in agg_df.columns else None
)
else:
fig = px.bar(
filtered_df,
x=x_column,
y=y_column,
title=f"{y_column} vs {x_column}",
color=color_column
)
elif chart_type == "Line Chart" and x_column and y_column:
fig = px.line(
filtered_df,
x=x_column,
y=y_column,
title=f"{y_column} vs {x_column}",
color=color_column
)
elif chart_type == "Scatter Plot" and x_column and y_column:
fig = px.scatter(
filtered_df,
x=x_column,
y=y_column,
title=f"{y_column} vs {x_column}",
color=color_column,
size=y_column if y_column in numeric_cols else None
)
elif chart_type == "Histogram" and x_column:
fig = px.histogram(
filtered_df,
x=x_column,
title=f"Distribution of {x_column}",
nbins=30
)
elif chart_type == "Box Plot" and x_column and y_column:
fig = px.box(
filtered_df,
x=x_column,
y=y_column,
title=f"{y_column} Distribution by {x_column}",
color=color_column
)
else:
st.warning("Please select appropriate columns for the chosen chart type.")
return
# Update layout for better appearance
fig.update_layout(
height=500,
showlegend=True,
title_x=0.5,
font=dict(size=12)
)
# Display chart
st.plotly_chart(fig, use_container_width=True)
# Download section
st.markdown("---")
st.subheader("💾 Download Options")
col1, col2 = st.columns(2)
with col1:
st.markdown("**Download Filtered Data**")
if st.button("Generate CSV Download Link"):
download_link = create_download_link(filtered_df, f"filtered_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
st.markdown(download_link, unsafe_allow_html=True)
with col2:
st.markdown("**Download Chart**")
if st.button("Download Chart as HTML"):
html_string = fig.to_html(include_plotlyjs='cdn')
st.download_button(
label="Download HTML",
data=html_string,
file_name=f"chart_{datetime.now().strftime('%Y%m%d_%H%M%S')}.html",
mime="text/html"
)
# Additional insights
if len(filtered_df) > 0:
st.markdown("---")
st.subheader("🔍 Quick Insights")
col1, col2 = st.columns(2)
with col1:
st.markdown("**Data Overview**")
st.write(f"• Total records: {len(filtered_df):,}")
st.write(f"• Columns: {len(filtered_df.columns)}")
if numeric_cols:
st.write(f"• Numeric columns: {len(numeric_cols)}")
for col in numeric_cols[:3]:
mean_val = filtered_df[col].mean()
st.write(f" - {col}: avg = {mean_val:.2f}")
with col2:
st.markdown("**Missing Data**")
missing_data = filtered_df.isnull().sum()
if missing_data.sum() > 0:
for col, missing in missing_data.items():
if missing > 0:
pct = (missing / len(filtered_df)) * 100
st.write(f"• {col}: {missing} ({pct:.1f}%)")
else:
st.write("✅ No missing data found")
if __name__ == "__main__":
main()