File size: 17,490 Bytes
9968889
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import io
import base64
import re
from datetime import datetime

# Page configuration: browser-tab title and icon, wide layout, sidebar open on first load.
# The icon is a real emoji; the previous literal was mis-decoded UTF-8 (mojibake).
st.set_page_config(
    page_title="Data Analysis Dashboard",
    page_icon="📊",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS for better styling. It is injected as raw HTML, hence unsafe_allow_html;
# the markup is a constant defined here, not user input.
st.markdown("""
<style>
    .main-header {
        font-size: 2.5rem;
        font-weight: bold;
        color: #1f77b4;
        text-align: center;
        margin-bottom: 2rem;
    }
    .metric-container {
        background-color: #f0f2f6;
        padding: 1rem;
        border-radius: 0.5rem;
        margin: 0.5rem 0;
    }
    .stSelectbox > div > div {
        background-color: white;
    }
    .upload-section {
        border: 2px dashed #cccccc;
        border-radius: 10px;
        padding: 2rem;
        text-align: center;
        margin: 1rem 0;
    }
</style>
""", unsafe_allow_html=True)

# Brazilian-formatted number: optional minus sign, then either dot-grouped
# thousands ("1.234.567") or a plain run of digits ("1234"), then an optional
# comma-separated decimal part (",89").
_BRAZILIAN_NUMBER_RE = re.compile(r'-?(?:\d{1,3}(?:\.\d{3})+|\d+)(?:,\d+)?')


def convert_brazilian_number(value):
    """Convert a Brazilian-formatted number (xx.xxx.xxx,xx) to float.

    Args:
        value: A string such as "1.234.567,89", "1.234,56", "1234,56" or
            "1234"; an actual number; or a missing value.

    Returns:
        The parsed float. ``np.nan`` is returned for missing values, empty
        strings and text that is neither Brazilian-formatted nor accepted by
        ``float()``.

    Notes:
        Strings are matched against the Brazilian pattern *before* falling
        back to ``float()``, so the string "1.234" is read as 1234.0 (dot as
        thousands separator). Real numeric inputs are returned unchanged.
    """
    if pd.isna(value) or value == '':
        return np.nan

    # Real numbers must bypass the string path: str(1.234) would otherwise
    # look like a dot-grouped Brazilian integer. bool is excluded so it keeps
    # falling through to the text branch, where it is not a number.
    if isinstance(value, (int, float, np.integer, np.floating)) and not isinstance(value, bool):
        return float(value)

    str_value = str(value).strip()

    if _BRAZILIAN_NUMBER_RE.fullmatch(str_value):
        # Remove thousand separators (dots) and replace decimal comma with dot
        return float(str_value.replace('.', '').replace(',', '.'))

    # Not Brazilian-formatted: accept anything float() understands ("1234.56", "1e3")
    try:
        return float(str_value)
    except ValueError:
        return np.nan

def detect_and_convert_brazilian_numbers(df):
    """Detect text columns holding Brazilian-formatted numbers and convert them.

    A column is converted when both checks pass:
      1. more than 70% of its first 10 non-null values look like Brazilian
         numbers, and
      2. at least 80% of its non-null values survive conversion (are not NaN).

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A ``(df_converted, converted_columns)`` tuple: a copy of ``df`` with
        the detected columns replaced by their converted values, and the list
        of converted column names in column order.
    """
    # Compiled once here instead of once per sampled value.
    grouped_pattern = re.compile(r'^-?\d{1,3}(?:\.\d{3})*(?:,\d+)?$')
    decimal_pattern = re.compile(r'^-?\d+,\d+$')

    converted_columns = []
    df_converted = df.copy()

    for col in df.columns:
        column = df[col]

        # Only text can hold formatted numbers. Besides classic object columns
        # this also accepts pandas' dedicated string dtypes, which an
        # `== 'object'` comparison would skip.
        if not (pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)):
            continue

        # Sample the first few non-null values; ignore blanks and literal "nan" text.
        stripped = (text.strip() for text in column.dropna().head(10).astype(str))
        sample_values = [text for text in stripped if text and text != 'nan']
        if not sample_values:
            continue

        brazilian_count = sum(
            1 for text in sample_values
            if grouped_pattern.match(text) or decimal_pattern.match(text)
        )

        # If more than 70% of values look like Brazilian numbers, convert the column
        if brazilian_count / len(sample_values) <= 0.7:
            continue

        converted_series = column.apply(convert_brazilian_number)

        # Keep the conversion only if it did not wipe out the data:
        # at least 80% of the originally non-null cells must still be non-null.
        if converted_series.notna().sum() >= column.notna().sum() * 0.8:
            df_converted[col] = converted_series
            converted_columns.append(col)

    return df_converted, converted_columns

def load_sample_data():
    """Build the deterministic 1000-row demo dataset used when no CSV is uploaded.

    Returns:
        A DataFrame with the columns Date, Sales, Profit, Category, Region,
        Customer_Age and Rating, plus Vendas_BR / Lucro_BR, which hold Sales
        and Profit rendered as Brazilian-formatted text.
    """
    row_count = 1000
    # Seeds NumPy's global generator so every call yields the same rows.
    np.random.seed(42)

    # The dict literal is evaluated top to bottom, which fixes the order of
    # the random draws and therefore the generated values.
    frame = pd.DataFrame({
        'Date': pd.date_range('2023-01-01', periods=row_count, freq='D'),
        'Sales': np.random.normal(1000, 200, row_count),
        'Profit': np.random.normal(150, 50, row_count),
        'Category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Home'], row_count),
        'Region': np.random.choice(['North', 'South', 'East', 'West'], row_count),
        'Customer_Age': np.random.randint(18, 80, row_count),
        'Rating': np.random.uniform(1, 5, row_count)
    })

    # Negative sales are flipped to their absolute value.
    sales = frame['Sales']
    frame['Sales'] = sales.mask(sales < 0, sales.abs())

    # Electronics rows get a 1.5x profit multiplier.
    is_electronics = frame['Category'] == 'Electronics'
    frame.loc[is_electronics, 'Profit'] = frame.loc[is_electronics, 'Profit'] * 1.5

    # Add some Brazilian formatted numbers for demonstration:
    # format as "1,234.56", then swap the two separators to get "1.234,56".
    swap_separators = str.maketrans({',': '.', '.': ','})

    def as_brazilian_text(amount):
        return f"{amount:,.2f}".translate(swap_separators)

    frame['Vendas_BR'] = frame['Sales'].map(as_brazilian_text)
    frame['Lucro_BR'] = frame['Profit'].map(as_brazilian_text)

    return frame

def get_numeric_columns(df):
    """Return the names of the numeric-dtype columns of ``df``, in column order."""
    numeric_view = df.select_dtypes(include=[np.number])
    return numeric_view.columns.to_list()

def get_categorical_columns(df):
    """Return the names of the object- and category-dtype columns of ``df``, in column order."""
    wanted_dtypes = ['object', 'category']
    return df.select_dtypes(include=wanted_dtypes).columns.to_list()

def create_download_link(df, filename="filtered_data.csv"):
    """Create an HTML download link that embeds ``df`` as a CSV data URI.

    Args:
        df: DataFrame to export; written without the index.
        filename: Name suggested to the browser through the ``download`` attribute.

    Returns:
        An ``<a>`` element as a string whose ``href`` is a base64 ``data:`` URI
        holding the UTF-8 encoded CSV.
    """
    import html  # local import: only needed to escape the attribute value below

    csv_text = df.to_csv(index=False)
    payload = base64.b64encode(csv_text.encode("utf-8")).decode("ascii")
    # The filename lands inside a double-quoted HTML attribute, so escape it.
    safe_name = html.escape(filename, quote=True)
    # "text/csv" is the registered CSV media type ("file/csv" is not one);
    # the charset matches the encoding used for the payload above.
    return (
        f'<a href="data:text/csv;charset=utf-8;base64,{payload}" '
        f'download="{safe_name}">Download CSV File</a>'
    )

def _apply_sidebar_filters(df, numeric_cols, cat_cols):
    """Render the sidebar filter widgets and return the rows of ``df`` that pass them.

    Args:
        df: The full (unfiltered) DataFrame; widget bounds and options come from it.
        numeric_cols: Names of numeric columns; int64/float64 ones get a range slider.
        cat_cols: Names of categorical columns; those with at most 50 distinct
            values get a multiselect.

    Returns:
        A filtered copy of ``df``.
    """
    filtered_df = df.copy()

    # Numeric filters
    for col in numeric_cols:
        if df[col].dtype not in ['int64', 'float64']:
            continue

        min_val = float(df[col].min())
        max_val = float(df[col].max())

        # No slider for columns without usable bounds (all-NaN or infinite)
        # or without any spread (constant column).
        if not (np.isfinite(min_val) and np.isfinite(max_val)) or min_val == max_val:
            continue

        low, high = st.sidebar.slider(
            f"{col} Range",
            min_value=min_val,
            max_value=max_val,
            value=(min_val, max_val),
            help=f"Filter data by {col} values"
        )

        # Filter only once the user actually narrows the range. A comparison
        # against NaN is always False, so filtering on the untouched full range
        # would silently drop every row with a missing value in this column.
        if low > min_val or high < max_val:
            filtered_df = filtered_df[filtered_df[col].between(low, high)]

    # Categorical filters
    for col in cat_cols:
        unique_values = df[col].unique().tolist()
        # Only show filter for columns with reasonable number of unique values
        if len(unique_values) > 50:
            continue

        selected_values = st.sidebar.multiselect(
            f"Select {col}",
            options=unique_values,
            default=unique_values,
            help=f"Filter data by {col} categories"
        )
        # An empty selection means "no filter" rather than "no rows".
        if selected_values:
            filtered_df = filtered_df[filtered_df[col].isin(selected_values)]

    return filtered_df


def _build_figure(chart_type, data, x_column, y_column, color_column, cat_cols):
    """Build the Plotly figure for the selected chart type.

    Args:
        chart_type: One of "Bar Chart", "Line Chart", "Scatter Plot",
            "Histogram" or "Box Plot".
        data: The (filtered) DataFrame to plot.
        x_column: Column for the x-axis (the analysed column for a histogram).
        y_column: Column for the y-axis; unused for a histogram.
        color_column: Optional grouping/colour column, or None.
        cat_cols: Names of the categorical columns of ``data``.

    Returns:
        The figure, or None when a required column is missing or the chart
        type is not recognised.
    """
    # Compare against None explicitly so falsy column labels (0, "") stay usable.
    if x_column is None:
        return None
    if chart_type != "Histogram" and y_column is None:
        return None

    if chart_type == "Bar Chart":
        if x_column in cat_cols:
            # Aggregate data for categorical x-axis. The colour column has to be
            # part of the grouping, otherwise it is absent from the aggregated
            # frame and the user's selection would be silently ignored.
            group_keys = [x_column]
            if color_column is not None and color_column != x_column:
                group_keys.append(color_column)
            agg_df = data.groupby(group_keys)[y_column].mean().reset_index()
            return px.bar(
                agg_df,
                x=x_column,
                y=y_column,
                title=f"Average {y_column} by {x_column}",
                color=color_column,
                # Averages must sit side by side; stacking them would add means up.
                barmode="group" if len(group_keys) > 1 else "relative"
            )
        return px.bar(
            data,
            x=x_column,
            y=y_column,
            title=f"{y_column} vs {x_column}",
            color=color_column
        )

    if chart_type == "Line Chart":
        return px.line(
            data,
            x=x_column,
            y=y_column,
            title=f"{y_column} vs {x_column}",
            color=color_column
        )

    if chart_type == "Scatter Plot":
        # Marker sizes must be non-negative numbers; Plotly rejects negative or
        # missing sizes, so only scale markers by y when every value qualifies.
        y_values = data[y_column]
        size_is_valid = bool(y_values.notna().all() and (y_values >= 0).all())
        return px.scatter(
            data,
            x=x_column,
            y=y_column,
            title=f"{y_column} vs {x_column}",
            color=color_column,
            size=y_column if size_is_valid else None
        )

    if chart_type == "Histogram":
        return px.histogram(
            data,
            x=x_column,
            title=f"Distribution of {x_column}",
            nbins=30
        )

    if chart_type == "Box Plot":
        return px.box(
            data,
            x=x_column,
            y=y_column,
            title=f"{y_column} Distribution by {x_column}",
            color=color_column
        )

    return None


def main():
    """Render the dashboard page.

    Steps, in order: load data (uploaded CSV or built-in sample), optionally
    convert Brazilian-formatted number columns, show a preview and summary,
    apply the sidebar filters, draw the selected chart, offer CSV/HTML
    downloads, and list quick insights for the filtered data.
    """
    # Header
    st.markdown('<h1 class="main-header">📊 Data Analysis Dashboard</h1>', unsafe_allow_html=True)

    # Sidebar
    st.sidebar.title("🔧 Controls")
    st.sidebar.markdown("---")

    # File upload section
    st.sidebar.subheader("📁 Data Upload")
    uploaded_file = st.sidebar.file_uploader(
        "Choose a CSV file",
        type="csv",
        help="Upload a CSV file to analyze your data"
    )

    use_sample = st.sidebar.checkbox(
        "Use Sample Data",
        value=uploaded_file is None,
        help="Check this to use built-in sample data for demonstration"
    )

    # Brazilian number conversion option
    convert_brazilian = st.sidebar.checkbox(
        "🇧🇷 Auto-convert Brazilian Numbers",
        value=True,
        help="Automatically detect and convert Brazilian number format (xx.xxx.xxx,xx) to numeric"
    )

    # Load data
    converted_cols = []
    try:
        if uploaded_file is not None:
            df = pd.read_csv(uploaded_file)
            st.sidebar.success(f"✅ File uploaded successfully! ({len(df)} rows)")
        elif use_sample:
            df = load_sample_data()
            st.sidebar.info("📋 Using sample data")
        else:
            st.warning("Please upload a CSV file or use sample data to get started.")
            st.markdown("""
            ### 🚀 Welcome to the Data Analysis Dashboard!

            This app helps you analyze and visualize your data with:
            - **Interactive charts** (bar, line, scatter, histogram)
            - **Dynamic filtering** and data exploration
            - **Statistical summaries** and insights
            - **Export capabilities** for data and visualizations
            - **🇧🇷 Brazilian number format support** (xx.xxx.xxx,xx)

            **To get started:**
            1. Upload a CSV file using the sidebar, or
            2. Check "Use Sample Data" to explore with demo data
            """)
            return

        # Keep the pre-conversion frame for the before/after comparison below.
        # The conversion works on a copy, so no extra copy is needed here.
        df_original = df

        # Apply Brazilian number conversion if enabled
        if convert_brazilian:
            df, converted_cols = detect_and_convert_brazilian_numbers(df)

            if converted_cols:
                st.sidebar.success(f"🇧🇷 Converted {len(converted_cols)} columns from Brazilian format: {', '.join(converted_cols)}")

    except Exception as e:
        # Top-level boundary for the page: CSV parsing can fail in many ways
        # (encoding, empty file, malformed rows), so report instead of crashing.
        st.error(f"❌ Error loading file: {str(e)}")
        st.info("Please make sure your file is a valid CSV format.")
        return

    numeric_cols = get_numeric_columns(df)
    cat_cols = get_categorical_columns(df)

    # Data preview section
    st.subheader("📋 Data Preview")

    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Total Rows", len(df))
    with col2:
        st.metric("Total Columns", len(df.columns))
    with col3:
        st.metric("Numeric Columns", len(numeric_cols))
    with col4:
        st.metric("Text Columns", len(cat_cols))

    # Show data preview
    with st.expander("🔍 View Raw Data", expanded=False):
        st.dataframe(df.head(100), use_container_width=True)

    # Data summary
    with st.expander("📊 Statistical Summary", expanded=False):
        col1, col2 = st.columns(2)

        with col1:
            st.subheader("Numeric Columns")
            if numeric_cols:
                st.dataframe(df[numeric_cols].describe())
            else:
                st.info("No numeric columns found")

        with col2:
            st.subheader("Categorical Columns")
            if cat_cols:
                for col in cat_cols[:5]:  # Show first 5 categorical columns
                    st.write(f"**{col}:** {df[col].nunique()} unique values")
                    if df[col].nunique() <= 10:
                        st.write(df[col].value_counts().head())
            else:
                st.info("No categorical columns found")

    # Show conversion info if Brazilian conversion was applied
    if converted_cols:
        with st.expander("🇧🇷 Brazilian Number Conversion Details", expanded=False):
            st.write("**Converted Columns:**")
            for col in converted_cols:
                original_sample = df_original[col].dropna().head(3).tolist()
                converted_sample = df[col].dropna().head(3).tolist()
                st.write(f"**{col}:**")
                st.write(f"  - Original: {original_sample}")
                st.write(f"  - Converted: {converted_sample}")

    # Filtering section
    st.sidebar.markdown("---")
    st.sidebar.subheader("🔍 Data Filters")

    filtered_df = _apply_sidebar_filters(df, numeric_cols, cat_cols)

    # Show filtered data info
    if len(filtered_df) != len(df):
        st.sidebar.info(f"Filtered: {len(filtered_df)} of {len(df)} rows")

    # Visualization section
    st.markdown("---")
    st.subheader("📈 Data Visualization")

    # Chart type selection
    chart_type = st.selectbox(
        "Select Chart Type",
        ["Bar Chart", "Line Chart", "Scatter Plot", "Histogram", "Box Plot"],
        help="Choose the type of visualization"
    )

    col1, col2, col3 = st.columns(3)

    with col1:
        if chart_type in ["Bar Chart", "Line Chart", "Scatter Plot", "Box Plot"]:
            x_column = st.selectbox(
                "X-axis Column",
                options=df.columns.tolist(),
                help="Select column for X-axis"
            )
        else:
            x_column = st.selectbox(
                "Column to Analyze",
                options=numeric_cols,
                help="Select numeric column for histogram"
            )

    with col2:
        if chart_type in ["Bar Chart", "Line Chart", "Scatter Plot", "Box Plot"]:
            y_column = st.selectbox(
                "Y-axis Column",
                options=numeric_cols,
                help="Select numeric column for Y-axis"
            )
        else:
            y_column = None

    with col3:
        if chart_type in ["Bar Chart", "Scatter Plot", "Box Plot"]:
            color_column = st.selectbox(
                "Color/Group By (Optional)",
                options=[None] + cat_cols,
                help="Select column to group/color data"
            )
        else:
            color_column = None

    # Create visualization
    fig = _build_figure(chart_type, filtered_df, x_column, y_column, color_column, cat_cols)
    if fig is None:
        st.warning("Please select appropriate columns for the chosen chart type.")
        return

    # Update layout for better appearance
    fig.update_layout(
        height=500,
        showlegend=True,
        title_x=0.5,
        font=dict(size=12)
    )

    # Display chart
    st.plotly_chart(fig, use_container_width=True)

    # Download section
    st.markdown("---")
    st.subheader("💾 Download Options")

    col1, col2 = st.columns(2)

    with col1:
        st.markdown("**Download Filtered Data**")
        if st.button("Generate CSV Download Link"):
            download_link = create_download_link(filtered_df, f"filtered_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
            st.markdown(download_link, unsafe_allow_html=True)

    with col2:
        st.markdown("**Download Chart**")
        if st.button("Download Chart as HTML"):
            html_string = fig.to_html(include_plotlyjs='cdn')
            st.download_button(
                label="Download HTML",
                data=html_string,
                file_name=f"chart_{datetime.now().strftime('%Y%m%d_%H%M%S')}.html",
                mime="text/html"
            )

    # Additional insights
    if len(filtered_df) > 0:
        st.markdown("---")
        st.subheader("🔍 Quick Insights")

        col1, col2 = st.columns(2)

        with col1:
            st.markdown("**Data Overview**")
            st.write(f"• Total records: {len(filtered_df):,}")
            st.write(f"• Columns: {len(filtered_df.columns)}")

            if numeric_cols:
                st.write(f"• Numeric columns: {len(numeric_cols)}")
                for col in numeric_cols[:3]:
                    mean_val = filtered_df[col].mean()
                    st.write(f"  - {col}: avg = {mean_val:.2f}")

        with col2:
            st.markdown("**Missing Data**")
            missing_data = filtered_df.isnull().sum()
            if missing_data.sum() > 0:
                for col, missing in missing_data.items():
                    if missing > 0:
                        pct = (missing / len(filtered_df)) * 100
                        st.write(f"• {col}: {missing} ({pct:.1f}%)")
            else:
                st.write("✅ No missing data found")

# Build the page only when this file is run as the main module, not when it is imported.
if __name__ == "__main__":
    main()