""" Data Analysis Example Demonstrates data manipulation and visualization capabilities """ import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns from datetime import datetime, timedelta # Generate sample data np.random.seed(42) dates = pd.date_range(start='2023-01-01', periods=365, freq='D') values = np.cumsum(np.random.randn(365)) + 100 df = pd.DataFrame({ 'date': dates, 'value': values, 'category': np.random.choice(['A', 'B', 'C'], 365) }) print("=" * 60) print("DATA ANALYSIS EXAMPLE") print("=" * 60) # Basic statistics print("\nšŸ“Š Dataset Overview:") print(f"Total records: {len(df)}") print(f"Date range: {df['date'].min()} to {df['date'].max()}") print(f"Categories: {df['category'].unique()}") print("\nšŸ“ˆ Basic Statistics:") print(df.describe()) # Category distribution print("\nšŸ“‹ Category Distribution:") category_counts = df['category'].value_counts() print(category_counts) # Time series analysis print("\nā° Time Series Analysis:") monthly_avg = df.groupby(df['date'].dt.month)['value'].mean() print(monthly_avg) # Create visualizations plt.figure(figsize=(15, 10)) # Plot 1: Time series plt.subplot(2, 2, 1) plt.plot(df['date'], df['value']) plt.title('Time Series Data') plt.xlabel('Date') plt.ylabel('Value') plt.xticks(rotation=45) # Plot 2: Histogram plt.subplot(2, 2, 2) plt.hist(df['value'], bins=30, alpha=0.7, color='skyblue') plt.title('Value Distribution') plt.xlabel('Value') plt.ylabel('Frequency') # Plot 3: Category boxplot plt.subplot(2, 2, 3) sns.boxplot(data=df, x='category', y='value') plt.title('Value by Category') plt.xlabel('Category') plt.ylabel('Value') # Plot 4: Scatter plot plt.subplot(2, 2, 4) colors = {'A': 'red', 'B': 'blue', 'C': 'green'} for category in df['category'].unique(): subset = df[df['category'] == category] plt.scatter(subset.index, subset['value'], c=colors[category], label=category, alpha=0.6) plt.title('Scatter Plot by Category') plt.xlabel('Index') plt.ylabel('Value') plt.legend() plt.tight_layout() plt.show() # Advanced analysis print("\nšŸ” Advanced Analysis:") # Correlation correlation = df['value'].corr(df.index) print(f"Correlation with time: {correlation:.4f}") # Rolling statistics rolling_mean = df['value'].rolling(window=30).mean() print(f"30-day rolling mean (latest): {rolling_mean.iloc[-1]:.2f}") # Growth rate growth_rate = (df['value'].iloc[-1] - df['value'].iloc[0]) / df['value'].iloc[0] * 100 print(f"Total growth rate: {growth_rate:.2f}%") print("\nāœ… Data analysis complete!")