# Variance and Standard Deviation Tutorial # A comprehensive lesson on understanding spread in data import numpy as np import matplotlib.pyplot as plt import pandas as pd from scipy import stats # Set style for better-looking plots plt.style.use('seaborn-v0_8-darkgrid') print("=" * 60) print("VARIANCE AND STANDARD DEVIATION TUTORIAL") print("=" * 60) # ============================================================================ # SECTION 1: Understanding the Basics # ============================================================================ print("\n### SECTION 1: What is Variance and Standard Deviation? ###\n") # Create two datasets with same mean but different spreads np.random.seed(42) data_low_spread = np.random.normal(50, 5, 100) # mean=50, std=5 data_high_spread = np.random.normal(50, 15, 100) # mean=50, std=15 print(f"Dataset 1 - Low Spread:") print(f" Mean: {np.mean(data_low_spread):.2f}") print(f" Variance: {np.var(data_low_spread, ddof=1):.2f}") print(f" Standard Deviation: {np.std(data_low_spread, ddof=1):.2f}") print(f"\nDataset 2 - High Spread:") print(f" Mean: {np.mean(data_high_spread):.2f}") print(f" Variance: {np.var(data_high_spread, ddof=1):.2f}") print(f" Standard Deviation: {np.std(data_high_spread, ddof=1):.2f}") # Visualize the difference fig, axes = plt.subplots(1, 2, figsize=(14, 5)) axes[0].hist(data_low_spread, bins=20, alpha=0.7, color='skyblue', edgecolor='black') axes[0].axvline(np.mean(data_low_spread), color='red', linestyle='--', linewidth=2, label='Mean') axes[0].set_title('Low Spread Data (σ ≈ 5)', fontsize=14, fontweight='bold') axes[0].set_xlabel('Value') axes[0].set_ylabel('Frequency') axes[0].legend() axes[0].grid(alpha=0.3) axes[1].hist(data_high_spread, bins=20, alpha=0.7, color='salmon', edgecolor='black') axes[1].axvline(np.mean(data_high_spread), color='red', linestyle='--', linewidth=2, label='Mean') axes[1].set_title('High Spread Data (σ ≈ 15)', fontsize=14, fontweight='bold') axes[1].set_xlabel('Value') axes[1].set_ylabel('Frequency') axes[1].legend() axes[1].grid(alpha=0.3) plt.tight_layout() plt.show() print("\nKey Insight: Both datasets have similar means, but very different spreads!") # ============================================================================ # SECTION 2: Calculating Variance and Standard Deviation Step-by-Step # ============================================================================ print("\n\n### SECTION 2: Manual Calculation ###\n") # Simple dataset for manual calculation simple_data = np.array([2, 4, 4, 4, 5, 5, 7, 9]) print(f"Dataset: {simple_data}") print(f"Number of values (n): {len(simple_data)}") # Step 1: Calculate mean mean = np.mean(simple_data) print(f"\nStep 1 - Mean: {mean:.2f}") # Step 2: Calculate deviations from mean deviations = simple_data - mean print(f"\nStep 2 - Deviations from mean:") for i, (val, dev) in enumerate(zip(simple_data, deviations)): print(f" Value {val}: {val} - {mean:.2f} = {dev:.2f}") # Step 3: Square the deviations squared_deviations = deviations ** 2 print(f"\nStep 3 - Squared deviations:") for i, (dev, sq_dev) in enumerate(zip(deviations, squared_deviations)): print(f" ({dev:.2f})² = {sq_dev:.2f}") # Step 4: Calculate variance (sample variance, using n-1) variance = np.sum(squared_deviations) / (len(simple_data) - 1) print(f"\nStep 4 - Variance (s²):") print(f" Sum of squared deviations / (n-1)") print(f" {np.sum(squared_deviations):.2f} / {len(simple_data) - 1} = {variance:.2f}") # Step 5: Calculate standard deviation std_dev = np.sqrt(variance) print(f"\nStep 5 - Standard Deviation (s):") print(f" √{variance:.2f} = {std_dev:.2f}") # Verify with NumPy print(f"\nVerification with NumPy:") print(f" np.var(data, ddof=1) = {np.var(simple_data, ddof=1):.2f}") print(f" np.std(data, ddof=1) = {np.std(simple_data, ddof=1):.2f}") # ============================================================================ # SECTION 3: Visualizing Standard Deviation # ============================================================================ print("\n\n### SECTION 3: Visualizing Standard Deviation ###\n") # Generate normal distribution mu = 100 sigma = 15 x = np.linspace(mu - 4*sigma, mu + 4*sigma, 1000) y = stats.norm.pdf(x, mu, sigma) # Create figure fig, ax = plt.subplots(figsize=(14, 7)) # Plot the distribution ax.plot(x, y, 'b-', linewidth=2, label='Normal Distribution') ax.fill_between(x, y, alpha=0.1, color='blue') # Mark mean ax.axvline(mu, color='red', linestyle='--', linewidth=2, label=f'Mean (μ = {mu})') # Mark standard deviations colors = ['green', 'orange', 'purple'] for i in range(1, 4): # Positive side ax.axvline(mu + i*sigma, color=colors[i-1], linestyle=':', linewidth=1.5, alpha=0.7, label=f'±{i}σ ({mu + i*sigma:.0f})') # Negative side ax.axvline(mu - i*sigma, color=colors[i-1], linestyle=':', linewidth=1.5, alpha=0.7) # Shade regions mask = (x >= mu + (i-1)*sigma) & (x <= mu + i*sigma) ax.fill_between(x[mask], y[mask], alpha=0.2, color=colors[i-1]) mask = (x >= mu - i*sigma) & (x <= mu - (i-1)*sigma) ax.fill_between(x[mask], y[mask], alpha=0.2, color=colors[i-1]) # Add percentage labels ax.text(mu, max(y)*0.5, '68.27%\n(±1σ)', ha='center', fontsize=11, bbox=dict(boxstyle='round', facecolor='lightgreen', alpha=0.7)) ax.text(mu + 2*sigma, max(y)*0.2, '95.45%\n(±2σ)', ha='center', fontsize=11, bbox=dict(boxstyle='round', facecolor='lightyellow', alpha=0.7)) ax.text(mu + 3*sigma, max(y)*0.05, '99.73%\n(±3σ)', ha='center', fontsize=11, bbox=dict(boxstyle='round', facecolor='lightcoral', alpha=0.7)) ax.set_xlabel('Value', fontsize=12) ax.set_ylabel('Probability Density', fontsize=12) ax.set_title('The 68-95-99.7 Rule (Empirical Rule)', fontsize=16, fontweight='bold') ax.legend(loc='upper right') ax.grid(alpha=0.3) plt.tight_layout() plt.show() print("The Empirical Rule:") print(" • ~68% of data falls within ±1 standard deviation") print(" • ~95% of data falls within ±2 standard deviations") print(" • ~99.7% of data falls within ±3 standard deviations") # ============================================================================ # SECTION 4: Real-World Example - Stock Prices # ============================================================================ print("\n\n### SECTION 4: Real-World Example - Stock Returns ###\n") # Simulate stock returns for two companies days = 252 # trading days in a year np.random.seed(123) stock_a_returns = np.random.normal(0.05, 0.02, days) # Low volatility stock_b_returns = np.random.normal(0.05, 0.08, days) # High volatility # Calculate cumulative returns stock_a_cumulative = np.cumprod(1 + stock_a_returns) * 100 stock_b_cumulative = np.cumprod(1 + stock_b_returns) * 100 # Statistics print("Stock A (Conservative):") print(f" Mean Daily Return: {np.mean(stock_a_returns)*100:.3f}%") print(f" Std Dev (Volatility): {np.std(stock_a_returns, ddof=1)*100:.3f}%") print("\nStock B (Aggressive):") print(f" Mean Daily Return: {np.mean(stock_b_returns)*100:.3f}%") print(f" Std Dev (Volatility): {np.std(stock_b_returns, ddof=1)*100:.3f}%") # Create comparison plot fig, axes = plt.subplots(2, 2, figsize=(15, 10)) # Price evolution axes[0, 0].plot(stock_a_cumulative, label='Stock A', linewidth=2, color='blue') axes[0, 0].plot(stock_b_cumulative, label='Stock B', linewidth=2, color='red') axes[0, 0].set_title('Cumulative Price Performance', fontsize=14, fontweight='bold') axes[0, 0].set_xlabel('Trading Days') axes[0, 0].set_ylabel('Price ($)') axes[0, 0].legend() axes[0, 0].grid(alpha=0.3) # Returns distribution - Stock A axes[0, 1].hist(stock_a_returns * 100, bins=30, alpha=0.7, color='blue', edgecolor='black') axes[0, 1].axvline(np.mean(stock_a_returns) * 100, color='red', linestyle='--', linewidth=2, label=f'Mean: {np.mean(stock_a_returns)*100:.3f}%') axes[0, 1].set_title('Stock A - Return Distribution', fontsize=14, fontweight='bold') axes[0, 1].set_xlabel('Daily Return (%)') axes[0, 1].set_ylabel('Frequency') axes[0, 1].legend() axes[0, 1].grid(alpha=0.3) # Returns distribution - Stock B axes[1, 0].hist(stock_b_returns * 100, bins=30, alpha=0.7, color='red', edgecolor='black') axes[1, 0].axvline(np.mean(stock_b_returns) * 100, color='darkred', linestyle='--', linewidth=2, label=f'Mean: {np.mean(stock_b_returns)*100:.3f}%') axes[1, 0].set_title('Stock B - Return Distribution', fontsize=14, fontweight='bold') axes[1, 0].set_xlabel('Daily Return (%)') axes[1, 0].set_ylabel('Frequency') axes[1, 0].legend() axes[1, 0].grid(alpha=0.3) # Box plot comparison axes[1, 1].boxplot([stock_a_returns * 100, stock_b_returns * 100], labels=['Stock A', 'Stock B'], patch_artist=True, boxprops=dict(facecolor='lightblue'), medianprops=dict(color='red', linewidth=2)) axes[1, 1].set_title('Risk Comparison (Box Plot)', fontsize=14, fontweight='bold') axes[1, 1].set_ylabel('Daily Return (%)') axes[1, 1].grid(alpha=0.3, axis='y') plt.tight_layout() plt.show() # ============================================================================ # SECTION 5: Population vs Sample Variance # ============================================================================ print("\n\n### SECTION 5: Population vs Sample Variance ###\n") population = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) print(f"Population: {population}") print(f"\nPopulation Variance (σ²) - divide by n:") print(f" {np.var(population, ddof=0):.2f}") print(f"\nSample Variance (s²) - divide by n-1:") print(f" {np.var(population, ddof=1):.2f}") print("\nWhy n-1? Bessel's correction accounts for bias when estimating") print("population variance from a sample.") # ============================================================================ # SUMMARY # ============================================================================ print("\n\n" + "=" * 60) print("KEY TAKEAWAYS") print("=" * 60) print(""" 1. VARIANCE (σ² or s²): - Average of squared deviations from the mean - Units are squared (e.g., dollars²) - Larger variance = more spread out data 2. STANDARD DEVIATION (σ or s): - Square root of variance - Same units as original data - More interpretable than variance 3. WHY THEY MATTER: - Measure risk/volatility in finance - Assess data quality and consistency - Compare variability between datasets - Foundation for many statistical tests 4. REMEMBER: - Use ddof=1 for sample statistics (most common) - Use ddof=0 for population statistics - Larger std dev = more uncertainty/risk """)