|
|
""" |
|
|
Data Analysis Example |
|
|
Demonstrates data manipulation and visualization capabilities |
|
|
""" |
|
|
|
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
import matplotlib.pyplot as plt |
|
|
import seaborn as sns |
|
|
from datetime import datetime, timedelta |
|
|
|
|
|
|
|
|
# Fix the RNG seed so the synthetic dataset is identical on every run.
np.random.seed(42)

# 365 consecutive daily timestamps starting on 2023-01-01.
dates = pd.date_range(start='2023-01-01', periods=365, freq='D')

# Random walk: cumulative sum of standard-normal steps, offset by 100.
values = 100 + np.random.randn(len(dates)).cumsum()

# The category labels are drawn after the walk, so the seeded RNG stream is
# consumed in the same order (normal steps first, then the label choices).
df = pd.DataFrame(
    {
        'date': dates,
        'value': values,
        'category': np.random.choice(['A', 'B', 'C'], size=len(dates)),
    }
)
|
|
|
|
|
# Report title framed above and below by a 60-character rule; the three
# pieces are emitted one per line.
print("=" * 60, "DATA ANALYSIS EXAMPLE", "=" * 60, sep="\n")
|
|
|
|
|
|
|
|
# NOTE(review): the leading 'π' in the two section headers below looks like a
# mis-decoded character (probably an emoji). Its original code point cannot be
# recovered from the visible text, so the literals are left unchanged.
print("\nπ Dataset Overview:")

# Gather the headline facts first, then report them.
record_count = len(df)
first_date = df['date'].min()
last_date = df['date'].max()
distinct_categories = df['category'].unique()

print(f"Total records: {record_count}")
print(f"Date range: {first_date} to {last_date}")
print(f"Categories: {distinct_categories}")

print("\nπ Basic Statistics:")
# describe() output as produced by pandas for this frame.
summary_stats = df.describe()
print(summary_stats)
|
|
|
|
|
|
|
|
# Number of rows per category label.
category_column = df['category']
category_counts = category_column.value_counts()

# NOTE(review): the leading 'π' looks like a mis-decoded character; it is kept
# as-is because the original glyph is not recoverable from the visible text.
print("\nπ Category Distribution:", category_counts, sep="\n")
|
|
|
|
|
|
|
|
# Section header. The original literal showed "β°", which is the alarm-clock
# emoji U+23F0 (UTF-8 bytes E2 8F B0) decoded with the wrong charset. It is
# written as an escape so the source stays intact under any file encoding.
print("\n\u23f0 Time Series Analysis:")

# Mean of 'value' grouped by the month number of each row's date.
monthly_avg = df.groupby(df['date'].dt.month)['value'].mean()
print(monthly_avg)
|
|
|
|
|
|
|
|
# Canvas that hosts the 2x2 grid of panels.
plt.figure(figsize=(15, 10))

# Panel 1 of 4: the value series plotted against its date.
ts_ax = plt.subplot(2, 2, 1)
ts_ax.plot(df['date'], df['value'])
ts_ax.set_title('Time Series Data')
ts_ax.set_xlabel('Date')
ts_ax.set_ylabel('Value')
# ts_ax is still the current axes, so this rotates this panel's x tick labels.
plt.xticks(rotation=45)
|
|
|
|
|
|
|
|
# Panel 2 of 4: 30-bin histogram of the values.
hist_ax = plt.subplot(2, 2, 2)
hist_ax.hist(df['value'], bins=30, alpha=0.7, color='skyblue')
hist_ax.set_title('Value Distribution')
hist_ax.set_xlabel('Value')
hist_ax.set_ylabel('Frequency')
|
|
|
|
|
|
|
|
# Panel 3 of 4: one box per category showing the spread of 'value'.
box_ax = plt.subplot(2, 2, 3)
sns.boxplot(x='category', y='value', data=df, ax=box_ax)
# Set after the seaborn call so these labels replace the column-name
# labels seaborn puts on the axes.
box_ax.set_title('Value by Category')
box_ax.set_xlabel('Category')
box_ax.set_ylabel('Value')
|
|
|
|
|
|
|
|
# Panel 4 of 4: value against row index, one colour per category.
scatter_ax = plt.subplot(2, 2, 4)
colors = {'A': 'red', 'B': 'blue', 'C': 'green'}

# sort=False yields the groups in order of first appearance, which keeps the
# drawing order (and therefore the legend order) the same as iterating over
# df['category'].unique().
for category, subset in df.groupby('category', sort=False):
    scatter_ax.scatter(
        subset.index,
        subset['value'],
        c=colors[category],
        label=category,
        alpha=0.6,
    )

scatter_ax.set_title('Scatter Plot by Category')
scatter_ax.set_xlabel('Index')
scatter_ax.set_ylabel('Value')
scatter_ax.legend()

# Adjust spacing between the panels, then display the figure.
plt.tight_layout()
plt.show()
|
|
|
|
|
|
|
|
# NOTE(review): the leading 'π' looks like a mis-decoded character (probably
# an emoji); it is left unchanged because the original glyph cannot be
# recovered from the visible text.
print("\nπ Advanced Analysis:")

# Pearson correlation between the values and their row index.
# Series.corr() aligns its argument as a Series and raises
# "TypeError: unsupported type" when handed a bare Index, so the index is
# converted to a Series (labelled by itself) before the call.
time_index = df.index.to_series()
correlation = df['value'].corr(time_index)
print(f"Correlation with time: {correlation:.4f}")
|
|
|
|
|
|
|
|
# 30-row trailing mean of the values; the first 29 entries are NaN because
# the window is not yet full there.
value_series = df['value']
rolling_mean = value_series.rolling(window=30).mean()

# Report only the most recent window.
latest_rolling_mean = rolling_mean.iloc[-1]
print(f"30-day rolling mean (latest): {latest_rolling_mean:.2f}")
|
|
|
|
|
|
|
|
# Percentage change from the first recorded value to the last one.
first_value = df['value'].iloc[0]
last_value = df['value'].iloc[-1]
growth_rate = (last_value - first_value) / first_value * 100
print(f"Total growth rate: {growth_rate:.2f}%")
|
|
|
|
|
# Closing message. The original literal was split across two lines by a
# mis-decoded check-mark emoji U+2705 (UTF-8 bytes E2 9C 85, where 0x85 was
# read as a line break), which left the string unterminated. The code point is
# written as an escape so the literal stays on one line under any encoding.
print("\n\u2705 Data analysis complete!")
|
|
|