File size: 2,574 Bytes
523f6c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""
Data Analysis Example
Demonstrates data manipulation and visualization capabilities
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Build a reproducible synthetic dataset: one row per day, 365 days from 2023-01-01.
np.random.seed(42)  # fixed seed so every run yields the same numbers

# Daily timestamps for the 'date' column.
dates = pd.date_range(start='2023-01-01', periods=365, freq='D')

# Random walk: running total of standard-normal steps, shifted up by 100.
values = np.random.randn(365).cumsum() + 100

# Labels are drawn after the walk so the random stream is consumed in a fixed order.
categories = np.random.choice(['A', 'B', 'C'], 365)

# Assemble the frame; column order is date, value, category.
df = pd.DataFrame({'date': dates, 'value': values, 'category': categories})

# Banner for the console report.
print("=" * 60)
print("DATA ANALYSIS EXAMPLE")
print("=" * 60)

# Basic statistics: row count, covered date span and the distinct category labels.
# (Section-header emoji were stored as mis-decoded UTF-8 bytes; restored here.)
print("\n📊 Dataset Overview:")
print(f"Total records: {len(df)}")
print(f"Date range: {df['date'].min()} to {df['date'].max()}")
print(f"Categories: {df['category'].unique()}")

# describe() summarises the numeric/datetime columns (count, mean, spread, quantiles).
print("\n📈 Basic Statistics:")
print(df.describe())

# Category distribution: number of rows carrying each label, most frequent first.
print("\n📋 Category Distribution:")
category_counts = df['category'].value_counts()
print(category_counts)

# Time series analysis: mean value per calendar month (1-12).
# NOTE: grouping is by month number only, so data spanning several years
# would pool the same month of different years together.
print("\n⏰ Time Series Analysis:")
monthly_avg = df.groupby(df['date'].dt.month)['value'].mean()
print(monthly_avg)

# Create visualizations: a single 15x10 figure laid out as a 2x2 grid of panels.
plt.figure(figsize=(15, 10))


def _label_panel(title, xlabel, ylabel):
    """Apply a title and both axis labels to the currently active axes."""
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)


# Panel 1 (top-left): value over time as a line.
plt.subplot(2, 2, 1)
plt.plot(df['date'], df['value'])
_label_panel('Time Series Data', 'Date', 'Value')
plt.xticks(rotation=45)  # tilt the date labels

# Panel 2 (top-right): distribution of values in 30 bins.
plt.subplot(2, 2, 2)
plt.hist(df['value'], bins=30, alpha=0.7, color='skyblue')
_label_panel('Value Distribution', 'Value', 'Frequency')

# Panel 3 (bottom-left): one box per category.
plt.subplot(2, 2, 3)
sns.boxplot(data=df, x='category', y='value')
_label_panel('Value by Category', 'Category', 'Value')

# Panel 4 (bottom-right): value against row index, one colour per category.
plt.subplot(2, 2, 4)
colors = {'A': 'red', 'B': 'blue', 'C': 'green'}
# Iterate in order of first appearance so legend entries follow the data.
for category in df['category'].unique():
    subset = df.loc[df['category'] == category]
    plt.scatter(
        subset.index,
        subset['value'],
        c=colors[category],
        label=category,
        alpha=0.6,
    )
_label_panel('Scatter Plot by Category', 'Index', 'Value')
plt.legend()

# Resolve overlaps between panels, then display the figure.
plt.tight_layout()
plt.show()

# Advanced analysis
# (Header emoji were stored as mis-decoded UTF-8 bytes; restored here.)
print("\n🔍 Advanced Analysis:")

# Correlation between the value and its row position, used as a stand-in for time.
# Series.corr() requires a Series operand; passing the bare Index object is
# rejected with a TypeError, so the index is first turned into a Series that
# shares df's index and therefore aligns row-for-row with df['value'].
time_index = df.index.to_series()
correlation = df['value'].corr(time_index)
print(f"Correlation with time: {correlation:.4f}")

# Rolling statistics: mean over a sliding 30-row window; the last entry
# covers the final 30 rows of the dataset.
rolling_mean = df['value'].rolling(window=30).mean()
print(f"30-day rolling mean (latest): {rolling_mean.iloc[-1]:.2f}")

# Growth rate: percentage change from the first value to the last one.
first_value = df['value'].iloc[0]
last_value = df['value'].iloc[-1]
growth_rate = (last_value - first_value) / first_value * 100
print(f"Total growth rate: {growth_rate:.2f}%")

print("\n✅ Data analysis complete!")