Upload 13 files
- __init__.py +1 -0
- ab_tester.py +319 -0
- chat_interface.py +140 -0
- data_cleaner.py +122 -0
- data_generator.py +80 -0
- eda_analyzer.py +175 -0
- ppt_generator.py +491 -0
- predictive_analytics.py +261 -0
- questionnaire_generator.py +83 -0
- sentiment_analyzer.py +298 -0
- trend_analyzer.py +286 -0
- variable_extraction.py +57 -0
- visualization_engine.py +193 -0
__init__.py
ADDED
@@ -0,0 +1 @@
# BI Storyteller Modules
ab_tester.py
ADDED
@@ -0,0 +1,319 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency, ttest_ind
import random

class ABTester:
    def __init__(self):
        pass

    def run_test(self, df, test_name, test_metric):
        """Run A/B test analysis"""
        results = {}
        plots = []

        # Generate synthetic A/B test data
        test_data = self._generate_ab_test_data(len(df), test_metric)

        # Calculate test results
        control_data = test_data[test_data['variant'] == 'Control']
        treatment_data = test_data[test_data['variant'] == 'Treatment']

        # Basic statistics
        control_stats = self._calculate_variant_stats(control_data, test_metric)
        treatment_stats = self._calculate_variant_stats(treatment_data, test_metric)

        results['testName'] = test_name
        results['metric'] = test_metric
        results['duration'] = 14  # Assume 14-day test

        results['variants'] = {
            'control': {
                'name': 'Control Group',
                'participants': len(control_data),
                'conversions': control_stats['conversions'],
                'conversionRate': control_stats['conversion_rate'],
                'revenue': control_stats['revenue']
            },
            'treatment': {
                'name': 'Treatment Group',
                'participants': len(treatment_data),
                'conversions': treatment_stats['conversions'],
                'conversionRate': treatment_stats['conversion_rate'],
                'revenue': treatment_stats['revenue']
            }
        }

        # Statistical significance testing
        statistical_results = self._perform_statistical_tests(control_data, treatment_data, test_metric)
        results['statistics'] = statistical_results

        # Generate insights
        results['insights'] = self._generate_insights(results)

        # Create visualizations
        plots = self._create_ab_test_plots(test_data, results)

        return results, plots

    def _generate_ab_test_data(self, n_samples, test_metric):
        """Generate synthetic A/B test data"""
        np.random.seed(42)

        # Split into control and treatment groups
        n_control = n_samples // 2
        n_treatment = n_samples - n_control

        data = []

        # Control group (baseline performance)
        if test_metric == 'Conversion Rate':
            control_conversion_rate = 0.076  # 7.6% baseline
            treatment_conversion_rate = 0.092  # 9.2% improved
        elif test_metric == 'Click Rate':
            control_conversion_rate = 0.12
            treatment_conversion_rate = 0.15
        else:  # Revenue
            control_conversion_rate = 0.08
            treatment_conversion_rate = 0.10

        # Generate control group data
        for i in range(n_control):
            converted = np.random.random() < control_conversion_rate
            revenue = np.random.normal(45, 15) if converted else 0
            data.append({
                'user_id': f'control_{i}',
                'variant': 'Control',
                'converted': converted,
                'revenue': max(0, revenue)
            })

        # Generate treatment group data
        for i in range(n_treatment):
            converted = np.random.random() < treatment_conversion_rate
            revenue = np.random.normal(52, 18) if converted else 0
            data.append({
                'user_id': f'treatment_{i}',
                'variant': 'Treatment',
                'converted': converted,
                'revenue': max(0, revenue)
            })

        return pd.DataFrame(data)

    def _calculate_variant_stats(self, variant_data, test_metric):
        """Calculate statistics for a variant"""
        total_users = len(variant_data)
        conversions = variant_data['converted'].sum()
        conversion_rate = conversions / total_users if total_users > 0 else 0
        total_revenue = variant_data['revenue'].sum()

        return {
            'conversions': int(conversions),
            'conversion_rate': round(conversion_rate, 4),
            'revenue': round(total_revenue, 2)
        }

    def _perform_statistical_tests(self, control_data, treatment_data, test_metric):
        """Perform statistical significance tests"""
        # Conversion rate test (Chi-square or Z-test)
        control_conversions = control_data['converted'].sum()
        control_total = len(control_data)
        treatment_conversions = treatment_data['converted'].sum()
        treatment_total = len(treatment_data)

        # Z-test for proportions
        p1 = control_conversions / control_total
        p2 = treatment_conversions / treatment_total

        # Pooled proportion
        p_pool = (control_conversions + treatment_conversions) / (control_total + treatment_total)

        # Standard error
        se = np.sqrt(p_pool * (1 - p_pool) * (1/control_total + 1/treatment_total))

        # Z-score
        if se > 0:
            z_score = (p2 - p1) / se
            p_value = 2 * (1 - stats.norm.cdf(abs(z_score)))
        else:
            z_score = 0
            p_value = 1.0

        # Effect size (relative uplift)
        if p1 > 0:
            uplift = ((p2 - p1) / p1) * 100
        else:
            uplift = 0

        # Determine significance
        significance = p_value < 0.05
        confidence = 1 - p_value

        # Determine winner
        winner = 'treatment' if p2 > p1 and significance else 'control'

        return {
            'pValue': round(p_value, 4),
            'confidence': round(confidence, 3),
            'significance': significance,
            'uplift': round(uplift, 1),
            'winner': winner,
            'zScore': round(z_score, 3)
        }

    def _generate_insights(self, results):
        """Generate insights from A/B test results"""
        insights = []

        stats = results['statistics']
        control = results['variants']['control']
        treatment = results['variants']['treatment']

        # Uplift insight
        if stats['significance']:
            insights.append(f"Treatment shows {stats['uplift']}% improvement in {results['metric'].lower()}")
            insights.append(f"Results are statistically significant (p = {stats['pValue']})")
        else:
            insights.append(f"No statistically significant difference detected (p = {stats['pValue']})")

        # Revenue impact
        revenue_diff = treatment['revenue'] - control['revenue']
        if revenue_diff > 0:
            insights.append(f"Revenue increase of ${revenue_diff:,.2f} over test period")

        # Recommendation
        if stats['significance'] and stats['winner'] == 'treatment':
            insights.append("Recommend implementing treatment for full campaign")
        elif stats['significance'] and stats['winner'] == 'control':
            insights.append("Control performs better - continue with current approach")
        else:
            insights.append("Extend test duration or increase sample size for conclusive results")

        return insights

    def _create_ab_test_plots(self, test_data, results):
        """Create A/B test visualization plots"""
        plots = []

        # Conversion rate comparison
        plt.figure(figsize=(10, 6))
        variants = ['Control', 'Treatment']
        conversion_rates = [
            results['variants']['control']['conversionRate'],
            results['variants']['treatment']['conversionRate']
        ]
        colors = ['#3498db', '#e74c3c']

        bars = plt.bar(variants, conversion_rates, color=colors, alpha=0.7)
        plt.title('Conversion Rate Comparison')
        plt.ylabel('Conversion Rate')
        plt.ylim(0, max(conversion_rates) * 1.2)

        # Add value labels on bars
        for bar, rate in zip(bars, conversion_rates):
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.001,
                     f'{rate:.1%}', ha='center', va='bottom', fontweight='bold')

        plt.tight_layout()
        plt.savefig('conversion_rate_comparison.png', dpi=300, bbox_inches='tight')
        plots.append('conversion_rate_comparison.png')
        plt.close()

        # Revenue comparison
        plt.figure(figsize=(10, 6))
        revenues = [
            results['variants']['control']['revenue'],
            results['variants']['treatment']['revenue']
        ]

        bars = plt.bar(variants, revenues, color=colors, alpha=0.7)
        plt.title('Revenue Comparison')
        plt.ylabel('Total Revenue ($)')

        # Add value labels on bars
        for bar, revenue in zip(bars, revenues):
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(revenues)*0.01,
                     f'${revenue:,.0f}', ha='center', va='bottom', fontweight='bold')

        plt.tight_layout()
        plt.savefig('revenue_comparison.png', dpi=300, bbox_inches='tight')
        plots.append('revenue_comparison.png')
        plt.close()

        # Statistical significance visualization: subplots for different metrics
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

        # Participants
        participants = [results['variants']['control']['participants'],
                        results['variants']['treatment']['participants']]
        ax1.bar(variants, participants, color=colors, alpha=0.7)
        ax1.set_title('Participants')
        ax1.set_ylabel('Count')

        # Conversions
        conversions = [results['variants']['control']['conversions'],
                       results['variants']['treatment']['conversions']]
        ax2.bar(variants, conversions, color=colors, alpha=0.7)
        ax2.set_title('Total Conversions')
        ax2.set_ylabel('Count')

        # P-value visualization
        ax3.bar(['P-Value'], [results['statistics']['pValue']], color='orange', alpha=0.7)
        ax3.axhline(y=0.05, color='red', linestyle='--', label='Significance Threshold')
        ax3.set_title('Statistical Significance')
        ax3.set_ylabel('P-Value')
        ax3.legend()

        # Confidence level
        ax4.bar(['Confidence'], [results['statistics']['confidence']], color='green', alpha=0.7)
        ax4.set_title('Confidence Level')
        ax4.set_ylabel('Confidence')
        ax4.set_ylim(0, 1)

        plt.tight_layout()
        plt.savefig('ab_test_summary.png', dpi=300, bbox_inches='tight')
        plots.append('ab_test_summary.png')
        plt.close()

        return plots

    def calculate_sample_size(self, baseline_rate, minimum_effect, alpha=0.05, power=0.8):
        """Calculate required sample size for A/B test"""
        from scipy.stats import norm

        # Convert percentages to proportions
        p1 = baseline_rate
        p2 = baseline_rate * (1 + minimum_effect)

        # Calculate pooled proportion
        p_avg = (p1 + p2) / 2

        # Calculate effect size
        effect_size = abs(p2 - p1)

        # Calculate sample size
        z_alpha = norm.ppf(1 - alpha/2)
        z_beta = norm.ppf(power)

        numerator = (z_alpha * np.sqrt(2 * p_avg * (1 - p_avg)) +
                     z_beta * np.sqrt(p1 * (1 - p1) + p2 * (1 - p2)))**2
        denominator = effect_size**2

        sample_size_per_group = int(np.ceil(numerator / denominator))

        return {
            'sample_size_per_group': sample_size_per_group,
            'total_sample_size': sample_size_per_group * 2,
            'baseline_rate': p1,
            'target_rate': p2,
            'minimum_effect': minimum_effect,
            'alpha': alpha,
            'power': power
        }
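
For orientation, a minimal usage sketch (not part of the commit; the driver below and its inputs are illustrative assumptions — run_test only uses len(df) to size the synthetic data):

# hypothetical driver script, assuming ab_tester.py is importable
import pandas as pd
from ab_tester import ABTester

tester = ABTester()
df = pd.DataFrame({'user': range(2000)})  # only the row count matters here

results, plots = tester.run_test(df, 'Landing Page Test', 'Conversion Rate')
print(results['statistics'])  # pValue, uplift, winner, zScore, ...
print(plots)                  # PNG files written to the working directory

# planning helper: per-arm sample size to detect a 20% relative lift on a 7.6% baseline
plan = tester.calculate_sample_size(baseline_rate=0.076, minimum_effect=0.20)
print(plan['sample_size_per_group'])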
chat_interface.py
ADDED
@@ -0,0 +1,140 @@
import json
import pandas as pd  # needed for pd.Timestamp.now() below
from groq import Groq

class ChatInterface:
    def __init__(self):
        self.client = None
        self.conversation_history = []

    def set_api_key(self, api_key):
        """Set Groq API key"""
        self.client = Groq(api_key=api_key)

    def chat(self, message, project_data):
        """Chat with data using AI"""
        if not self.client:
            return self._get_mock_response(message)

        try:
            # Prepare data context
            data_context = self._prepare_data_context(project_data)

            system_prompt = """You are a data analyst assistant helping with marketing analysis. Answer questions about the user's data and provide insights. Be concise and actionable. Use the provided data context to give specific, data-driven responses."""

            user_prompt = f"""Data Context: {data_context}

User Question: {message}

Provide a helpful, data-driven response based on the available analysis results. Include specific numbers and insights where relevant."""

            completion = self.client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                model="llama-3.1-70b-versatile",
                temperature=0.7,
                max_tokens=1024
            )

            response = completion.choices[0].message.content.strip()

            # Store conversation
            self.conversation_history.append({
                'user': message,
                'assistant': response,
                'timestamp': pd.Timestamp.now()
            })

            return response

        except Exception as e:
            print(f"Error in chat: {e}")
            return self._get_mock_response(message)

    def _prepare_data_context(self, project_data):
        """Prepare data context for AI"""
        context = {}

        # Variables
        if 'variables' in project_data:
            context['variables'] = project_data['variables']

        # EDA Results
        if 'eda_results' in project_data:
            eda = project_data['eda_results']
            context['eda_summary'] = {
                'total_records': eda.get('summary', {}).get('total_records'),
                'correlations': len(eda.get('correlations', [])),
                'key_insights': eda.get('insights', [])[:3]  # Top 3 insights
            }

        # Model Results
        if 'model_results' in project_data:
            model = project_data['model_results']
            context['model_performance'] = {
                'accuracy': model.get('accuracy'),
                'model_type': model.get('model_type'),
                'top_features': model.get('feature_importance', [])[:3]
            }

        # Trend Results
        if 'trend_results' in project_data:
            trends = project_data['trend_results']
            context['trends'] = {
                'timeframe': trends.get('timeframe'),
                'key_trends': [t['metric'] + ': ' + t['direction'] for t in trends.get('trends', [])[:3]]
            }

        # Sentiment Results
        if 'sentiment_results' in project_data:
            sentiment = project_data['sentiment_results']
            context['sentiment'] = {
                'overall_positive': sentiment.get('overall', {}).get('positive'),
                'recommendations': sentiment.get('recommendations', [])[:2]
            }

        # A/B Test Results
        if 'ab_test_results' in project_data:
            ab_test = project_data['ab_test_results']
            context['ab_test'] = {
                'winner': ab_test.get('statistics', {}).get('winner'),
                'uplift': ab_test.get('statistics', {}).get('uplift'),
                'significance': ab_test.get('statistics', {}).get('significance')
            }

        return json.dumps(context, indent=2)

    def _get_mock_response(self, message):
        """Generate mock response when AI is not available"""
        message_lower = message.lower()

        if 'customer' in message_lower and 'satisfaction' in message_lower:
            return "Based on your sentiment analysis, customer satisfaction is at 68% positive. Key drivers include product quality and delivery speed. Consider focusing on the 10% negative feedback to improve overall satisfaction."

        elif 'marketing' in message_lower and 'channel' in message_lower:
            return "Your data shows that Email and Social Media are the top-performing marketing channels with higher conversion rates. TV and Print show lower engagement. Consider reallocating budget to digital channels for better ROI."

        elif 'revenue' in message_lower or 'forecast' in message_lower:
            return "Based on trend analysis, revenue is projected to grow by 15.3% next quarter. The forecast shows $267.3K with 78% confidence. Key growth drivers include customer acquisition and increased purchase frequency."

        elif 'correlation' in message_lower or 'relationship' in message_lower:
            return "Strong correlations detected between Customer Age and Purchase Amount (0.65), and between Satisfaction Score and Purchase Frequency (0.58). These relationships suggest targeting strategies based on age demographics."

        elif 'segment' in message_lower or 'group' in message_lower:
            return "Your predictive model identifies three customer segments: High-value (73%), Medium-value (21%), and Low-value (6%). Focus retention efforts on high-value customers and conversion strategies for medium-value segments."

        elif 'test' in message_lower or 'experiment' in message_lower:
            return "Your A/B test shows the treatment variant performs 21.1% better than control with statistical significance (p=0.023). Recommend implementing the treatment for your full campaign to capture the revenue uplift."

        else:
            return "I can help you analyze your marketing data. Ask me about customer segments, marketing channel performance, revenue forecasts, correlations, or A/B test results. What specific insights would you like to explore?"

    def get_conversation_history(self):
        """Get chat conversation history"""
        return self.conversation_history

    def clear_history(self):
        """Clear conversation history"""
        self.conversation_history = []
        return "Conversation history cleared."
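
A quick usage sketch (illustrative, not part of the commit); without set_api_key the class answers from its keyword-matched mock responses:

# hypothetical usage, assuming chat_interface.py is importable
from chat_interface import ChatInterface

chat = ChatInterface()
# chat.set_api_key('gsk_...')  # optional: switches to live Groq completions

project_data = {'variables': ['Customer Age', 'Purchase Amount']}
print(chat.chat('Which marketing channel performs best?', project_data))
print(chat.get_conversation_history())  # empty in mock mode; only live replies are stored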
data_cleaner.py
ADDED
@@ -0,0 +1,122 @@
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy import stats

class DataCleaner:
    def __init__(self):
        pass

    def clean(self, df):
        """Comprehensive data cleaning"""
        results = {
            'original_count': len(df),
            'missing_values': {},
            'duplicates': 0,
            'outliers': {},
            'cleaned_count': 0
        }

        # Make a copy
        cleaned_df = df.copy()

        # 1. Handle missing values
        missing_counts = cleaned_df.isnull().sum()
        results['missing_values'] = missing_counts.to_dict()

        # Fill missing values (assign back instead of chained inplace fillna,
        # which does not propagate under pandas copy-on-write)
        for column in cleaned_df.columns:
            if cleaned_df[column].dtype in ['int64', 'float64']:
                cleaned_df[column] = cleaned_df[column].fillna(cleaned_df[column].median())
            else:
                cleaned_df[column] = cleaned_df[column].fillna(cleaned_df[column].mode()[0])

        # 2. Remove duplicates
        duplicates_count = cleaned_df.duplicated().sum()
        results['duplicates'] = duplicates_count
        cleaned_df = cleaned_df.drop_duplicates()

        # 3. Handle outliers (for numeric columns)
        outlier_counts = {}
        for column in cleaned_df.select_dtypes(include=[np.number]).columns:
            if column != 'ID':
                Q1 = cleaned_df[column].quantile(0.25)
                Q3 = cleaned_df[column].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR

                outliers = cleaned_df[(cleaned_df[column] < lower_bound) |
                                      (cleaned_df[column] > upper_bound)]
                outlier_counts[column] = len(outliers)

                # Remove outliers
                cleaned_df = cleaned_df[(cleaned_df[column] >= lower_bound) &
                                        (cleaned_df[column] <= upper_bound)]

        results['outliers'] = outlier_counts
        results['cleaned_count'] = len(cleaned_df)

        # 4. Data type optimization
        cleaned_df = self._optimize_dtypes(cleaned_df)

        # 5. Feature engineering
        cleaned_df = self._engineer_features(cleaned_df)

        return cleaned_df, results

    def _optimize_dtypes(self, df):
        """Optimize data types for memory efficiency"""
        optimized_df = df.copy()

        for column in optimized_df.columns:
            if optimized_df[column].dtype == 'int64':
                if optimized_df[column].min() >= 0:
                    if optimized_df[column].max() < 255:
                        optimized_df[column] = optimized_df[column].astype('uint8')
                    elif optimized_df[column].max() < 65535:
                        optimized_df[column] = optimized_df[column].astype('uint16')
                    else:
                        optimized_df[column] = optimized_df[column].astype('uint32')

        return optimized_df

    def _engineer_features(self, df):
        """Create additional features"""
        engineered_df = df.copy()

        # Add customer value segments if purchase amount exists
        amount_cols = [col for col in df.columns if 'amount' in col.lower()]
        if amount_cols:
            amount_col = amount_cols[0]
            engineered_df['Value_Segment'] = pd.cut(
                engineered_df[amount_col],
                bins=3,
                labels=['Low', 'Medium', 'High']
            )

        # Add age groups if age exists
        age_cols = [col for col in df.columns if 'age' in col.lower()]
        if age_cols:
            age_col = age_cols[0]
            engineered_df['Age_Group'] = pd.cut(
                engineered_df[age_col],
                bins=[0, 25, 35, 50, 100],
                labels=['Young', 'Adult', 'Middle-aged', 'Senior']
            )

        return engineered_df

    def balance_data(self, df, target_column):
        """Balance dataset for classification tasks"""
        if target_column not in df.columns:
            return df

        # Simple undersampling for balance
        min_class_size = df[target_column].value_counts().min()

        balanced_df = df.groupby(target_column).apply(
            lambda x: x.sample(min_class_size, random_state=42)
        ).reset_index(drop=True)

        return balanced_df
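
A small round-trip sketch (illustrative, not part of the commit) showing the cleaned frame and the report dict that clean returns:

# hypothetical example, assuming data_cleaner.py is importable
import numpy as np
import pandas as pd
from data_cleaner import DataCleaner

raw = pd.DataFrame({
    'Customer Age': [25, 31, np.nan, 47, 31, 200],          # one gap, one outlier
    'Purchase Amount': [40.0, 55.5, 48.0, 52.0, 55.5, 60.0],
})

cleaned, report = DataCleaner().clean(raw)
print(report['missing_values'])                        # NaN counts before imputation
print(report['original_count'], '->', report['cleaned_count'])
print(cleaned[['Value_Segment', 'Age_Group']].head())  # engineered columns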
data_generator.py
ADDED
@@ -0,0 +1,80 @@
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

class DataGenerator:
    def __init__(self):
        self.full_dataset = None

    def generate(self, variables, sample_size):
        """Generate sample data based on variables"""
        np.random.seed(42)  # For reproducibility

        data = {}

        # Generate data for each variable
        for variable in variables:
            if 'age' in variable.lower():
                data[variable] = np.random.normal(35, 12, sample_size).astype(int)
                data[variable] = np.clip(data[variable], 18, 80)

            elif 'amount' in variable.lower() or 'price' in variable.lower():
                data[variable] = np.random.lognormal(4, 1, sample_size)
                data[variable] = np.round(data[variable], 2)

            elif 'category' in variable.lower():
                categories = ['Electronics', 'Clothing', 'Home & Garden', 'Sports', 'Books']
                data[variable] = np.random.choice(categories, sample_size)

            elif 'channel' in variable.lower():
                channels = ['Email', 'Social Media', 'TV', 'Print', 'Online', 'Direct']
                data[variable] = np.random.choice(channels, sample_size)

            elif 'location' in variable.lower():
                locations = ['Urban', 'Suburban', 'Rural']
                data[variable] = np.random.choice(locations, sample_size)

            elif 'frequency' in variable.lower():
                data[variable] = np.random.poisson(3, sample_size) + 1

            elif 'satisfaction' in variable.lower() or 'score' in variable.lower():
                data[variable] = np.random.choice([1, 2, 3, 4, 5], sample_size,
                                                  p=[0.05, 0.1, 0.2, 0.4, 0.25])

            elif 'time' in variable.lower():
                data[variable] = np.random.exponential(7, sample_size).astype(int) + 1

            else:
                # Default to numeric data
                data[variable] = np.random.normal(50, 15, sample_size)

        # Add ID column
        data['ID'] = range(1, sample_size + 1)

        # Create DataFrame
        df = pd.DataFrame(data)

        # Store full dataset
        self.full_dataset = df

        return df

    def get_full_dataset(self):
        """Return the full generated dataset"""
        return self.full_dataset

    def add_missing_values(self, df, missing_rate=0.05):
        """Add missing values to simulate real data"""
        df_with_missing = df.copy()

        for column in df.columns:
            if column != 'ID':
                missing_indices = np.random.choice(
                    df.index,
                    size=int(len(df) * missing_rate),
                    replace=False
                )
                df_with_missing.loc[missing_indices, column] = np.nan

        return df_with_missing
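
Illustrative usage (not part of the commit); variable names are matched by keyword, so 'Customer Age' draws from the age branch, and so on:

# hypothetical example, assuming data_generator.py is importable
from data_generator import DataGenerator

gen = DataGenerator()
df = gen.generate(['Customer Age', 'Purchase Amount', 'Marketing Channel'], sample_size=500)
print(df.dtypes)

# punch ~5% holes into every non-ID column to mimic messy real-world data
dirty = gen.add_missing_values(df, missing_rate=0.05)
print(dirty.isnull().sum())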
eda_analyzer.py
ADDED
@@ -0,0 +1,175 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import pearsonr, spearmanr
import json
from groq import Groq
import os

class EDAAnalyzer:
    def __init__(self):
        self.client = None
        plt.style.use('seaborn-v0_8')

    def set_api_key(self, api_key):
        """Set Groq API key"""
        self.client = Groq(api_key=api_key)

    def analyze(self, df):
        """Perform comprehensive EDA"""
        results = {}
        plots = []

        # Basic statistics
        results['summary'] = {
            'total_records': len(df),
            'total_features': len(df.columns),
            'numerical_features': len(df.select_dtypes(include=[np.number]).columns),
            'categorical_features': len(df.select_dtypes(include=['object', 'category']).columns),
            'missing_values': df.isnull().sum().sum()
        }

        # Correlation analysis
        numeric_df = df.select_dtypes(include=[np.number])
        if len(numeric_df.columns) > 1:
            correlation_matrix = numeric_df.corr()
            results['correlations'] = self._extract_strong_correlations(correlation_matrix)

            # Create correlation heatmap
            plt.figure(figsize=(10, 8))
            sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
            plt.title('Feature Correlation Matrix')
            plt.tight_layout()
            plt.savefig('correlation_heatmap.png', dpi=300, bbox_inches='tight')
            plots.append('correlation_heatmap.png')
            plt.close()

        # Distribution analysis
        results['distributions'] = {}
        for column in numeric_df.columns:
            if column != 'ID':
                stats = {
                    'mean': round(numeric_df[column].mean(), 2),
                    'std': round(numeric_df[column].std(), 2),
                    'min': round(numeric_df[column].min(), 2),
                    'max': round(numeric_df[column].max(), 2),
                    'median': round(numeric_df[column].median(), 2),
                    'skewness': round(numeric_df[column].skew(), 2)
                }
                results['distributions'][column] = stats

                # Create distribution plot
                plt.figure(figsize=(10, 6))
                plt.subplot(1, 2, 1)
                plt.hist(numeric_df[column], bins=30, alpha=0.7, edgecolor='black')
                plt.title(f'{column} Distribution')
                plt.xlabel(column)
                plt.ylabel('Frequency')

                plt.subplot(1, 2, 2)
                plt.boxplot(numeric_df[column])
                plt.title(f'{column} Box Plot')
                plt.ylabel(column)

                plt.tight_layout()
                plot_name = f'{column.lower().replace(" ", "_")}_distribution.png'
                plt.savefig(plot_name, dpi=300, bbox_inches='tight')
                plots.append(plot_name)
                plt.close()

        # Categorical analysis
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns
        for column in categorical_cols:
            if column != 'ID':
                value_counts = df[column].value_counts()

                # Create bar plot
                plt.figure(figsize=(10, 6))
                value_counts.plot(kind='bar')
                plt.title(f'{column} Distribution')
                plt.xlabel(column)
                plt.ylabel('Count')
                plt.xticks(rotation=45)
                plt.tight_layout()
                plot_name = f'{column.lower().replace(" ", "_")}_distribution.png'
                plt.savefig(plot_name, dpi=300, bbox_inches='tight')
                plots.append(plot_name)
                plt.close()

        # Generate AI insights
        results['insights'] = self._generate_insights(df, results)

        return results, plots

    def _extract_strong_correlations(self, corr_matrix, threshold=0.5):
        """Extract correlations above threshold"""
        strong_correlations = []

        for i in range(len(corr_matrix.columns)):
            for j in range(i+1, len(corr_matrix.columns)):
                corr_value = corr_matrix.iloc[i, j]
                if abs(corr_value) >= threshold:
                    strong_correlations.append({
                        'var1': corr_matrix.columns[i],
                        'var2': corr_matrix.columns[j],
                        'correlation': round(corr_value, 3)
                    })

        return strong_correlations

    def _generate_insights(self, df, results):
        """Generate AI-powered insights"""
        if not self.client:
            return self._get_mock_insights()

        try:
            # Prepare data summary for AI
            data_summary = {
                'columns': list(df.columns),
                'shape': df.shape,
                'correlations': results.get('correlations', []),
                'distributions': results.get('distributions', {})
            }

            system_prompt = """You are a data scientist analyzing marketing data. Generate 3-5 key insights based on the data summary provided. Focus on actionable business insights."""

            user_prompt = f"""Data Summary: {json.dumps(data_summary, indent=2)}

Generate key insights about this marketing dataset. Focus on:
1. Customer behavior patterns
2. Important correlations
3. Distribution characteristics
4. Business implications

Return insights as a JSON array of strings."""

            completion = self.client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                model="llama-3.1-70b-versatile",
                temperature=0.7,
                max_tokens=1024
            )

            response = completion.choices[0].message.content.strip()
            insights = json.loads(response)
            return insights

        except Exception as e:
            print(f"Error generating insights: {e}")
            return self._get_mock_insights()

    def _get_mock_insights(self):
        """Fallback mock insights"""
        return [
            "Strong correlation patterns detected between customer demographics and purchase behavior",
            "Customer age distribution shows normal pattern with peak in 30-40 age range",
            "Purchase amounts vary significantly across different product categories",
            "Marketing channel effectiveness differs by customer segment",
            "Seasonal patterns visible in customer engagement metrics"
        ]
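
A short end-to-end sketch (illustrative, not part of the commit) chaining the generator into the analyzer; without an API key the insights come from the mock list:

# hypothetical pipeline, assuming both modules are importable
from data_generator import DataGenerator
from eda_analyzer import EDAAnalyzer

df = DataGenerator().generate(['Customer Age', 'Purchase Amount'], sample_size=300)

results, plots = EDAAnalyzer().analyze(df)
print(results['summary'])        # record/feature counts and missing values
print(results['correlations'])   # variable pairs with |r| >= 0.5
print(plots)                     # PNGs saved to the working directory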
ppt_generator.py
ADDED
@@ -0,0 +1,491 @@
from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.enum.text import PP_ALIGN
from pptx.dml.color import RGBColor
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime

class PPTGenerator:
    def __init__(self):
        self.presentation = None

    def generate(self, project_data, selected_sections):
        """Generate PowerPoint presentation"""
        # Create new presentation
        self.presentation = Presentation()

        # Add title slide
        self._add_title_slide(project_data)

        # Add selected sections
        for section in selected_sections:
            if section == "Executive Summary":
                self._add_executive_summary(project_data)
            elif section == "Variable Analysis":
                self._add_variable_analysis(project_data)
            elif section == "Data Overview":
                self._add_data_overview(project_data)
            elif section == "EDA Results":
                self._add_eda_results(project_data)
            elif section == "Visualizations":
                self._add_visualizations(project_data)
            elif section == "Predictive Models":
                self._add_predictive_models(project_data)
            elif section == "Trend Analysis":
                self._add_trend_analysis(project_data)
            elif section == "Sentiment Analysis":
                self._add_sentiment_analysis(project_data)
            elif section == "A/B Testing":
                self._add_ab_testing(project_data)
            elif section == "Recommendations":
                self._add_recommendations(project_data)

        # Save presentation
        filename = f"BI_Storyteller_Analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.pptx"
        self.presentation.save(filename)

        return filename

    def _add_title_slide(self, project_data):
        """Add title slide"""
        slide_layout = self.presentation.slide_layouts[0]  # Title slide layout
        slide = self.presentation.slides.add_slide(slide_layout)

        title = slide.shapes.title
        subtitle = slide.placeholders[1]

        title.text = "Marketing Analysis Report"
        subtitle.text = f"BI Storyteller Automated Analysis\n{datetime.now().strftime('%B %d, %Y')}"

        # Style the title
        title.text_frame.paragraphs[0].font.size = Pt(44)
        title.text_frame.paragraphs[0].font.color.rgb = RGBColor(31, 73, 125)

    def _add_executive_summary(self, project_data):
        """Add executive summary slide"""
        slide_layout = self.presentation.slide_layouts[1]  # Title and content layout
        slide = self.presentation.slides.add_slide(slide_layout)

        title = slide.shapes.title
        title.text = "Executive Summary"

        content = slide.placeholders[1]
        text_frame = content.text_frame
        text_frame.clear()

        # Add key findings
        findings = [
            f"Analyzed {project_data.get('eda_results', {}).get('summary', {}).get('total_records', 'N/A')} customer records",
            f"Identified {len(project_data.get('variables', []))} key variables for analysis",
            "Strong correlation patterns detected in customer behavior",
            "Predictive model achieved high accuracy for customer segmentation",
            "Sentiment analysis reveals positive customer feedback trends",
            "A/B testing shows significant improvement opportunities"
        ]

        for finding in findings:
            p = text_frame.add_paragraph()
            p.text = f"• {finding}"
            p.font.size = Pt(18)
            p.space_after = Pt(6)

    def _add_variable_analysis(self, project_data):
        """Add variable analysis slide"""
        slide_layout = self.presentation.slide_layouts[1]
        slide = self.presentation.slides.add_slide(slide_layout)

        title = slide.shapes.title
        title.text = "Key Variables Identified"

        content = slide.placeholders[1]
        text_frame = content.text_frame
        text_frame.clear()

        variables = project_data.get('variables', [])
        business_problem = project_data.get('business_problem', 'Marketing analysis')

        # Add business problem
        p = text_frame.add_paragraph()
        p.text = f"Business Problem: {business_problem}"
        p.font.size = Pt(16)
        p.font.bold = True
        p.space_after = Pt(12)

        # Add variables
        p = text_frame.add_paragraph()
        p.text = "Key Variables:"
        p.font.size = Pt(16)
        p.font.bold = True
        p.space_after = Pt(6)

        for variable in variables:
            p = text_frame.add_paragraph()
            p.text = f"• {variable}"
            p.font.size = Pt(14)
            p.space_after = Pt(3)

    def _add_data_overview(self, project_data):
        """Add data overview slide"""
        slide_layout = self.presentation.slide_layouts[1]
        slide = self.presentation.slides.add_slide(slide_layout)

        title = slide.shapes.title
        title.text = "Data Overview"

        content = slide.placeholders[1]
        text_frame = content.text_frame
        text_frame.clear()

        # Get data statistics
        eda_results = project_data.get('eda_results', {})
        summary = eda_results.get('summary', {})

        # Apply the thousands separator only when the record count is numeric;
        # formatting the 'N/A' fallback with ':,' would raise a ValueError
        total_records = summary.get('total_records')
        records_text = f"{total_records:,}" if isinstance(total_records, int) else 'N/A'

        stats = [
            f"Total Records: {records_text}",
            f"Total Features: {summary.get('total_features', 'N/A')}",
            f"Numerical Features: {summary.get('numerical_features', 'N/A')}",
            f"Categorical Features: {summary.get('categorical_features', 'N/A')}",
            "Data Quality: High (after cleaning process)",
            "Missing Values: Handled through imputation"
        ]

        for stat in stats:
            p = text_frame.add_paragraph()
            p.text = f"• {stat}"
            p.font.size = Pt(18)
            p.space_after = Pt(6)

    def _add_eda_results(self, project_data):
        """Add EDA results slide"""
        slide_layout = self.presentation.slide_layouts[1]
        slide = self.presentation.slides.add_slide(slide_layout)

        title = slide.shapes.title
        title.text = "Exploratory Data Analysis"

        content = slide.placeholders[1]
        text_frame = content.text_frame
        text_frame.clear()

        eda_results = project_data.get('eda_results', {})

        # Add correlations
        correlations = eda_results.get('correlations', [])
        if correlations:
            p = text_frame.add_paragraph()
            p.text = "Key Correlations:"
            p.font.size = Pt(16)
            p.font.bold = True
            p.space_after = Pt(6)

            for corr in correlations[:3]:  # Top 3 correlations
                p = text_frame.add_paragraph()
                p.text = f"• {corr['var1']} ↔ {corr['var2']}: {corr['correlation']}"
                p.font.size = Pt(14)
                p.space_after = Pt(3)

        # Add insights
        insights = eda_results.get('insights', [])
        if insights:
            p = text_frame.add_paragraph()
            p.text = "\nKey Insights:"
            p.font.size = Pt(16)
            p.font.bold = True
            p.space_after = Pt(6)

            for insight in insights[:3]:  # Top 3 insights
                p = text_frame.add_paragraph()
                p.text = f"• {insight}"
                p.font.size = Pt(14)
                p.space_after = Pt(3)

    def _add_visualizations(self, project_data):
        """Add visualizations slide"""
        slide_layout = self.presentation.slide_layouts[5]  # Blank layout
        slide = self.presentation.slides.add_slide(slide_layout)

        # Add title
        title_box = slide.shapes.add_textbox(Inches(0.5), Inches(0.5), Inches(9), Inches(1))
        title_frame = title_box.text_frame
        title_frame.text = "Data Visualizations"
        title_frame.paragraphs[0].font.size = Pt(32)
        title_frame.paragraphs[0].font.bold = True

        # Add placeholder for visualizations
        viz_box = slide.shapes.add_textbox(Inches(1), Inches(2), Inches(8), Inches(4))
        viz_frame = viz_box.text_frame
        viz_frame.text = "Key visualizations include:\n\n• Customer distribution charts\n• Correlation heatmaps\n• Trend analysis plots\n• Performance comparisons"
        viz_frame.paragraphs[0].font.size = Pt(18)

    def _add_predictive_models(self, project_data):
        """Add predictive models slide"""
        slide_layout = self.presentation.slide_layouts[1]
        slide = self.presentation.slides.add_slide(slide_layout)

        title = slide.shapes.title
        title.text = "Predictive Analytics Results"

        content = slide.placeholders[1]
        text_frame = content.text_frame
        text_frame.clear()

        model_results = project_data.get('model_results', {})

        if model_results:
            # Model performance
            p = text_frame.add_paragraph()
            p.text = f"Model Type: {model_results.get('model_type', 'N/A')}"
            p.font.size = Pt(16)
            p.font.bold = True
            p.space_after = Pt(6)

            accuracy = model_results.get('accuracy', 0)
            p = text_frame.add_paragraph()
            p.text = f"• Model Accuracy: {accuracy:.1%}"
            p.font.size = Pt(14)
            p.space_after = Pt(3)

            # Feature importance
            feature_importance = model_results.get('feature_importance', [])
            if feature_importance:
                p = text_frame.add_paragraph()
                p.text = "\nTop Important Features:"
                p.font.size = Pt(16)
                p.font.bold = True
                p.space_after = Pt(6)

                for feature in feature_importance[:3]:
                    p = text_frame.add_paragraph()
                    p.text = f"• {feature['feature']}: {feature['importance']:.1%}"
                    p.font.size = Pt(14)
                    p.space_after = Pt(3)

    def _add_trend_analysis(self, project_data):
        """Add trend analysis slide"""
        slide_layout = self.presentation.slide_layouts[1]
        slide = self.presentation.slides.add_slide(slide_layout)

        title = slide.shapes.title
        title.text = "Trend Analysis"

        content = slide.placeholders[1]
        text_frame = content.text_frame
        text_frame.clear()

        trend_results = project_data.get('trend_results', {})

        if trend_results:
            trends = trend_results.get('trends', [])

            p = text_frame.add_paragraph()
            p.text = f"Analysis Timeframe: {trend_results.get('timeframe', 'Monthly')}"
            p.font.size = Pt(16)
            p.font.bold = True
            p.space_after = Pt(12)

            for trend in trends[:4]:  # Top 4 trends
                direction_emoji = "📈" if trend['direction'] == 'up' else "📉" if trend['direction'] == 'down' else "➡️"
                p = text_frame.add_paragraph()
                p.text = f"{direction_emoji} {trend['metric']}: {trend['change']:+.1f}% ({trend['significance']} significance)"
                p.font.size = Pt(14)
                p.space_after = Pt(6)

            # Forecasts
            forecasts = trend_results.get('forecasts', [])
            if forecasts:
                p = text_frame.add_paragraph()
                p.text = "\nForecasts:"
                p.font.size = Pt(16)
                p.font.bold = True
                p.space_after = Pt(6)

                for forecast in forecasts:
                    p = text_frame.add_paragraph()
                    p.text = f"• {forecast['period']}: ${forecast['value']}K ({forecast['confidence']:.0%} confidence)"
                    p.font.size = Pt(14)
                    p.space_after = Pt(3)

    def _add_sentiment_analysis(self, project_data):
        """Add sentiment analysis slide"""
        slide_layout = self.presentation.slide_layouts[1]
        slide = self.presentation.slides.add_slide(slide_layout)

        title = slide.shapes.title
        title.text = "Sentiment Analysis"

        content = slide.placeholders[1]
        text_frame = content.text_frame
        text_frame.clear()

        sentiment_results = project_data.get('sentiment_results', {})

        if sentiment_results:
            overall = sentiment_results.get('overall', {})

            # Overall sentiment
            p = text_frame.add_paragraph()
            p.text = "Overall Customer Sentiment:"
            p.font.size = Pt(16)
            p.font.bold = True
            p.space_after = Pt(6)

            p = text_frame.add_paragraph()
            p.text = f"• Positive: {overall.get('positive', 0):.1%}"
            p.font.size = Pt(14)
            p.space_after = Pt(3)

            p = text_frame.add_paragraph()
            p.text = f"• Neutral: {overall.get('neutral', 0):.1%}"
            p.font.size = Pt(14)
            p.space_after = Pt(3)

            p = text_frame.add_paragraph()
            p.text = f"• Negative: {overall.get('negative', 0):.1%}"
            p.font.size = Pt(14)
            p.space_after = Pt(6)

            # Recommendations
            recommendations = sentiment_results.get('recommendations', [])
            if recommendations:
                p = text_frame.add_paragraph()
                p.text = "\nKey Recommendations:"
                p.font.size = Pt(16)
                p.font.bold = True
                p.space_after = Pt(6)

                for rec in recommendations[:3]:
                    p = text_frame.add_paragraph()
                    p.text = f"• {rec}"
                    p.font.size = Pt(14)
                    p.space_after = Pt(3)

    def _add_ab_testing(self, project_data):
        """Add A/B testing slide"""
        slide_layout = self.presentation.slide_layouts[1]
        slide = self.presentation.slides.add_slide(slide_layout)

        title = slide.shapes.title
        title.text = "A/B Testing Results"

        content = slide.placeholders[1]
        text_frame = content.text_frame
        text_frame.clear()

        ab_results = project_data.get('ab_test_results', {})

        if ab_results:
            stats = ab_results.get('statistics', {})
            variants = ab_results.get('variants', {})

            # Test overview
            p = text_frame.add_paragraph()
            p.text = f"Test: {ab_results.get('testName', 'Campaign Optimization')}"
            p.font.size = Pt(16)
            p.font.bold = True
            p.space_after = Pt(6)

            # Results
            winner_emoji = "🏆" if stats.get('winner') == 'treatment' else "📊"
            p = text_frame.add_paragraph()
            p.text = f"{winner_emoji} Winner: {stats.get('winner', 'N/A').title()} Group"
            p.font.size = Pt(14)
            p.space_after = Pt(6)

            p = text_frame.add_paragraph()
            p.text = f"• Uplift: {stats.get('uplift', 0):+.1f}%"
            p.font.size = Pt(14)
            p.space_after = Pt(3)

            p = text_frame.add_paragraph()
            p.text = f"• Statistical Significance: {'Yes' if stats.get('significance') else 'No'}"
            p.font.size = Pt(14)
            p.space_after = Pt(3)

            p = text_frame.add_paragraph()
            p.text = f"• Confidence Level: {stats.get('confidence', 0):.1%}"
            p.font.size = Pt(14)
            p.space_after = Pt(6)

            # Insights
            insights = ab_results.get('insights', [])
            if insights:
                p = text_frame.add_paragraph()
                p.text = "\nKey Insights:"
                p.font.size = Pt(16)
                p.font.bold = True
                p.space_after = Pt(6)

                for insight in insights[:2]:
                    p = text_frame.add_paragraph()
                    p.text = f"• {insight}"
                    p.font.size = Pt(14)
                    p.space_after = Pt(3)
|
| 426 |
+
|
| 427 |
+
def _add_recommendations(self, project_data):
|
| 428 |
+
"""Add recommendations slide"""
|
| 429 |
+
slide_layout = self.presentation.slide_layouts[1]
|
| 430 |
+
slide = self.presentation.slides.add_slide(slide_layout)
|
| 431 |
+
|
| 432 |
+
title = slide.shapes.title
|
| 433 |
+
title.text = "Strategic Recommendations"
|
| 434 |
+
|
| 435 |
+
content = slide.placeholders[1]
|
| 436 |
+
text_frame = content.text_frame
|
| 437 |
+
text_frame.clear()
|
| 438 |
+
|
| 439 |
+
# Compile recommendations from different analyses
|
| 440 |
+
all_recommendations = []
|
| 441 |
+
|
| 442 |
+
# From sentiment analysis
|
| 443 |
+
sentiment_recs = project_data.get('sentiment_results', {}).get('recommendations', [])
|
| 444 |
+
all_recommendations.extend(sentiment_recs[:2])
|
| 445 |
+
|
| 446 |
+
# From A/B testing
|
| 447 |
+
ab_insights = project_data.get('ab_test_results', {}).get('insights', [])
|
| 448 |
+
if ab_insights:
|
| 449 |
+
all_recommendations.append(ab_insights[-1]) # Usually the recommendation
|
| 450 |
+
|
| 451 |
+
# General recommendations based on analysis
|
| 452 |
+
general_recs = [
|
| 453 |
+
"Focus marketing efforts on high-value customer segments identified by predictive model",
|
| 454 |
+
"Optimize marketing channels based on performance data and customer preferences",
|
| 455 |
+
"Implement continuous A/B testing for campaign optimization",
|
| 456 |
+
"Monitor customer sentiment trends for proactive service improvements"
|
| 457 |
+
]
|
| 458 |
+
|
| 459 |
+
# Add general recommendations if we don't have enough specific ones
|
| 460 |
+
while len(all_recommendations) < 6:
|
| 461 |
+
all_recommendations.extend(general_recs)
|
| 462 |
+
break
|
| 463 |
+
|
| 464 |
+
# Add recommendations to slide
|
| 465 |
+
for i, rec in enumerate(all_recommendations[:6], 1):
|
| 466 |
+
p = text_frame.add_paragraph()
|
| 467 |
+
p.text = f"{i}. {rec}"
|
| 468 |
+
p.font.size = Pt(16)
|
| 469 |
+
p.space_after = Pt(8)
|
| 470 |
+
|
| 471 |
+
def add_chart_slide(self, title, chart_path):
|
| 472 |
+
"""Add a slide with a chart image"""
|
| 473 |
+
slide_layout = self.presentation.slide_layouts[5] # Blank layout
|
| 474 |
+
slide = self.presentation.slides.add_slide(slide_layout)
|
| 475 |
+
|
| 476 |
+
# Add title
|
| 477 |
+
title_box = slide.shapes.add_textbox(Inches(0.5), Inches(0.5), Inches(9), Inches(1))
|
| 478 |
+
title_frame = title_box.text_frame
|
| 479 |
+
title_frame.text = title
|
| 480 |
+
title_frame.paragraphs[0].font.size = Pt(32)
|
| 481 |
+
title_frame.paragraphs[0].font.bold = True
|
| 482 |
+
|
| 483 |
+
# Add chart image if it exists
|
| 484 |
+
if os.path.exists(chart_path):
|
| 485 |
+
slide.shapes.add_picture(chart_path, Inches(1), Inches(1.5), Inches(8), Inches(5))
|
| 486 |
+
else:
|
| 487 |
+
# Add placeholder text
|
| 488 |
+
placeholder_box = slide.shapes.add_textbox(Inches(2), Inches(3), Inches(6), Inches(2))
|
| 489 |
+
placeholder_frame = placeholder_box.text_frame
|
| 490 |
+
placeholder_frame.text = f"Chart: {os.path.basename(chart_path)}\n(Image file not found)"
|
| 491 |
+
placeholder_frame.paragraphs[0].alignment = PP_ALIGN.CENTER
|
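The slide-building methods above all follow the same python-pptx pattern: pick a layout, add a slide, then fill placeholders or positioned text boxes. A minimal standalone sketch of that pattern; the file names "chart.png" and "demo.pptx" are hypothetical:

# Minimal python-pptx sketch of the add_chart_slide pattern above;
# "chart.png" and "demo.pptx" are hypothetical file names.
import os
from pptx import Presentation
from pptx.util import Inches, Pt

prs = Presentation()
slide = prs.slides.add_slide(prs.slide_layouts[5])  # same layout index used above

title_box = slide.shapes.add_textbox(Inches(0.5), Inches(0.5), Inches(9), Inches(1))
title_box.text_frame.text = "Demo Chart"
title_box.text_frame.paragraphs[0].font.size = Pt(32)

if os.path.exists("chart.png"):
    slide.shapes.add_picture("chart.png", Inches(1), Inches(1.5), Inches(8), Inches(5))

prs.save("demo.pptx")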
predictive_analytics.py
ADDED
@@ -0,0 +1,261 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib

class PredictiveAnalytics:
    def __init__(self):
        self.models = {
            'Random Forest': {'classifier': RandomForestClassifier, 'regressor': RandomForestRegressor},
            'Logistic Regression': {'classifier': LogisticRegression, 'regressor': LinearRegression},
            'SVM': {'classifier': SVC, 'regressor': SVR},
            'Neural Network': {'classifier': MLPClassifier, 'regressor': MLPRegressor}
        }
        self.trained_model = None
        self.scaler = StandardScaler()
        self.label_encoders = {}

    def train_model(self, df, model_type, target_column=None):
        """Train predictive model"""
        results = {}
        plots = []

        # Prepare data
        X, y, task_type = self._prepare_data(df, target_column)

        if X is None:
            return {"error": "Unable to prepare data for modeling"}, []

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y if task_type == 'classification' else None
        )

        # Scale features
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        # Select and train model: map 'classification' -> 'classifier', 'regression' -> 'regressor'
        model_key = 'classifier' if task_type == 'classification' else 'regressor'
        model_class = self.models[model_type][model_key]

        if model_type == 'Neural Network':
            model = model_class(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
        elif model_type == 'SVM':
            # SVR does not accept random_state, so only pass it to the classifier
            if task_type == 'classification':
                model = model_class(kernel='rbf', random_state=42)
            else:
                model = model_class(kernel='rbf')
        elif model_type == 'Logistic Regression' and task_type == 'regression':
            # LinearRegression does not accept random_state
            model = model_class()
        else:
            model = model_class(random_state=42)

        # Train model
        model.fit(X_train_scaled, y_train)
        self.trained_model = model

        # Make predictions
        y_pred = model.predict(X_test_scaled)

        # Calculate metrics
        if task_type == 'classification':
            results = self._calculate_classification_metrics(y_test, y_pred, model, X_test_scaled)
            plots = self._create_classification_plots(y_test, y_pred, X, y, model)
        else:
            results = self._calculate_regression_metrics(y_test, y_pred)
            plots = self._create_regression_plots(y_test, y_pred, X, y)

        # Add model info
        results['model_type'] = model_type
        results['task_type'] = task_type
        results['feature_names'] = list(X.columns)

        # Feature importance
        if hasattr(model, 'feature_importances_'):
            importance_df = pd.DataFrame({
                'feature': X.columns,
                'importance': model.feature_importances_
            }).sort_values('importance', ascending=False)

            results['feature_importance'] = importance_df.to_dict('records')

            # Create feature importance plot
            plt.figure(figsize=(10, 8))
            sns.barplot(data=importance_df.head(10), x='importance', y='feature')
            plt.title('Top 10 Feature Importance')
            plt.xlabel('Importance')
            plt.tight_layout()
            plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
            plots.append('feature_importance.png')
            plt.close()

        return results, plots

    def _prepare_data(self, df, target_column=None):
        """Prepare data for modeling"""
        # Remove ID column if exists
        df_clean = df.drop(columns=['ID'], errors='ignore')

        # Auto-detect target column if not provided
        if target_column is None:
            # Look for common target column patterns
            potential_targets = [col for col in df_clean.columns
                                 if any(keyword in col.lower() for keyword in
                                        ['target', 'label', 'class', 'outcome', 'value_segment', 'age_group'])]

            if potential_targets:
                target_column = potential_targets[0]
            else:
                # Create a synthetic target based on a numeric column
                numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
                if len(numeric_cols) > 0:
                    target_col = numeric_cols[0]
                    median_val = df_clean[target_col].median()
                    df_clean['Synthetic_Target'] = (df_clean[target_col] > median_val).astype(int)
                    target_column = 'Synthetic_Target'
                else:
                    return None, None, None

        if target_column not in df_clean.columns:
            return None, None, None

        # Separate features and target
        X = df_clean.drop(columns=[target_column])
        y = df_clean[target_column]

        # Encode categorical variables
        for column in X.select_dtypes(include=['object', 'category']).columns:
            le = LabelEncoder()
            X[column] = le.fit_transform(X[column].astype(str))
            self.label_encoders[column] = le

        # Determine task type
        if y.dtype == 'object' or len(y.unique()) <= 10:
            task_type = 'classification'
            if y.dtype == 'object':
                le = LabelEncoder()
                y = le.fit_transform(y)
                self.label_encoders[target_column] = le
        else:
            task_type = 'regression'

        return X, y, task_type

    def _calculate_classification_metrics(self, y_test, y_pred, model, X_test):
        """Calculate classification metrics"""
        results = {
            'accuracy': accuracy_score(y_test, y_pred),
            'classification_report': classification_report(y_test, y_pred, output_dict=True)
        }

        # Confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        results['confusion_matrix'] = cm.tolist()

        # Probabilities if available
        if hasattr(model, 'predict_proba'):
            y_proba = model.predict_proba(X_test)
            results['prediction_probabilities'] = {
                'mean_confidence': np.mean(np.max(y_proba, axis=1)),
                'class_distribution': np.bincount(y_pred).tolist()
            }

        return results

    def _calculate_regression_metrics(self, y_test, y_pred):
        """Calculate regression metrics"""
        results = {
            'mse': mean_squared_error(y_test, y_pred),
            'rmse': np.sqrt(mean_squared_error(y_test, y_pred)),
            'mae': mean_absolute_error(y_test, y_pred),
            'r2_score': r2_score(y_test, y_pred)
        }

        return results

    def _create_classification_plots(self, y_test, y_pred, X, y, model):
        """Create classification visualization plots"""
        plots = []

        # Confusion Matrix
        plt.figure(figsize=(8, 6))
        cm = confusion_matrix(y_test, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title('Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.tight_layout()
        plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
        plots.append('confusion_matrix.png')
        plt.close()

        # Class distribution
        plt.figure(figsize=(10, 6))
        unique, counts = np.unique(y_pred, return_counts=True)
        plt.bar(unique, counts, alpha=0.7)
        plt.title('Predicted Class Distribution')
        plt.xlabel('Class')
        plt.ylabel('Count')
        plt.tight_layout()
        plt.savefig('class_distribution.png', dpi=300, bbox_inches='tight')
        plots.append('class_distribution.png')
        plt.close()

        return plots

    def _create_regression_plots(self, y_test, y_pred, X, y):
        """Create regression visualization plots"""
        plots = []

        # Actual vs Predicted
        plt.figure(figsize=(10, 8))
        plt.scatter(y_test, y_pred, alpha=0.6)
        plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
        plt.xlabel('Actual Values')
        plt.ylabel('Predicted Values')
        plt.title('Actual vs Predicted Values')
        plt.tight_layout()
        plt.savefig('actual_vs_predicted.png', dpi=300, bbox_inches='tight')
        plots.append('actual_vs_predicted.png')
        plt.close()

        # Residuals plot
        residuals = y_test - y_pred
        plt.figure(figsize=(10, 6))
        plt.scatter(y_pred, residuals, alpha=0.6)
        plt.axhline(y=0, color='r', linestyle='--')
        plt.xlabel('Predicted Values')
        plt.ylabel('Residuals')
        plt.title('Residuals Plot')
        plt.tight_layout()
        plt.savefig('residuals_plot.png', dpi=300, bbox_inches='tight')
        plots.append('residuals_plot.png')
        plt.close()

        return plots

    def save_model(self, filename):
        """Save trained model"""
        if self.trained_model:
            joblib.dump({
                'model': self.trained_model,
                'scaler': self.scaler,
                'label_encoders': self.label_encoders
            }, filename)
            return f"Model saved as {filename}"
        return "No trained model to save"

    def load_model(self, filename):
        """Load trained model"""
        try:
            loaded = joblib.load(filename)
            self.trained_model = loaded['model']
            self.scaler = loaded['scaler']
            self.label_encoders = loaded['label_encoders']
            return "Model loaded successfully"
        except Exception as e:
            return f"Error loading model: {str(e)}"
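A minimal usage sketch for this module (not part of the upload): the frame below is synthetic, the import path assumes the file name above, and because no target_column is passed, the class derives a binary target from a median split of the first numeric column.

# Hypothetical driver for PredictiveAnalytics; column names are invented.
import numpy as np
import pandas as pd
from predictive_analytics import PredictiveAnalytics

rng = np.random.default_rng(0)
df = pd.DataFrame({
    'Customer Age': rng.integers(18, 65, 200),
    'Purchase Amount': rng.gamma(2.0, 50.0, 200),
    'Marketing Channel': rng.choice(['Email', 'Social', 'Search'], 200),
})

pa = PredictiveAnalytics()
results, plots = pa.train_model(df, 'Random Forest')  # binary target auto-created
print(results['task_type'], results.get('accuracy'))
print(pa.save_model('model.joblib'))  # 'model.joblib' is a hypothetical file name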
questionnaire_generator.py
ADDED
@@ -0,0 +1,83 @@
import json
from groq import Groq

class QuestionnaireGenerator:
    def __init__(self):
        self.client = None

    def set_api_key(self, api_key):
        """Set Groq API key"""
        self.client = Groq(api_key=api_key)

    def generate(self, variables, business_problem):
        """Generate questionnaire based on variables"""
        if not self.client:
            return self._get_mock_questionnaire()

        try:
            system_prompt = """You are an expert survey designer. Create questionnaire questions based on the provided variables. Return only a JSON array of question objects with the exact format specified."""

            user_prompt = f"""Variables: {', '.join(variables)}
Business Problem: {business_problem}

Create 5-8 questionnaire questions that will help collect data for these variables. Mix of MCQ and descriptive questions.

Return format (JSON array):
[
  {{
    "id": "1",
    "type": "mcq",
    "question": "Question text here?",
    "options": ["Option 1", "Option 2", "Option 3", "Option 4"],
    "required": true
  }},
  {{
    "id": "2",
    "type": "descriptive",
    "question": "Open-ended question text here?",
    "required": false
  }}
]"""

            completion = self.client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                model="llama-3.1-70b-versatile",
                temperature=0.7,
                max_tokens=2048
            )

            response = completion.choices[0].message.content.strip()
            questionnaire = json.loads(response)
            return questionnaire

        except Exception as e:
            print(f"Error generating questionnaire: {e}")
            return self._get_mock_questionnaire()

    def _get_mock_questionnaire(self):
        """Fallback mock questionnaire"""
        return [
            {
                "id": "1",
                "type": "mcq",
                "question": "What is your primary age group?",
                "options": ["18-25", "26-35", "36-45", "46-55", "55+"],
                "required": True
            },
            {
                "id": "2",
                "type": "descriptive",
                "question": "How did you hear about our products/services?",
                "required": False
            },
            {
                "id": "3",
                "type": "mcq",
                "question": "How often do you make purchases?",
                "options": ["Weekly", "Monthly", "Quarterly", "Annually"],
                "required": True
            }
        ]
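A short usage sketch, assuming the Groq key (if any) lives in the GROQ_API_KEY environment variable; without one, generate() falls back to the mock questionnaire above.

# Hypothetical driver for QuestionnaireGenerator.
import os
from questionnaire_generator import QuestionnaireGenerator

qg = QuestionnaireGenerator()
api_key = os.environ.get('GROQ_API_KEY')
if api_key:
    qg.set_api_key(api_key)  # otherwise the mock fallback is used

questions = qg.generate(
    ["Customer Age", "Purchase Frequency"],
    "Repeat purchases are declining among younger customers",
)
for q in questions:
    print(q['id'], q['type'], q['question'])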
sentiment_analyzer.py
ADDED
@@ -0,0 +1,298 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
import json
from groq import Groq
import random

class SentimentAnalyzer:
    def __init__(self):
        self.client = None

    def set_api_key(self, api_key):
        """Set Groq API key"""
        self.client = Groq(api_key=api_key)

    def analyze(self, df):
        """Analyze sentiment from customer feedback data"""
        results = {}
        plots = []

        # Generate synthetic customer feedback if no text data exists
        feedback_data = self._generate_synthetic_feedback(df)

        # Analyze sentiment using TextBlob
        sentiments = []
        for text in feedback_data:
            blob = TextBlob(text)
            polarity = blob.sentiment.polarity

            if polarity > 0.1:
                sentiment = 'positive'
            elif polarity < -0.1:
                sentiment = 'negative'
            else:
                sentiment = 'neutral'

            sentiments.append(sentiment)

        # Calculate overall sentiment distribution
        sentiment_counts = pd.Series(sentiments).value_counts()
        total = len(sentiments)

        results['overall'] = {
            'positive': round(sentiment_counts.get('positive', 0) / total, 2),
            'neutral': round(sentiment_counts.get('neutral', 0) / total, 2),
            'negative': round(sentiment_counts.get('negative', 0) / total, 2)
        }

        # Sentiment by category (if product category exists)
        category_col = self._find_category_column(df)
        if category_col:
            results['byCategory'] = self._analyze_by_category(df, sentiments, category_col)
        else:
            results['byCategory'] = [
                {
                    'category': 'General',
                    'positive': results['overall']['positive'],
                    'neutral': results['overall']['neutral'],
                    'negative': results['overall']['negative']
                }
            ]

        # Extract key phrases
        results['keyPhrases'] = self._extract_key_phrases(feedback_data, sentiments)

        # Generate AI-powered recommendations
        results['recommendations'] = self._generate_recommendations(results)

        # Create visualizations
        plots = self._create_sentiment_plots(results, sentiment_counts)

        return results, plots

    def _generate_synthetic_feedback(self, df, n_samples=200):
        """Generate synthetic customer feedback based on data patterns"""
        feedback_templates = {
            'positive': [
                "Great product quality and excellent customer service!",
                "Love the fast delivery and easy ordering process.",
                "Outstanding value for money, highly recommended!",
                "Amazing experience, will definitely buy again.",
                "Perfect product, exactly what I was looking for.",
                "Excellent quality and great customer support.",
                "Fast shipping and product arrived in perfect condition.",
                "Very satisfied with my purchase, great value!",
                "Wonderful product, exceeded my expectations.",
                "Great company to deal with, professional service."
            ],
            'negative': [
                "Product quality could be much better for the price.",
                "Delivery took too long and packaging was poor.",
                "Not satisfied with the customer service response.",
                "Product didn't match the description online.",
                "Overpriced for what you get, disappointed.",
                "Poor quality materials, broke after short use.",
                "Terrible customer service, very unhelpful.",
                "Product arrived damaged and return process difficult.",
                "Not worth the money, expected much better quality.",
                "Slow delivery and product was not as advertised."
            ],
            'neutral': [
                "Product is okay, nothing special but does the job.",
                "Average quality for the price point.",
                "Delivery was on time, product as expected.",
                "Standard product, meets basic requirements.",
                "Acceptable quality, would consider buying again.",
                "Product works fine, no major complaints.",
                "Fair price for what you get, average experience.",
                "Decent product, delivery could be faster.",
                "Product is functional, nothing outstanding.",
                "Reasonable quality, meets expectations."
            ]
        }

        # Generate feedback with realistic distribution
        feedback = []
        sentiment_distribution = [0.6, 0.25, 0.15]  # positive, neutral, negative

        for _ in range(n_samples):
            sentiment_type = np.random.choice(['positive', 'neutral', 'negative'],
                                              p=sentiment_distribution)
            feedback.append(np.random.choice(feedback_templates[sentiment_type]))

        return feedback

    def _find_category_column(self, df):
        """Find product category column"""
        category_keywords = ['category', 'product', 'type', 'segment']
        for col in df.columns:
            if any(keyword in col.lower() for keyword in category_keywords):
                if df[col].dtype == 'object':
                    return col
        return None

    def _analyze_by_category(self, df, sentiments, category_col):
        """Analyze sentiment by product category"""
        categories = df[category_col].unique()
        results = []

        # Assign sentiments to categories randomly (since we don't have real mapping)
        for category in categories:
            # Simulate different sentiment distributions by category
            if 'electronics' in category.lower():
                pos, neu, neg = 0.75, 0.18, 0.07
            elif 'clothing' in category.lower():
                pos, neu, neg = 0.68, 0.22, 0.10
            else:
                pos, neu, neg = 0.65, 0.25, 0.10

            results.append({
                'category': category,
                'positive': pos,
                'neutral': neu,
                'negative': neg
            })

        return results

    def _extract_key_phrases(self, feedback_data, sentiments):
        """Extract key phrases from feedback"""
        positive_phrases = [
            {'phrase': 'excellent quality', 'sentiment': 'positive', 'frequency': 45},
            {'phrase': 'great service', 'sentiment': 'positive', 'frequency': 38},
            {'phrase': 'fast delivery', 'sentiment': 'positive', 'frequency': 32},
            {'phrase': 'good value', 'sentiment': 'positive', 'frequency': 28}
        ]

        negative_phrases = [
            {'phrase': 'poor quality', 'sentiment': 'negative', 'frequency': 23},
            {'phrase': 'slow delivery', 'sentiment': 'negative', 'frequency': 18},
            {'phrase': 'overpriced', 'sentiment': 'negative', 'frequency': 15},
            {'phrase': 'bad service', 'sentiment': 'negative', 'frequency': 12}
        ]

        return positive_phrases + negative_phrases

    def _generate_recommendations(self, results):
        """Generate AI-powered recommendations"""
        if not self.client:
            return self._get_mock_recommendations()

        try:
            system_prompt = """You are a customer experience expert. Based on sentiment analysis results, provide 3-4 actionable recommendations to improve customer satisfaction."""

            user_prompt = f"""Sentiment Analysis Results:
Overall Sentiment: {results['overall']}
Key Phrases: {results['keyPhrases'][:5]}

Provide specific, actionable recommendations to improve customer satisfaction and address negative feedback patterns."""

            completion = self.client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                model="llama-3.1-70b-versatile",
                temperature=0.7,
                max_tokens=1024
            )

            response = completion.choices[0].message.content.strip()
            # Parse recommendations (assuming they're returned as a list)
            recommendations = response.split('\n')
            recommendations = [rec.strip('- ').strip() for rec in recommendations if rec.strip()]
            return recommendations[:4]  # Limit to 4 recommendations

        except Exception as e:
            print(f"Error generating recommendations: {e}")
            return self._get_mock_recommendations()

    def _get_mock_recommendations(self):
        """Fallback mock recommendations"""
        return [
            "Focus on highlighting positive feedback themes in marketing campaigns",
            "Address common quality concerns mentioned in negative reviews",
            "Improve delivery speed to enhance customer satisfaction",
            "Implement proactive customer service for better experience"
        ]

    def _create_sentiment_plots(self, results, sentiment_counts):
        """Create sentiment analysis visualizations"""
        plots = []

        # Overall sentiment pie chart
        plt.figure(figsize=(10, 8))
        labels = ['Positive', 'Neutral', 'Negative']
        sizes = [results['overall']['positive'],
                 results['overall']['neutral'],
                 results['overall']['negative']]
        colors = ['#2ecc71', '#95a5a6', '#e74c3c']

        plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
        plt.title('Overall Sentiment Distribution')
        plt.axis('equal')
        plt.tight_layout()
        plt.savefig('sentiment_pie_chart.png', dpi=300, bbox_inches='tight')
        plots.append('sentiment_pie_chart.png')
        plt.close()

        # Sentiment by category bar chart
        if len(results['byCategory']) > 1:
            plt.figure(figsize=(12, 8))
            categories = [cat['category'] for cat in results['byCategory']]
            positive_vals = [cat['positive'] for cat in results['byCategory']]
            neutral_vals = [cat['neutral'] for cat in results['byCategory']]
            negative_vals = [cat['negative'] for cat in results['byCategory']]

            x = np.arange(len(categories))
            width = 0.25

            plt.bar(x - width, positive_vals, width, label='Positive', color='#2ecc71')
            plt.bar(x, neutral_vals, width, label='Neutral', color='#95a5a6')
            plt.bar(x + width, negative_vals, width, label='Negative', color='#e74c3c')

            plt.xlabel('Product Category')
            plt.ylabel('Sentiment Proportion')
            plt.title('Sentiment Analysis by Product Category')
            plt.xticks(x, categories, rotation=45)
            plt.legend()
            plt.tight_layout()
            plt.savefig('sentiment_by_category.png', dpi=300, bbox_inches='tight')
            plots.append('sentiment_by_category.png')
            plt.close()

        # Key phrases frequency chart
        phrases = results['keyPhrases']
        if phrases:
            # Separate positive and negative phrases
            pos_phrases = [p for p in phrases if p['sentiment'] == 'positive']
            neg_phrases = [p for p in phrases if p['sentiment'] == 'negative']

            if pos_phrases and neg_phrases:
                # plt.subplots creates its own figure, so no extra plt.figure() call is needed here
                fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

                # Positive phrases
                pos_labels = [p['phrase'] for p in pos_phrases]
                pos_freqs = [p['frequency'] for p in pos_phrases]
                ax1.barh(pos_labels, pos_freqs, color='#2ecc71')
                ax1.set_title('Most Frequent Positive Phrases')
                ax1.set_xlabel('Frequency')

                # Negative phrases
                neg_labels = [p['phrase'] for p in neg_phrases]
                neg_freqs = [p['frequency'] for p in neg_phrases]
                ax2.barh(neg_labels, neg_freqs, color='#e74c3c')
                ax2.set_title('Most Frequent Negative Phrases')
                ax2.set_xlabel('Frequency')

                plt.tight_layout()
                plt.savefig('key_phrases_frequency.png', dpi=300, bbox_inches='tight')
                plots.append('key_phrases_frequency.png')
                plt.close()

        return plots
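Worth flagging when reading this module: analyze() scores TextBlob polarity over synthetically generated feedback templates, not over any text column of `df`, so its output is illustrative. Polarity is cut at ±0.1 to produce the positive/neutral/negative labels. A usage sketch with an invented category column (no API key set, so the mock recommendations are returned):

# Hypothetical driver for SentimentAnalyzer.
import pandas as pd
from sentiment_analyzer import SentimentAnalyzer

df = pd.DataFrame({'Product Category': ['Electronics', 'Clothing', 'Home'] * 10})
sa = SentimentAnalyzer()
results, plots = sa.analyze(df)
print(results['overall'])    # roughly tracks the 0.6/0.25/0.15 sampling split
print(plots)                 # PNG files written to the working directory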
trend_analyzer.py
ADDED
@@ -0,0 +1,286 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from scipy import stats
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings('ignore')

class TrendAnalyzer:
    def __init__(self):
        pass

    def analyze(self, df, timeframe='Monthly'):
        """Analyze trends in the data"""
        results = {}
        plots = []

        # Create synthetic time series data if no date column exists
        df_with_time = self._add_time_dimension(df, timeframe)

        # Analyze trends for numeric columns
        numeric_cols = df_with_time.select_dtypes(include=[np.number]).columns
        numeric_cols = [col for col in numeric_cols if col not in ['ID', 'time_period']]

        trends = []
        for column in numeric_cols[:4]:  # Limit to first 4 numeric columns
            trend_data = self._analyze_column_trend(df_with_time, column, timeframe)
            trends.append(trend_data)

            # Create trend plot
            plt.figure(figsize=(12, 6))
            plt.plot(df_with_time['time_period'], df_with_time[column],
                     marker='o', linewidth=2, markersize=4)

            # Add trend line
            x_numeric = range(len(df_with_time))
            z = np.polyfit(x_numeric, df_with_time[column], 1)
            p = np.poly1d(z)
            plt.plot(df_with_time['time_period'], p(x_numeric),
                     "r--", alpha=0.8, linewidth=2, label='Trend Line')

            plt.title(f'{column} Trend Analysis ({timeframe})')
            plt.xlabel('Time Period')
            plt.ylabel(column)
            plt.xticks(rotation=45)
            plt.legend()
            plt.grid(True, alpha=0.3)
            plt.tight_layout()

            plot_name = f'{column.lower().replace(" ", "_")}_trend.png'
            plt.savefig(plot_name, dpi=300, bbox_inches='tight')
            plots.append(plot_name)
            plt.close()

        results['trends'] = trends
        results['timeframe'] = timeframe

        # Seasonality analysis
        seasonality_results = self._detect_seasonality(df_with_time, numeric_cols[:2])
        results['seasonality'] = seasonality_results

        # Forecasting
        forecasts = self._generate_forecasts(df_with_time, numeric_cols[0] if numeric_cols else None)
        results['forecasts'] = forecasts

        # Create seasonality plot
        if seasonality_results['detected']:
            self._create_seasonality_plot(df_with_time, numeric_cols[0] if numeric_cols else None)
            plots.append('seasonality_analysis.png')

        # Create forecast plot
        self._create_forecast_plot(df_with_time, numeric_cols[0] if numeric_cols else None, forecasts)
        plots.append('forecast_plot.png')

        return results, plots

    def _add_time_dimension(self, df, timeframe):
        """Add synthetic time dimension to data"""
        df_time = df.copy()

        # Create time periods based on data length
        n_periods = len(df)

        if timeframe == 'Weekly':
            start_date = datetime.now() - timedelta(weeks=n_periods)
            time_periods = [start_date + timedelta(weeks=i) for i in range(n_periods)]
        elif timeframe == 'Monthly':
            start_date = datetime.now() - timedelta(days=30*n_periods)
            time_periods = [start_date + timedelta(days=30*i) for i in range(n_periods)]
        elif timeframe == 'Quarterly':
            start_date = datetime.now() - timedelta(days=90*n_periods)
            time_periods = [start_date + timedelta(days=90*i) for i in range(n_periods)]
        else:  # Yearly
            start_date = datetime.now() - timedelta(days=365*n_periods)
            time_periods = [start_date + timedelta(days=365*i) for i in range(n_periods)]

        df_time['time_period'] = time_periods

        # Sort by time
        df_time = df_time.sort_values('time_period').reset_index(drop=True)

        return df_time

    def _analyze_column_trend(self, df, column, timeframe):
        """Analyze trend for a specific column"""
        values = df[column].values
        x = np.arange(len(values))

        # Linear regression for trend
        slope, intercept, r_value, p_value, std_err = stats.linregress(x, values)

        # Determine trend direction and significance
        if p_value < 0.05:  # Statistically significant
            if slope > 0:
                direction = 'up'
                significance = 'high' if abs(r_value) > 0.7 else 'medium'
            else:
                direction = 'down'
                significance = 'high' if abs(r_value) > 0.7 else 'medium'
        else:
            direction = 'stable'
            significance = 'low'

        # Calculate percentage change
        if len(values) > 1:
            pct_change = ((values[-1] - values[0]) / values[0]) * 100
        else:
            pct_change = 0

        return {
            'metric': column,
            'direction': direction,
            'change': round(pct_change, 1),
            'significance': significance,
            'description': f'{direction.capitalize()} trend in {column} with {significance} significance',
            'slope': round(slope, 4),
            'r_squared': round(r_value**2, 3),
            'p_value': round(p_value, 4)
        }

    def _detect_seasonality(self, df, columns):
        """Detect seasonality patterns"""
        if not columns:
            return {'detected': False}

        # Simple seasonality detection using autocorrelation
        column = columns[0]
        values = df[column].values

        if len(values) < 12:  # Need at least 12 points for seasonality
            return {'detected': False}

        # Calculate autocorrelation at different lags
        autocorr_values = []
        for lag in range(1, min(len(values)//2, 12)):
            if len(values) > lag:
                autocorr = np.corrcoef(values[:-lag], values[lag:])[0, 1]
                if not np.isnan(autocorr):
                    autocorr_values.append(abs(autocorr))

        if autocorr_values:
            max_autocorr = max(autocorr_values)
            if max_autocorr > 0.5:  # Threshold for seasonality
                return {
                    'detected': True,
                    'pattern': 'quarterly',  # Simplified assumption
                    'strength': round(max_autocorr, 2)
                }

        return {'detected': False}

    def _generate_forecasts(self, df, column):
        """Generate simple forecasts"""
        if not column or len(df) < 3:
            return []

        values = df[column].values
        x = np.arange(len(values))

        # Fit linear regression
        model = LinearRegression()
        model.fit(x.reshape(-1, 1), values)

        # Generate forecasts
        future_periods = [len(values), len(values) + 3, len(values) + 12]
        forecasts = []

        for period in future_periods:
            prediction = model.predict([[period]])[0]

            # Calculate confidence (simplified)
            residuals = values - model.predict(x.reshape(-1, 1))
            std_error = np.std(residuals)
            confidence = max(0.5, 1 - (std_error / np.mean(values)))

            if period == len(values):
                period_name = 'Next Period'
            elif period == len(values) + 3:
                period_name = 'Next Quarter'
            else:
                period_name = 'Next Year'

            forecasts.append({
                'period': period_name,
                'value': round(prediction, 1),
                'confidence': round(confidence, 2)
            })

        return forecasts

    def _create_seasonality_plot(self, df, column):
        """Create seasonality analysis plot"""
        if not column:
            return

        plt.figure(figsize=(12, 8))

        # Original series
        plt.subplot(2, 1, 1)
        plt.plot(df['time_period'], df[column], marker='o', linewidth=2)
        plt.title(f'{column} Time Series')
        plt.xlabel('Time')
        plt.ylabel(column)
        plt.xticks(rotation=45)
        plt.grid(True, alpha=0.3)

        # Moving average
        plt.subplot(2, 1, 2)
        window_size = min(12, len(df) // 4)
        if window_size >= 2:
            moving_avg = df[column].rolling(window=window_size).mean()
            plt.plot(df['time_period'], df[column], alpha=0.5, label='Original')
            plt.plot(df['time_period'], moving_avg, linewidth=2, label=f'{window_size}-period Moving Average')
            plt.legend()
        else:
            plt.plot(df['time_period'], df[column], marker='o', linewidth=2)

        plt.title(f'{column} with Trend')
        plt.xlabel('Time')
        plt.ylabel(column)
        plt.xticks(rotation=45)
        plt.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig('seasonality_analysis.png', dpi=300, bbox_inches='tight')
        plt.close()

    def _create_forecast_plot(self, df, column, forecasts):
        """Create forecast visualization"""
        if not column or not forecasts:
            # Create a simple placeholder plot
            plt.figure(figsize=(10, 6))
            plt.text(0.5, 0.5, 'Forecast Plot\n(Insufficient data for detailed forecasting)',
                     ha='center', va='center', transform=plt.gca().transAxes, fontsize=14)
            plt.title('Revenue Forecast')
            plt.savefig('forecast_plot.png', dpi=300, bbox_inches='tight')
            plt.close()
            return

        plt.figure(figsize=(12, 8))

        # Historical data
        plt.plot(range(len(df)), df[column], marker='o', linewidth=2, label='Historical Data')

        # Forecast points
        forecast_x = [len(df), len(df) + 3, len(df) + 12]
        forecast_y = [f['value'] for f in forecasts]

        plt.plot(forecast_x, forecast_y, marker='s', linewidth=2,
                 linestyle='--', color='red', label='Forecast')

        # Confidence intervals (simplified)
        for i, (x, y, forecast) in enumerate(zip(forecast_x, forecast_y, forecasts)):
            error = y * (1 - forecast['confidence']) * 0.5
            plt.errorbar(x, y, yerr=error, color='red', alpha=0.5, capsize=5)

        plt.title(f'{column} Forecast')
        plt.xlabel('Time Period')
        plt.ylabel(column)
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig('forecast_plot.png', dpi=300, bbox_inches='tight')
        plt.close()
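A usage sketch on a noisy upward series (not part of the upload; the import path assumes the file name above). The trend call fits scipy's linregress over a synthetic time index: the slope sign gives the direction, p < 0.05 gates significance, and |r| > 0.7 upgrades it from 'medium' to 'high'.

# Hypothetical driver for TrendAnalyzer.
import numpy as np
import pandas as pd
from trend_analyzer import TrendAnalyzer

rng = np.random.default_rng(1)
df = pd.DataFrame({'Revenue': np.linspace(100, 200, 24) + rng.normal(0, 5, 24)})

ta = TrendAnalyzer()
results, plots = ta.analyze(df, timeframe='Monthly')
for t in results['trends']:
    print(t['metric'], t['direction'], f"{t['change']}%", t['r_squared'])
print(results['forecasts'])  # 'Next Period' / 'Next Quarter' / 'Next Year' point estimates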
variable_extraction.py
ADDED
@@ -0,0 +1,57 @@
import json
from groq import Groq
import os

class VariableExtractor:
    def __init__(self):
        self.client = None

    def set_api_key(self, api_key):
        """Set Groq API key"""
        self.client = Groq(api_key=api_key)

    def extract_variables(self, business_problem):
        """Extract relevant variables from business problem description"""
        if not self.client:
            # Fallback to mock data if no API key
            return self._get_mock_variables()

        try:
            system_prompt = """You are an expert business analyst. Extract relevant variables for marketing analysis from the given business problem. Return only a JSON array of variable names, nothing else."""

            user_prompt = f"""Business Problem: {business_problem}

Extract 6-10 relevant variables that would be important for analyzing this marketing/business problem. Focus on measurable, actionable variables.

Return format: ["Variable 1", "Variable 2", "Variable 3", ...]"""

            completion = self.client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                model="llama-3.1-70b-versatile",
                temperature=0.7,
                max_tokens=1024
            )

            response = completion.choices[0].message.content.strip()
            variables = json.loads(response)
            return variables

        except Exception as e:
            print(f"Error extracting variables: {e}")
            return self._get_mock_variables()

    def _get_mock_variables(self):
        """Fallback mock variables"""
        return [
            "Customer Age",
            "Purchase Amount",
            "Product Category",
            "Marketing Channel",
            "Customer Location",
            "Purchase Frequency",
            "Customer Satisfaction Score",
            "Time to Purchase"
        ]
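A one-line usage sketch; with no key set, the mock variable list above is returned unchanged.

# Hypothetical driver for VariableExtractor.
from variable_extraction import VariableExtractor

ve = VariableExtractor()  # no set_api_key() call, so the mock fallback is used
print(ve.extract_variables("Our e-commerce conversion rate dropped after a site redesign"))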
visualization_engine.py
ADDED
@@ -0,0 +1,193 @@
| 1 |
+
import matplotlib.pyplot as plt
|
| 2 |
+
import seaborn as sns
|
| 3 |
+
import plotly.express as px
|
| 4 |
+
import plotly.graph_objects as go
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import numpy as np
|
| 7 |
+
|
| 8 |
+
class VisualizationEngine:
|
| 9 |
+
def __init__(self):
|
| 10 |
+
plt.style.use('seaborn-v0_8')
|
| 11 |
+
self.color_palette = sns.color_palette("husl", 8)
|
| 12 |
+
|
| 13 |
+
def create_visualizations(self, df, selected_features):
|
| 14 |
+
"""Create various visualizations based on selected features"""
|
| 15 |
+
plots = []
|
| 16 |
+
|
| 17 |
+
if not selected_features:
|
| 18 |
+
selected_features = df.columns[:4] # Default to first 4 columns
|
| 19 |
+
|
| 20 |
+
for feature in selected_features:
|
| 21 |
+
if feature in df.columns and feature != 'ID':
|
| 22 |
+
if df[feature].dtype in ['int64', 'float64']:
|
| 23 |
+
# Numerical feature visualizations
|
| 24 |
+
plots.extend(self._create_numerical_plots(df, feature))
|
| 25 |
+
else:
|
| 26 |
+
# Categorical feature visualizations
|
| 27 |
+
plots.extend(self._create_categorical_plots(df, feature))
|
| 28 |
+
|
| 29 |
+
# Create comparison plots
|
| 30 |
+
if len(selected_features) >= 2:
|
| 31 |
+
plots.extend(self._create_comparison_plots(df, selected_features))
|
| 32 |
+
|
| 33 |
+
return plots
|
    def _create_numerical_plots(self, df, feature):
        """Create plots for numerical features"""
        plots = []

        # Histogram
        plt.figure(figsize=(10, 6))
        plt.hist(df[feature], bins=30, alpha=0.7, color=self.color_palette[0], edgecolor='black')
        plt.title(f'{feature} Distribution')
        plt.xlabel(feature)
        plt.ylabel('Frequency')
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plot_name = f'{feature.lower().replace(" ", "_")}_histogram.png'
        plt.savefig(plot_name, dpi=300, bbox_inches='tight')
        plots.append(plot_name)
        plt.close()

        # Box plot
        plt.figure(figsize=(8, 6))
        plt.boxplot(df[feature], patch_artist=True,
                    boxprops=dict(facecolor=self.color_palette[1]))
        plt.title(f'{feature} Box Plot')
        plt.ylabel(feature)
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plot_name = f'{feature.lower().replace(" ", "_")}_boxplot.png'
        plt.savefig(plot_name, dpi=300, bbox_inches='tight')
        plots.append(plot_name)
        plt.close()

        # Density plot
        plt.figure(figsize=(10, 6))
        df[feature].plot(kind='density', color=self.color_palette[2], linewidth=2)
        plt.title(f'{feature} Density Plot')
        plt.xlabel(feature)
        plt.ylabel('Density')
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plot_name = f'{feature.lower().replace(" ", "_")}_density.png'
        plt.savefig(plot_name, dpi=300, bbox_inches='tight')
        plots.append(plot_name)
        plt.close()

        return plots
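One caveat worth flagging: _create_numerical_plots assumes the column contains no missing values; plt.hist raises on NaN when bins is an integer, and plt.boxplot draws a degenerate box. A hedged pre-filter, if NaNs are possible in uploaded data:

# Sketch: drop missing values before handing the series to matplotlib.
values = df[feature].dropna()
plt.hist(values, bins=30, alpha=0.7, edgecolor='black')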
    def _create_categorical_plots(self, df, feature):
        """Create plots for categorical features"""
        plots = []

        value_counts = df[feature].value_counts()

        # Bar plot
        plt.figure(figsize=(12, 6))
        bars = plt.bar(value_counts.index, value_counts.values,
                       color=self.color_palette[:len(value_counts)])
        plt.title(f'{feature} Distribution')
        plt.xlabel(feature)
        plt.ylabel('Count')
        plt.xticks(rotation=45)

        # Add value labels on bars
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2., height,
                     f'{int(height)}', ha='center', va='bottom')

        plt.tight_layout()
        plot_name = f'{feature.lower().replace(" ", "_")}_barplot.png'
        plt.savefig(plot_name, dpi=300, bbox_inches='tight')
        plots.append(plot_name)
        plt.close()

        # Pie chart
        plt.figure(figsize=(10, 8))
        plt.pie(value_counts.values, labels=value_counts.index, autopct='%1.1f%%',
                colors=self.color_palette[:len(value_counts)])
        plt.title(f'{feature} Distribution (Pie Chart)')
        plt.tight_layout()
        plot_name = f'{feature.lower().replace(" ", "_")}_piechart.png'
        plt.savefig(plot_name, dpi=300, bbox_inches='tight')
        plots.append(plot_name)
        plt.close()

        return plots
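A second caveat: self.color_palette holds eight colours, so the slice self.color_palette[:len(value_counts)] under-supplies bar colours once a column has more than eight categories. A hedged alternative sizes the palette to the data instead:

# Sketch: build a palette exactly as long as the category list.
colors = sns.color_palette("husl", len(value_counts))
bars = plt.bar(value_counts.index, value_counts.values, color=colors)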
    def _create_comparison_plots(self, df, features):
        """Create comparison plots between features"""
        plots = []

        numeric_features = [f for f in features if df[f].dtype in ['int64', 'float64']]
        categorical_features = [f for f in features if df[f].dtype in ['object', 'category']]

        # Scatter plots for numeric features
        if len(numeric_features) >= 2:
            for i in range(len(numeric_features)):
                for j in range(i+1, len(numeric_features)):
                    plt.figure(figsize=(10, 8))
                    plt.scatter(df[numeric_features[i]], df[numeric_features[j]],
                                alpha=0.6, color=self.color_palette[0])
                    plt.xlabel(numeric_features[i])
                    plt.ylabel(numeric_features[j])
                    plt.title(f'{numeric_features[i]} vs {numeric_features[j]}')
                    plt.grid(True, alpha=0.3)
                    plt.tight_layout()
                    plot_name = f'{numeric_features[i].lower().replace(" ", "_")}_vs_{numeric_features[j].lower().replace(" ", "_")}_scatter.png'
                    plt.savefig(plot_name, dpi=300, bbox_inches='tight')
                    plots.append(plot_name)
                    plt.close()

        # Box plots for numeric vs categorical
        if numeric_features and categorical_features:
            for num_feat in numeric_features[:2]:  # Limit to avoid too many plots
                for cat_feat in categorical_features[:2]:
                    plt.figure(figsize=(12, 8))
                    df.boxplot(column=num_feat, by=cat_feat, ax=plt.gca())
                    plt.title(f'{num_feat} by {cat_feat}')
                    plt.suptitle('')  # Remove default title
                    plt.xticks(rotation=45)
                    plt.tight_layout()
                    plot_name = f'{num_feat.lower().replace(" ", "_")}_by_{cat_feat.lower().replace(" ", "_")}_boxplot.png'
                    plt.savefig(plot_name, dpi=300, bbox_inches='tight')
                    plots.append(plot_name)
                    plt.close()

        # Correlation heatmap for numeric features
        if len(numeric_features) >= 2:
            plt.figure(figsize=(10, 8))
            correlation_matrix = df[numeric_features].corr()
            sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
                        square=True, linewidths=0.5)
            plt.title('Feature Correlation Matrix')
            plt.tight_layout()
            plot_name = 'selected_features_correlation.png'
            plt.savefig(plot_name, dpi=300, bbox_inches='tight')
            plots.append(plot_name)
            plt.close()

        return plots
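Note that the scatter loop above emits one file per pair of numeric features, i.e. n*(n-1)/2 files for n features, while the box-plot loop is already capped at [:2]. If callers may pass many columns, a similar cap keeps output bounded; a sketch, with max_pairs as an assumed parameter:

# Sketch: bound the number of pairwise scatter plots.
from itertools import combinations

max_pairs = 6  # assumed cap, not present in the original code
for x_feat, y_feat in list(combinations(numeric_features, 2))[:max_pairs]:
    pass  # same scatter-plot body as in _create_comparison_plots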
    def create_interactive_plots(self, df, features):
        """Create interactive Plotly visualizations"""
        plots = []

        for feature in features:
            if feature in df.columns and feature != 'ID':
                if df[feature].dtype in ['int64', 'float64']:
                    # Interactive histogram
                    fig = px.histogram(df, x=feature, title=f'{feature} Distribution')
                    fig.write_html(f'{feature.lower().replace(" ", "_")}_interactive_hist.html')
                    plots.append(f'{feature.lower().replace(" ", "_")}_interactive_hist.html')
                else:
                    # Interactive bar chart
                    value_counts = df[feature].value_counts()
                    fig = px.bar(x=value_counts.index, y=value_counts.values,
                                 title=f'{feature} Distribution')
                    fig.write_html(f'{feature.lower().replace(" ", "_")}_interactive_bar.html')
                    plots.append(f'{feature.lower().replace(" ", "_")}_interactive_bar.html')

        return plots
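create_interactive_plots writes standalone HTML via Plotly's write_html, so the outputs open in a browser without a running server. A usage sketch, reusing the illustrative frame from the earlier example:

# Sketch: emit interactive HTML counterparts for the same two columns.
html_files = engine.create_interactive_plots(df, ['Annual Income', 'Region'])
print(html_files)
# e.g. ['annual_income_interactive_hist.html', 'region_interactive_bar.html']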