# BI_ANALYTICS / eda_analyzer.py
# Source: Hugging Face repository upload by ratulsur
# Commit: "Upload 13 files" (98bc1c2, verified)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import pearsonr, spearmanr
import json
from groq import Groq
import os
class EDAAnalyzer:
    """Exploratory data analysis for a pandas DataFrame.

    ``analyze`` computes summary statistics, strong pairwise correlations and
    per-column distribution statistics, writes PNG plots into the current
    working directory, and attaches a list of textual insights. Insights come
    from the Groq chat API when a client has been configured via
    ``set_api_key``; otherwise (or on any API/parsing failure) a fixed list of
    placeholder insights is returned.
    """

    # Chat model used for insight generation; override per instance through
    # the constructor.
    # NOTE(review): hosted model IDs get retired over time -- confirm this one
    # is still served by the Groq API before relying on live insights.
    model = "llama-3.1-70b-versatile"

    # Column label that is treated as a record identifier and skipped in the
    # per-column distribution analysis.
    _ID_COLUMN = 'ID'

    # Matplotlib style names to try, in order of preference. The seaborn
    # styles were renamed with a ``-v0_8`` suffix in Matplotlib 3.6.
    _PREFERRED_STYLES = ('seaborn-v0_8', 'seaborn')

    def __init__(self, model=None):
        """Create an analyzer with no API client configured.

        Args:
            model: Optional chat model ID to use instead of the class default.
        """
        self.client = None
        if model is not None:
            self.model = model
        # Pick the first style this Matplotlib build actually ships instead of
        # raising on versions that do not know 'seaborn-v0_8'.
        for style_name in self._PREFERRED_STYLES:
            if style_name in plt.style.available:
                plt.style.use(style_name)
                break

    def set_api_key(self, api_key):
        """Set Groq API key"""
        self.client = Groq(api_key=api_key)

    def analyze(self, df):
        """Perform comprehensive EDA.

        Args:
            df: The pandas DataFrame to analyze.

        Returns:
            A ``(results, plots)`` tuple. ``results`` is a dict with the keys
            ``summary``, ``correlations`` (only when there are at least two
            numeric columns), ``distributions`` and ``insights``. ``plots`` is
            the list of PNG file names written to the current directory.
        """
        results = {}
        plots = []

        numeric_df = df.select_dtypes(include=[np.number])
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns

        # Basic statistics (plain ints so the dict is JSON-serializable).
        results['summary'] = {
            'total_records': len(df),
            'total_features': len(df.columns),
            'numerical_features': len(numeric_df.columns),
            'categorical_features': len(categorical_cols),
            'missing_values': int(df.isnull().sum().sum()),
        }

        # Correlation analysis
        if len(numeric_df.columns) > 1:
            correlation_matrix = numeric_df.corr()
            results['correlations'] = self._extract_strong_correlations(correlation_matrix)
            plots.append(self._plot_correlation_heatmap(correlation_matrix))

        # Distribution analysis
        results['distributions'] = {}
        for column in numeric_df.columns:
            if column == self._ID_COLUMN:
                continue
            series = numeric_df[column]
            results['distributions'][column] = self._numeric_stats(series)
            plot_name = self._plot_numeric_distribution(column, series)
            if plot_name is not None:
                plots.append(plot_name)

        # Categorical analysis
        for column in categorical_cols:
            if column == self._ID_COLUMN:
                continue
            plot_name = self._plot_categorical_distribution(column, df[column])
            if plot_name is not None:
                plots.append(plot_name)

        # Generate AI insights
        results['insights'] = self._generate_insights(df, results)
        return results, plots

    @staticmethod
    def _numeric_stats(series):
        """Return rounded descriptive statistics of *series* as plain floats.

        Values are converted with ``float`` before rounding: rounding a NumPy
        integer scalar yields another NumPy integer, which ``json.dumps``
        cannot serialize.
        """
        raw_stats = {
            'mean': series.mean(),
            'std': series.std(),
            'min': series.min(),
            'max': series.max(),
            'median': series.median(),
            'skewness': series.skew(),
        }
        return {name: round(float(value), 2) for name, value in raw_stats.items()}

    @staticmethod
    def _plot_filename(column):
        """Build the PNG file name for a column's distribution plot."""
        # str() tolerates non-string column labels (e.g. integers).
        stem = str(column).lower().replace(" ", "_")
        # Path separators in a label would otherwise point into a directory.
        for separator in ('/', '\\'):
            stem = stem.replace(separator, '_')
        return f'{stem}_distribution.png'

    @staticmethod
    def _save_and_close(fig, file_name):
        """Save *fig* to *file_name*, always releasing the figure afterwards."""
        try:
            fig.tight_layout()
            fig.savefig(file_name, dpi=300, bbox_inches='tight')
        finally:
            plt.close(fig)
        return file_name

    def _plot_correlation_heatmap(self, correlation_matrix):
        """Write the correlation heatmap and return its file name."""
        fig, ax = plt.subplots(figsize=(10, 8))
        try:
            sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
            ax.set_title('Feature Correlation Matrix')
        except Exception:
            plt.close(fig)
            raise
        return self._save_and_close(fig, 'correlation_heatmap.png')

    def _plot_numeric_distribution(self, column, series):
        """Write a histogram + box plot for a numeric column.

        Returns the file name, or ``None`` when the column has no non-missing
        values to draw.
        """
        # Missing values make the box plot's percentiles NaN (empty box).
        values = series.dropna()
        if values.empty:
            return None

        fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(10, 6))
        try:
            hist_ax.hist(values, bins=30, alpha=0.7, edgecolor='black')
            hist_ax.set_title(f'{column} Distribution')
            hist_ax.set_xlabel(column)
            hist_ax.set_ylabel('Frequency')

            box_ax.boxplot(values)
            box_ax.set_title(f'{column} Box Plot')
            box_ax.set_ylabel(column)
        except Exception:
            plt.close(fig)
            raise
        return self._save_and_close(fig, self._plot_filename(column))

    def _plot_categorical_distribution(self, column, series):
        """Write a bar chart of value counts for a categorical column.

        Returns the file name, or ``None`` when the column has no values to
        count.
        """
        value_counts = series.value_counts()
        if value_counts.empty:
            return None

        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            value_counts.plot(kind='bar', ax=ax)
            ax.set_title(f'{column} Distribution')
            ax.set_xlabel(column)
            ax.set_ylabel('Count')
            ax.tick_params(axis='x', labelrotation=45)
        except Exception:
            plt.close(fig)
            raise
        return self._save_and_close(fig, self._plot_filename(column))

    def _extract_strong_correlations(self, corr_matrix, threshold=0.5):
        """Extract correlations above threshold.

        Args:
            corr_matrix: Square correlation DataFrame (as from ``df.corr()``).
            threshold: Minimum absolute correlation for a pair to be reported.

        Returns:
            A list of ``{'var1', 'var2', 'correlation'}`` dicts, one per
            column pair in the upper triangle whose absolute correlation is
            at least *threshold*. NaN correlations never qualify.
        """
        labels = corr_matrix.columns
        strong_correlations = []
        # Upper triangle without the diagonal: each unordered pair once, in
        # row-major order.
        for i, j in zip(*np.triu_indices(len(labels), k=1)):
            corr_value = corr_matrix.iloc[i, j]
            if abs(corr_value) >= threshold:
                strong_correlations.append({
                    'var1': labels[i],
                    'var2': labels[j],
                    'correlation': round(float(corr_value), 3),
                })
        return strong_correlations

    @staticmethod
    def _json_default(value):
        """Fallback encoder for values ``json`` cannot serialize natively.

        NumPy scalars are unwrapped to their Python equivalents; anything
        else is stringified, which is acceptable here because the output is
        only used as free-text context in a prompt.
        """
        if isinstance(value, np.generic):
            return value.item()
        return str(value)

    @staticmethod
    def _parse_insights(response_text):
        """Parse the model reply into a list of insight strings.

        Accepts a bare JSON array as well as an array wrapped in Markdown
        code fences or surrounding prose.

        Raises:
            ValueError: If no JSON array can be recovered from the reply
                (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        """
        text = response_text.strip()
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            # Fall back to the outermost [...] span in the reply.
            start, end = text.find('['), text.rfind(']')
            if start == -1 or end <= start:
                raise
            parsed = json.loads(text[start:end + 1])
        if not isinstance(parsed, list):
            raise ValueError("Expected a JSON array of insights")
        return [item if isinstance(item, str) else str(item) for item in parsed]

    def _generate_insights(self, df, results):
        """Generate AI-powered insights.

        Returns the placeholder insights when no client is configured or
        when the API call or response parsing fails for any reason.
        """
        if not self.client:
            return self._get_mock_insights()
        try:
            # Prepare data summary for AI. Column labels are stringified
            # because JSON object keys must be strings.
            data_summary = {
                'columns': list(df.columns),
                'shape': df.shape,
                'correlations': results.get('correlations', []),
                'distributions': {
                    str(column): stats
                    for column, stats in results.get('distributions', {}).items()
                },
            }
            summary_json = json.dumps(data_summary, indent=2, default=self._json_default)
            system_prompt = """You are a data scientist analyzing marketing data. Generate 3-5 key insights based on the data summary provided. Focus on actionable business insights."""
            user_prompt = (
                f"Data Summary: {summary_json}\n"
                "Generate key insights about this marketing dataset. Focus on:\n"
                "1. Customer behavior patterns\n"
                "2. Important correlations\n"
                "3. Distribution characteristics\n"
                "4. Business implications\n"
                "Return insights as a JSON array of strings."
            )
            completion = self.client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                model=self.model,
                temperature=0.7,
                max_tokens=1024
            )
            return self._parse_insights(completion.choices[0].message.content)
        except Exception as e:
            # Best-effort boundary: insights are optional, so report the
            # failure and fall back rather than aborting the whole analysis.
            print(f"Error generating insights: {e}")
            return self._get_mock_insights()

    def _get_mock_insights(self):
        """Fallback mock insights"""
        return [
            "Strong correlation patterns detected between customer demographics and purchase behavior",
            "Customer age distribution shows normal pattern with peak in 30-40 age range",
            "Purchase amounts vary significantly across different product categories",
            "Marketing channel effectiveness differs by customer segment",
            "Seasonal patterns visible in customer engagement metrics"
        ]