Spaces:
Configuration error
Configuration error
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from groq import Groq
from scipy.stats import pearsonr, spearmanr
class EDAAnalyzer:
    """Exploratory data analysis over a pandas DataFrame.

    ``analyze`` computes summary statistics, strong pairwise correlations and
    per-column distribution statistics, saves PNG plots into the current
    working directory, and attaches a list of textual insights. The insights
    come from the Groq chat API when a client has been configured through
    ``set_api_key``; otherwise (or on any failure) a fixed fallback list is
    used.
    """

    # Chat model requested from Groq.
    # NOTE(review): confirm this model id is still served by the Groq API.
    MODEL = "llama-3.1-70b-versatile"

    def __init__(self):
        """Create an analyzer without an API client and set the plot style."""
        self.client = None
        plt.style.use('seaborn-v0_8')

    def set_api_key(self, api_key):
        """Set Groq API key by building the client used for insight generation."""
        self.client = Groq(api_key=api_key)

    def analyze(self, df):
        """Perform comprehensive EDA on ``df``.

        Args:
            df: DataFrame to analyse.

        Returns:
            A ``(results, plots)`` tuple. ``results`` is a dict with the keys
            ``summary``, ``correlations`` (only when there are at least two
            numeric columns), ``distributions`` and ``insights``. ``plots`` is
            the list of PNG file names written to the working directory.
        """
        results = {}
        plots = []

        numeric_df = df.select_dtypes(include=[np.number])
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns

        # Basic statistics
        results['summary'] = {
            'total_records': len(df),
            'total_features': len(df.columns),
            'numerical_features': len(numeric_df.columns),
            'categorical_features': len(categorical_cols),
            # int(...) turns the NumPy integer into a plain, JSON-friendly int.
            'missing_values': int(df.isnull().sum().sum()),
        }

        # Correlation analysis needs at least one pair of numeric columns.
        if len(numeric_df.columns) > 1:
            correlation_matrix = numeric_df.corr()
            results['correlations'] = self._extract_strong_correlations(correlation_matrix)
            plots.append(self._plot_correlation_heatmap(correlation_matrix))

        # Distribution analysis
        results['distributions'] = {}
        for column in numeric_df.columns:
            if column == 'ID':
                continue
            series = numeric_df[column]
            results['distributions'][column] = self._describe_numeric(series)
            plot_name = self._plot_numeric_distribution(column, series)
            if plot_name is not None:
                plots.append(plot_name)

        # Categorical analysis
        for column in categorical_cols:
            if column == 'ID':
                continue
            plot_name = self._plot_categorical_distribution(column, df[column])
            if plot_name is not None:
                plots.append(plot_name)

        # Generate AI insights
        results['insights'] = self._generate_insights(df, results)
        return results, plots

    @staticmethod
    def _plot_filename(column):
        """Return the PNG file name used for a column's distribution plot."""
        # str() keeps non-string labels (e.g. integer column names) working.
        stem = str(column).lower().replace(" ", "_")
        # Path separators inside a label would otherwise point into a
        # (probably non-existent) sub-directory.
        for separator in ("/", "\\"):
            stem = stem.replace(separator, "_")
        return f'{stem}_distribution.png'

    @staticmethod
    def _describe_numeric(series):
        """Return rounded summary statistics for one numeric column.

        Values are converted to native Python numbers so the result can be
        passed to ``json.dumps``; integer minima/maxima stay integers.
        """
        def _native_round(value):
            if pd.isna(value):
                return float('nan')
            if isinstance(value, np.generic):
                value = value.item()
            return round(value, 2)

        return {
            'mean': _native_round(series.mean()),
            'std': _native_round(series.std()),
            'min': _native_round(series.min()),
            'max': _native_round(series.max()),
            'median': _native_round(series.median()),
            'skewness': _native_round(series.skew()),
        }

    def _plot_correlation_heatmap(self, correlation_matrix):
        """Save the correlation heatmap and return its file name."""
        plot_name = 'correlation_heatmap.png'
        plt.figure(figsize=(10, 8))
        try:
            sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
            plt.title('Feature Correlation Matrix')
            plt.tight_layout()
            plt.savefig(plot_name, dpi=300, bbox_inches='tight')
        finally:
            # Always release the figure, even when drawing or saving fails.
            plt.close()
        return plot_name

    def _plot_numeric_distribution(self, column, series):
        """Save a histogram + box plot for a numeric column.

        Returns the file name, or ``None`` when the column has no non-missing
        values to draw.
        """
        # Missing values are removed first: histogram binning needs a finite
        # data range and the box plot statistics are undefined with NaN.
        values = series.dropna()
        if values.empty:
            return None

        plot_name = self._plot_filename(column)
        plt.figure(figsize=(10, 6))
        try:
            plt.subplot(1, 2, 1)
            plt.hist(values, bins=30, alpha=0.7, edgecolor='black')
            plt.title(f'{column} Distribution')
            plt.xlabel(column)
            plt.ylabel('Frequency')

            plt.subplot(1, 2, 2)
            plt.boxplot(values)
            plt.title(f'{column} Box Plot')
            plt.ylabel(column)

            plt.tight_layout()
            plt.savefig(plot_name, dpi=300, bbox_inches='tight')
        finally:
            plt.close()
        return plot_name

    def _plot_categorical_distribution(self, column, series):
        """Save a bar chart of value counts for a categorical column.

        Returns the file name, or ``None`` when there is nothing to count.
        """
        value_counts = series.value_counts()
        if value_counts.empty:
            return None

        plot_name = self._plot_filename(column)
        plt.figure(figsize=(10, 6))
        try:
            value_counts.plot(kind='bar')
            plt.title(f'{column} Distribution')
            plt.xlabel(column)
            plt.ylabel('Count')
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.savefig(plot_name, dpi=300, bbox_inches='tight')
        finally:
            plt.close()
        return plot_name

    def _extract_strong_correlations(self, corr_matrix, threshold=0.5):
        """Extract correlations above threshold.

        Args:
            corr_matrix: Square correlation DataFrame.
            threshold: Minimum absolute correlation to report.

        Returns:
            A list of ``{'var1', 'var2', 'correlation'}`` dicts, one per
            column pair from the upper triangle whose absolute correlation is
            at least ``threshold``. NaN entries never qualify.
        """
        labels = list(corr_matrix.columns)
        values = corr_matrix.to_numpy()
        strong_correlations = []
        for i, first in enumerate(labels):
            # Only the upper triangle: each unordered pair is reported once.
            for j in range(i + 1, len(labels)):
                corr_value = values[i, j]
                if abs(corr_value) >= threshold:
                    strong_correlations.append({
                        'var1': first,
                        'var2': labels[j],
                        'correlation': round(float(corr_value), 3),
                    })
        return strong_correlations

    @staticmethod
    def _parse_insights(response):
        """Parse the model reply as JSON, tolerating text around the array.

        The reply is first parsed as-is. If that fails, the span from the
        first ``[`` to the last ``]`` is parsed instead, which covers replies
        wrapped in Markdown code fences or preceded by a sentence.

        Raises:
            json.JSONDecodeError: when no parseable JSON is found.
        """
        text = response.strip()
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            start = text.find('[')
            end = text.rfind(']')
            if start == -1 or end <= start:
                raise
            return json.loads(text[start:end + 1])

    def _generate_insights(self, df, results):
        """Generate AI-powered insights.

        Returns the fallback list from ``_get_mock_insights`` when no client
        is configured or when the request / parsing fails for any reason.
        """
        if not self.client:
            return self._get_mock_insights()

        try:
            # Prepare data summary for AI. Labels are stringified so that
            # non-string column names cannot break JSON serialisation.
            data_summary = {
                'columns': [str(name) for name in df.columns],
                'shape': df.shape,
                'correlations': results.get('correlations', []),
                'distributions': {
                    str(name): stats
                    for name, stats in results.get('distributions', {}).items()
                },
            }

            system_prompt = """You are a data scientist analyzing marketing data. Generate 3-5 key insights based on the data summary provided. Focus on actionable business insights."""
            user_prompt = "\n".join([
                f"Data Summary: {json.dumps(data_summary, indent=2)}",
                "Generate key insights about this marketing dataset. Focus on:",
                "1. Customer behavior patterns",
                "2. Important correlations",
                "3. Distribution characteristics",
                "4. Business implications",
                "Return insights as a JSON array of strings.",
            ])

            completion = self.client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                model=self.MODEL,
                temperature=0.7,
                max_tokens=1024,
            )
            response = completion.choices[0].message.content
            return self._parse_insights(response)
        except Exception as e:
            # Best-effort boundary: insight generation must never break the
            # analysis, so report the problem and use the canned insights.
            print(f"Error generating insights: {e}")
            return self._get_mock_insights()

    def _get_mock_insights(self):
        """Fallback mock insights"""
        return [
            "Strong correlation patterns detected between customer demographics and purchase behavior",
            "Customer age distribution shows normal pattern with peak in 30-40 age range",
            "Purchase amounts vary significantly across different product categories",
            "Marketing channel effectiveness differs by customer segment",
            "Seasonal patterns visible in customer engagement metrics",
        ]