Spaces:

ratulsur
/

BI_ANALYTICS

Configuration error

App Files Files Community

BI_ANALYTICS / eda_analyzer.py

ratulsur

Upload 13 files

98bc1c2 verified 8 months ago

raw

history blame contribute delete

6.95 kB

	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	import plotly.express as px
	import plotly.graph_objects as go
	from scipy.stats import pearsonr, spearmanr
	import json
	from groq import Groq
	import os

	class EDAAnalyzer:
	    """Exploratory data analysis helper for pandas DataFrames.

	    Computes summary statistics, pairwise correlations and per-column
	    distribution statistics, renders PNG plots with matplotlib/seaborn, and
	    optionally asks a Groq chat model for narrative insights. Without an API
	    key (see ``set_api_key``) canned fallback insights are returned instead.
	    """

	    # Chat model requested for insight generation. Kept as a class attribute
	    # so it can be overridden without editing the request code.
	    # NOTE(review): hosted model ids get retired over time -- confirm this
	    # one is still served by the Groq API.
	    INSIGHTS_MODEL = "llama-3.1-70b-versatile"

	    # Characters replaced by "_" when a column label is turned into a file
	    # name (path separators, drive colons, shell wildcards, spaces).
	    _UNSAFE_FILENAME_CHARS = '<>:"/\\|?* '

	    def __init__(self):
	        self.client = None
	        # 'seaborn-v0_8' is the matplotlib >= 3.6 name of the style; older
	        # releases ship it as 'seaborn'. An unknown style raises OSError, so
	        # try both and keep the default style if neither is available.
	        for style_name in ('seaborn-v0_8', 'seaborn'):
	            try:
	                plt.style.use(style_name)
	            except OSError:
	                continue
	            break

	    def set_api_key(self, api_key):
	        """Set Groq API key"""
	        self.client = Groq(api_key=api_key)

	    def analyze(self, df, output_dir=None):
	        """Perform comprehensive EDA on ``df``.

	        Args:
	            df: pandas DataFrame to analyze.
	            output_dir: Optional directory for the generated PNG files. It is
	                created if missing. When omitted, plots are written to the
	                current working directory and returned as bare file names.

	        Returns:
	            A ``(results, plots)`` tuple. ``results`` is a dict with the keys
	            ``'summary'``, ``'distributions'``, ``'insights'`` and, when at
	            least two numeric columns exist, ``'correlations'``. ``plots`` is
	            the list of image paths that were written.
	        """
	        results = {}
	        plots = []

	        if output_dir:
	            os.makedirs(output_dir, exist_ok=True)

	        numeric_df = df.select_dtypes(include=[np.number])
	        categorical_cols = df.select_dtypes(include=['object', 'category']).columns

	        # Basic statistics
	        results['summary'] = {
	            'total_records': len(df),
	            'total_features': len(df.columns),
	            'numerical_features': len(numeric_df.columns),
	            'categorical_features': len(categorical_cols),
	            # Cast so the summary holds a plain int rather than a NumPy scalar.
	            'missing_values': int(df.isnull().sum().sum()),
	        }

	        # Correlation analysis (needs at least two numeric columns)
	        if len(numeric_df.columns) > 1:
	            correlation_matrix = numeric_df.corr()
	            results['correlations'] = self._extract_strong_correlations(correlation_matrix)
	            plots.append(self._plot_correlation_heatmap(correlation_matrix, output_dir))

	        # Distribution analysis
	        results['distributions'] = {}
	        for column in numeric_df.columns:
	            if column == 'ID':
	                continue
	            series = numeric_df[column]
	            results['distributions'][column] = {
	                'mean': self._rounded(series.mean()),
	                'std': self._rounded(series.std()),
	                'min': self._rounded(series.min()),
	                'max': self._rounded(series.max()),
	                'median': self._rounded(series.median()),
	                'skewness': self._rounded(series.skew()),
	            }
	            # Missing values carry no information for the plots; a column
	            # with no values at all is skipped instead of failing.
	            values = series.dropna()
	            if not values.empty:
	                plots.append(self._plot_numeric_distribution(column, values, output_dir))

	        # Categorical analysis
	        for column in categorical_cols:
	            if column == 'ID':
	                continue
	            value_counts = df[column].value_counts()
	            # value_counts() drops nulls, so an all-null column has nothing
	            # to draw.
	            if not value_counts.empty:
	                plots.append(
	                    self._plot_categorical_distribution(column, value_counts, output_dir)
	                )

	        # Generate AI insights
	        results['insights'] = self._generate_insights(df, results)

	        return results, plots

	    @staticmethod
	    def _rounded(value, ndigits=2):
	        """Round ``value``, first unwrapping NumPy scalars to Python numbers."""
	        if isinstance(value, np.generic):
	            value = value.item()
	        return round(value, ndigits)

	    @classmethod
	    def _safe_filename(cls, column):
	        """Return a lower-cased file-name stem for a column label.

	        Works for non-string labels and replaces whitespace, control
	        characters and path-hostile characters with ``_``.
	        """
	        lowered = str(column).lower()
	        cleaned = ''.join(
	            '_' if (ch in cls._UNSAFE_FILENAME_CHARS or ch.isspace() or not ch.isprintable())
	            else ch
	            for ch in lowered
	        )
	        return cleaned or 'column'

	    @staticmethod
	    def _plot_path(file_name, output_dir):
	        """Join ``file_name`` onto ``output_dir`` when one was given."""
	        return os.path.join(output_dir, file_name) if output_dir else file_name

	    def _plot_correlation_heatmap(self, correlation_matrix, output_dir=None):
	        """Save the annotated correlation heatmap and return its path."""
	        path = self._plot_path('correlation_heatmap.png', output_dir)
	        fig = plt.figure(figsize=(10, 8))
	        try:
	            sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
	            plt.title('Feature Correlation Matrix')
	            plt.tight_layout()
	            plt.savefig(path, dpi=300, bbox_inches='tight')
	        finally:
	            # Always release the figure, even if drawing or saving failed.
	            plt.close(fig)
	        return path

	    def _plot_numeric_distribution(self, column, values, output_dir=None):
	        """Save a histogram + box plot for one numeric column; return the path."""
	        path = self._plot_path(f'{self._safe_filename(column)}_distribution.png', output_dir)
	        fig = plt.figure(figsize=(10, 6))
	        try:
	            plt.subplot(1, 2, 1)
	            plt.hist(values, bins=30, alpha=0.7, edgecolor='black')
	            plt.title(f'{column} Distribution')
	            plt.xlabel(column)
	            plt.ylabel('Frequency')

	            plt.subplot(1, 2, 2)
	            plt.boxplot(values)
	            plt.title(f'{column} Box Plot')
	            plt.ylabel(column)

	            plt.tight_layout()
	            plt.savefig(path, dpi=300, bbox_inches='tight')
	        finally:
	            plt.close(fig)
	        return path

	    def _plot_categorical_distribution(self, column, value_counts, output_dir=None):
	        """Save a bar chart of category counts for one column; return the path."""
	        path = self._plot_path(f'{self._safe_filename(column)}_distribution.png', output_dir)
	        fig = plt.figure(figsize=(10, 6))
	        try:
	            value_counts.plot(kind='bar')
	            plt.title(f'{column} Distribution')
	            plt.xlabel(column)
	            plt.ylabel('Count')
	            plt.xticks(rotation=45)
	            plt.tight_layout()
	            plt.savefig(path, dpi=300, bbox_inches='tight')
	        finally:
	            plt.close(fig)
	        return path

	    def _extract_strong_correlations(self, corr_matrix, threshold=0.5):
	        """Extract correlations above threshold.

	        Walks the upper triangle of ``corr_matrix`` and returns a list of
	        ``{'var1', 'var2', 'correlation'}`` dicts for every pair whose
	        absolute correlation is at least ``threshold``. NaN entries never
	        satisfy the comparison and are therefore omitted.
	        """
	        columns = list(corr_matrix.columns)
	        strong_correlations = []

	        for i, first in enumerate(columns):
	            for j in range(i + 1, len(columns)):
	                corr_value = corr_matrix.iloc[i, j]
	                if abs(corr_value) >= threshold:
	                    strong_correlations.append({
	                        'var1': first,
	                        'var2': columns[j],
	                        # Plain float keeps the result JSON-serializable.
	                        'correlation': round(float(corr_value), 3),
	                    })

	        return strong_correlations

	    @classmethod
	    def _to_builtin(cls, value):
	        """Recursively convert NumPy scalars/tuples into JSON-friendly builtins."""
	        if isinstance(value, np.generic):
	            return value.item()
	        if isinstance(value, dict):
	            converted = {}
	            for key, item in value.items():
	                if isinstance(key, np.generic):
	                    key = key.item()
	                # json.dumps only accepts str/int/float/bool/None keys.
	                if key is not None and not isinstance(key, (str, int, float, bool)):
	                    key = str(key)
	                converted[key] = cls._to_builtin(item)
	            return converted
	        if isinstance(value, (list, tuple)):
	            return [cls._to_builtin(item) for item in value]
	        return value

	    @staticmethod
	    def _parse_insights(response):
	        """Parse the model reply into a non-empty list of insight strings.

	        Accepts a bare JSON array, an array wrapped in extra text such as a
	        Markdown code fence, or a JSON object holding the array as one of its
	        values.

	        Raises:
	            ValueError: If no non-empty JSON array can be recovered.
	        """
	        text = response.strip()
	        try:
	            parsed = json.loads(text)
	        except ValueError:
	            # Fall back to the outermost [...] span of the reply.
	            start, end = text.find('['), text.rfind(']')
	            if start == -1 or end <= start:
	                raise ValueError('model response does not contain a JSON array') from None
	            parsed = json.loads(text[start:end + 1])

	        if isinstance(parsed, dict):
	            parsed = next((v for v in parsed.values() if isinstance(v, list)), None)
	        if not isinstance(parsed, list) or not parsed:
	            raise ValueError('model response is not a non-empty JSON array')
	        return [item if isinstance(item, str) else str(item) for item in parsed]

	    def _generate_insights(self, df, results):
	        """Generate AI-powered insights.

	        Returns the fallback insights when no client is configured or when
	        the request or the parsing of its reply fails for any reason.
	        """
	        if not self.client:
	            return self._get_mock_insights()

	        try:
	            # Prepare data summary for AI. NumPy scalars (e.g. the min/max of
	            # integer columns) are not JSON-serializable, so normalize first.
	            data_summary = self._to_builtin({
	                'columns': list(df.columns),
	                'shape': df.shape,
	                'correlations': results.get('correlations', []),
	                'distributions': results.get('distributions', {}),
	            })

	            system_prompt = (
	                "You are a data scientist analyzing marketing data. Generate 3-5 key "
	                "insights based on the data summary provided. Focus on actionable "
	                "business insights."
	            )

	            # default=str is deliberate: any remaining exotic value (such as a
	            # timestamp column label) is only needed as text inside the prompt.
	            user_prompt = "\n".join([
	                f"Data Summary: {json.dumps(data_summary, indent=2, default=str)}",
	                "",
	                "Generate key insights about this marketing dataset. Focus on:",
	                "1. Customer behavior patterns",
	                "2. Important correlations",
	                "3. Distribution characteristics",
	                "4. Business implications",
	                "",
	                "Return insights as a JSON array of strings.",
	            ])

	            completion = self.client.chat.completions.create(
	                messages=[
	                    {"role": "system", "content": system_prompt},
	                    {"role": "user", "content": user_prompt},
	                ],
	                model=self.INSIGHTS_MODEL,
	                temperature=0.7,
	                max_tokens=1024,
	            )

	            return self._parse_insights(completion.choices[0].message.content)

	        except Exception as e:
	            # Best-effort boundary: insights are optional, so any failure
	            # degrades to the canned list instead of aborting the analysis.
	            print(f"Error generating insights: {e}")
	            return self._get_mock_insights()

	    def _get_mock_insights(self):
	        """Fallback mock insights"""
	        return [
	            "Strong correlation patterns detected between customer demographics and purchase behavior",
	            "Customer age distribution shows normal pattern with peak in 30-40 age range",
	            "Purchase amounts vary significantly across different product categories",
	            "Marketing channel effectiveness differs by customer segment",
	            "Seasonal patterns visible in customer engagement metrics",
	        ]