##________generate natural language insights from analysis_________##
import os
import json
from typing import Dict, Any


##___________________________
class InsightGenerator:
    """Turn a precomputed analysis dict into human-readable insight strings.

    Insights are produced either from fixed text templates or, when enabled
    and an API key is supplied, by asking an OpenAI chat model.

    The ``analysis`` dict read by this class may contain the keys
    ``descriptive_stats``, ``correlations``, ``trends``, ``group_analysis``,
    ``outliers`` and ``distributions``; a missing key is treated as empty.
    """

    def __init__(self, use_openai=False, api_key=None):
        """Configure the generator.

        Args:
            use_openai: Request OpenAI-backed insight generation.
            api_key: OpenAI API key. Without a key the generator stays in
                template mode even if ``use_openai`` is true.
        """
        # Always define the attribute so later code can rely on it existing.
        self.openai = None
        # The OpenAI path is only usable when a key was actually provided;
        # otherwise a later call would hit an unset client.
        self.use_openai = bool(use_openai and api_key)
        if self.use_openai:
            import openai

            openai.api_key = api_key
            self.openai = openai
        else:
            print(" Using template-based insight generation")

    def generate_insights(self, df, schema, analysis):
        """Generate template-based, human-readable insights.

        Args:
            df: Object supporting ``len()`` and exposing ``.columns``
                (presumably a pandas DataFrame; confirm with callers).
            schema: Accepted for interface compatibility; not used here.
            analysis: Analysis results dict (see class docstring).

        Returns:
            List of insight strings, starting with the dataset overview.
        """
        return self._build_insights(len(df), len(df.columns), analysis)

    def _build_insights(self, row_count, column_count, analysis: Dict[str, Any]):
        """Assemble all template insights from plain counts and the analysis."""
        insights = []

        # 1. Dataset overview
        insights.append(
            f" **Dataset Overview**: Your dataset has {row_count} rows and {column_count} columns."
        )
        # 2. Key statistics
        insights.extend(
            self._generate_statistical_insights(analysis.get('descriptive_stats', {}))
        )
        # 3. Correlation insights
        insights.extend(
            self._generate_correlation_insights(analysis.get('correlations', []))
        )
        # 4. Trend insights
        insights.extend(self._generate_trend_insights(analysis.get('trends', [])))
        # 5. Group analysis insights
        insights.extend(
            self._generate_group_insights(analysis.get('group_analysis', {}))
        )
        # 6. Outlier insights
        insights.extend(self._generate_outlier_insights(analysis.get('outliers', {})))
        # 7. Distribution insights
        insights.extend(
            self._generate_distribution_insights(analysis.get('distributions', {}))
        )
        # 8. Actionable recommendations
        insights.extend(self._generate_recommendations(analysis))

        return insights

    def _generate_statistical_insights(self, stats):
        """Generate skew insights from per-column ``mean`` / ``median`` values.

        Returns at most three insights.
        """
        insights = []
        for col, values in stats.items():
            mean, median = values['mean'], values['median']
            # NOTE(review): the 1.2x ratio test assumes non-negative values;
            # with negative medians the comparison flips. Confirm the inputs.
            if mean > median * 1.2:
                insights.append(
                    f" **{col}** is right-skewed (mean {mean:.2f} > median {median:.2f}), suggesting some high values pulling the average up."
                )
            elif median > mean * 1.2:
                insights.append(
                    f" **{col}** is left-skewed (median {median:.2f} > mean {mean:.2f})."
                )
        return insights[:3]  # limit to top 3

    def _generate_correlation_insights(self, correlations):
        """Generate two lines (summary + interpretation) for the first three correlations."""
        insights = []
        for corr in correlations[:3]:  # Top 3 correlations
            is_positive = corr['strength'] == 'positive'
            strength = "strong positive" if is_positive else "strong negative"
            insights.append(
                f" **{corr['col1']}** and **{corr['col2']}** show a {strength} correlation ({corr['correlation']:.2f})."
            )
            if is_positive:
                insights.append(
                    f" → When {corr['col1']} increases, {corr['col2']} tends to increase as well."
                )
            else:
                insights.append(
                    f" → When {corr['col1']} increases, {corr['col2']} tends to decrease."
                )
        return insights

    def _generate_trend_insights(self, trends):
        """Generate insights for trends whose absolute change exceeds 5%."""
        insights = []
        for trend in trends:
            direction = "increased" if trend['direction'] == 'increasing' else "decreased"
            change_abs = abs(trend['percent_change'])
            if change_abs > 20:
                insights.append(
                    f" **{trend['column']}** has {direction} significantly by {change_abs:.1f}% over time."
                )
            elif change_abs > 5:
                insights.append(
                    f" **{trend['column']}** has {direction} by {change_abs:.1f}% over the period."
                )
        return insights

    def _generate_group_insights(self, group_analysis):
        """Generate up to three "top performer" insights from grouped results."""
        insights = []
        for cat_col, analyses in group_analysis.items():
            for num_col, analysis in analyses.items():
                if analysis['top_category']:
                    insights.append(
                        f" **{analysis['top_category']}** is the top performer in {cat_col} for {num_col} with {analysis['top_value']:.2f}."
                    )
        return insights[:3]

    def _generate_outlier_insights(self, outliers):
        """Generate insights for columns whose outlier share is below 5%."""
        insights = []
        for col, data in outliers.items():
            # NOTE(review): this also reports columns with 0 outliers, and
            # skips columns with >= 5% outliers -- confirm that is intended.
            if data['percentage'] < 5:
                insights.append(
                    f" **{col}** contains {data['count']} outliers ({data['percentage']:.1f}% of data). These might be worth investigating."
                )
        return insights

    def _generate_distribution_insights(self, distributions):
        """Generate up to two insights for columns that are not approximately normal."""
        insights = []
        for col, dist in distributions.items():
            if dist['shape'] != 'approximately normal':
                insights.append(
                    f" **{col}** has a {dist['shape']} distribution (skewness: {dist['skewness']:.2f})."
                )
        return insights[:2]

    def _generate_recommendations(self, analysis):
        """Generate actionable recommendations.

        Uses the first correlation (if positive) and the first trend that is
        decreasing by more than 10%. Falls back to a "no urgent issues"
        status line when neither applies.
        """
        recommendations = []

        # Check for opportunities
        correlations = analysis.get('correlations', [])
        if correlations:
            strong_corr = correlations[0]
            if strong_corr['strength'] == 'positive':
                recommendations.append(
                    f" **Recommendation**: Focus on increasing {strong_corr['col1']} to potentially boost {strong_corr['col2']}."
                )

        # Check for declining trends (only the first match is reported)
        for trend in analysis.get('trends', []):
            if trend['direction'] == 'decreasing' and abs(trend['percent_change']) > 10:
                recommendations.append(
                    f" **Action Required**: {trend['column']} is declining. Consider investigating causes."
                )
                break

        if not recommendations:
            recommendations.append(
                " **Status**: No urgent issues detected. Continue monitoring key metrics."
            )

        return recommendations

    def _template_insights_from_summary(self, df_summary, analysis):
        """Template fallback for callers that only hold a summary dict.

        ``df_summary['rows']`` and ``df_summary['columns']`` are used as the
        row and column counts, matching how the OpenAI prompt reads them.
        """
        return self._build_insights(
            df_summary['rows'], df_summary['columns'], analysis
        )

    def _request_completion(self, prompt):
        """Send ``prompt`` to the chat model using whichever SDK API exists."""
        request = {
            "model": "gpt-3.5-turbo",
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 300,
        }
        if hasattr(self.openai, "OpenAI"):
            # openai>=1.0 removed ChatCompletion; the module-level client
            # honours the openai.api_key set in __init__.
            return self.openai.chat.completions.create(**request)
        return self.openai.ChatCompletion.create(**request)

    def generate_openai_insights(self, df_summary, analysis):
        """Use OpenAI to generate insights, falling back to templates.

        Args:
            df_summary: Dict with ``rows``, ``columns`` and ``column_names``.
            analysis: Analysis results dict (see class docstring).

        Returns:
            A single-element list holding the model's reply, or the
            template-based insight list when OpenAI is disabled or fails.
        """
        if not self.use_openai:
            return self._template_insights_from_summary(df_summary, analysis)

        stats = analysis.get('descriptive_stats', {})
        correlations = analysis.get('correlations', [])
        trends = analysis.get('trends', [])
        prompt = (
            " You are a data analyst. \n"
            "Analyze this dataset and provide key business insights: "
            f"Dataset: {df_summary['rows']} rows, {df_summary['columns']} columns "
            f"Columns: {df_summary['column_names']} "
            f"Key Statistics: {stats} "
            f"Correlations: {correlations} "
            f"Trends: {trends} "
            "Provide: 1. Top 3 key findings 2. One actionable recommendation "
            "3. One question the user should explore further "
            "Keep it concise and business-friendly. "
        )

        try:
            response = self._request_completion(prompt)
            return [response.choices[0].message.content]
        except Exception as e:
            # Best-effort boundary: any SDK/network failure degrades to the
            # template output instead of propagating to the caller.
            print(f"OpenAI error: {e}")
            return self._template_insights_from_summary(df_summary, analysis)