Spaces:
Sleeping
Sleeping
| ##________generate natural language insights from analysis_________## | |
| import os | |
| import json | |
| from typing import Dict, Any | |
| ##___________________________ | |
class InsightGenerator:
    """Turn a precomputed analysis dictionary into human-readable insights.

    Two modes are offered:

    * ``generate_insights`` fills fixed text templates from the analysis
      sections (descriptive stats, correlations, trends, group analysis,
      outliers, distributions).
    * ``generate_openai_insights`` sends a summary prompt to the OpenAI chat
      API and falls back to the templates when OpenAI is disabled,
      unconfigured, or the request fails.
    """

    # Mean and median must differ by more than this fraction of the reference
    # value's magnitude before a column is reported as skewed.
    _SKEW_MARGIN = 0.2

    def __init__(self, use_openai=False, api_key=None):
        """Configure the generator.

        Args:
            use_openai: Ask ``generate_openai_insights`` to use OpenAI.
            api_key: OpenAI API key. The ``openai`` package is imported only
                when both ``use_openai`` and ``api_key`` are truthy.
        """
        self.use_openai = use_openai
        # Always define the attribute so later code can check for a client
        # instead of raising AttributeError when no key was supplied.
        self.openai = None
        if use_openai and api_key:
            # Imported lazily so the template path works without the package.
            import openai

            openai.api_key = api_key
            self.openai = openai
        else:
            print(" Using template-based insight generation")

    def generate_insights(self, df, schema, analysis: Dict[str, Any]) -> list:
        """Generate template-based insights for a dataset.

        Args:
            df: Object supporting ``len(df)`` and ``len(df.columns)``.
            schema: Accepted for interface compatibility; not used here.
            analysis: Analysis sections keyed by ``'descriptive_stats'``,
                ``'correlations'``, ``'trends'``, ``'group_analysis'``,
                ``'outliers'`` and ``'distributions'``. Missing sections are
                treated as empty.

        Returns:
            A list of insight strings, starting with the dataset overview.
        """
        return self._build_template_insights(len(df), len(df.columns), analysis)

    def _build_template_insights(self, row_count, column_count,
                                 analysis: Dict[str, Any]) -> list:
        """Assemble every template section from explicit dataset dimensions."""
        # 1. Dataset overview
        insights = [
            f" **Dataset Overview**: Your dataset has {row_count} rows and {column_count} columns."
        ]
        # 2. Key statistics
        insights.extend(self._generate_statistical_insights(analysis.get('descriptive_stats') or {}))
        # 3. Correlation insights
        insights.extend(self._generate_correlation_insights(analysis.get('correlations') or []))
        # 4. Trend insights
        insights.extend(self._generate_trend_insights(analysis.get('trends') or []))
        # 5. Group analysis insights
        insights.extend(self._generate_group_insights(analysis.get('group_analysis') or {}))
        # 6. Outlier insights
        insights.extend(self._generate_outlier_insights(analysis.get('outliers') or {}))
        # 7. Distribution insights
        insights.extend(self._generate_distribution_insights(analysis.get('distributions') or {}))
        # 8. Actionable recommendations
        insights.extend(self._generate_recommendations(analysis))
        return insights

    def _generate_statistical_insights(self, stats: Dict[str, Any]) -> list:
        """Report columns whose mean and median differ by more than 20%.

        Columns without both ``'mean'`` and ``'median'`` are skipped. At most
        three insights are returned.
        """
        insights = []
        for col, values in stats.items():
            mean = values.get('mean')
            median = values.get('median')
            if mean is None or median is None:
                continue
            # The margin is taken from the magnitude of the reference value so
            # that negative data is not reported as skewed when the mean and
            # median are equal (a plain ``median * 1.2`` moves the threshold
            # the wrong way for negative numbers).
            if mean > median + self._SKEW_MARGIN * abs(median):
                insights.append(f" **{col}** is right-skewed (mean {mean:.2f} > median {median:.2f}), suggesting some high values pulling the average up.")
            elif median > mean + self._SKEW_MARGIN * abs(mean):
                insights.append(f" **{col}** is left-skewed (median {median:.2f} > mean {mean:.2f}).")
        return insights[:3]  # limit to top 3

    def _generate_correlation_insights(self, correlations: list) -> list:
        """Describe the first three correlation entries, two lines each.

        Each entry is read through the keys ``'col1'``, ``'col2'``,
        ``'correlation'`` and ``'strength'``; any strength other than
        ``'positive'`` is worded as negative.
        """
        insights = []
        for corr in correlations[:3]:  # Top 3 correlations
            is_positive = corr['strength'] == 'positive'
            strength = "strong positive" if is_positive else "strong negative"
            insights.append(f" **{corr['col1']}** and **{corr['col2']}** show a {strength} correlation ({corr['correlation']:.2f}).")
            if is_positive:
                insights.append(f" → When {corr['col1']} increases, {corr['col2']} tends to increase as well.")
            else:
                insights.append(f" → When {corr['col1']} increases, {corr['col2']} tends to decrease.")
        return insights

    def _generate_trend_insights(self, trends: list) -> list:
        """Describe trends whose absolute percent change exceeds 5%.

        Changes above 20% are worded as significant; changes of 5% or less
        produce no insight.
        """
        insights = []
        for trend in trends:
            direction = "increased" if trend['direction'] == 'increasing' else "decreased"
            change_abs = abs(trend['percent_change'])
            if change_abs > 20:
                insights.append(f" **{trend['column']}** has {direction} significantly by {change_abs:.1f}% over time.")
            elif change_abs > 5:
                insights.append(f" **{trend['column']}** has {direction} by {change_abs:.1f}% over the period.")
        return insights

    def _generate_group_insights(self, group_analysis: Dict[str, Any]) -> list:
        """Name the top category per (categorical, numeric) column pair.

        Entries with a falsy ``'top_category'`` are skipped. At most three
        insights are returned.
        """
        insights = []
        for cat_col, analyses in group_analysis.items():
            for num_col, result in analyses.items():
                if result['top_category']:
                    insights.append(f" **{result['top_category']}** is the top performer in {cat_col} for {num_col} with {result['top_value']:.2f}.")
        return insights[:3]

    def _generate_outlier_insights(self, outliers: Dict[str, Any]) -> list:
        """Flag columns where outliers make up less than 5% of the data.

        Columns with a zero outlier count are skipped so that no
        "contains 0 outliers" line is produced.
        """
        insights = []
        for col, data in outliers.items():
            if data['count'] > 0 and data['percentage'] < 5:
                insights.append(f" **{col}** contains {data['count']} outliers ({data['percentage']:.1f}% of data). These might be worth investigating.")
        return insights

    def _generate_distribution_insights(self, distributions: Dict[str, Any]) -> list:
        """Report up to two columns whose shape is not 'approximately normal'."""
        insights = []
        for col, dist in distributions.items():
            if dist['shape'] != 'approximately normal':
                insights.append(f" **{col}** has a {dist['shape']} distribution (skewness: {dist['skewness']:.2f}).")
        return insights[:2]

    def _generate_recommendations(self, analysis: Dict[str, Any]) -> list:
        """Derive recommendations from the first correlation and the trends.

        Emits a recommendation when the first correlation is positive, an
        action item for the first trend declining by more than 10%, and a
        status line when neither applies. Missing sections count as empty.
        """
        recommendations = []
        # Check for opportunities
        correlations = analysis.get('correlations') or []
        if correlations:
            strong_corr = correlations[0]
            if strong_corr['strength'] == 'positive':
                recommendations.append(f" **Recommendation**: Focus on increasing {strong_corr['col1']} to potentially boost {strong_corr['col2']}.")
        # Check for declining trends; only the first one is reported.
        for trend in analysis.get('trends') or []:
            if trend['direction'] == 'decreasing' and abs(trend['percent_change']) > 10:
                recommendations.append(f" **Action Required**: {trend['column']} is declining. Consider investigating causes.")
                break
        if not recommendations:
            recommendations.append(" **Status**: No urgent issues detected. Continue monitoring key metrics.")
        return recommendations

    def _summary_fallback(self, df_summary: Dict[str, Any],
                          analysis: Dict[str, Any]) -> list:
        """Build template insights from a summary dict instead of a DataFrame."""
        return self._build_template_insights(
            df_summary['rows'], df_summary['columns'], analysis
        )

    def _request_completion(self, prompt: str) -> str:
        """Send ``prompt`` to the chat completion endpoint and return the text.

        openai>=1.0 exposes the call as ``openai.chat.completions.create``;
        earlier releases expose ``openai.ChatCompletion.create``. The newer
        entry point is preferred when the configured module provides it.
        """
        client = self.openai
        if hasattr(client, "chat"):
            create = client.chat.completions.create
        else:
            create = client.ChatCompletion.create
        response = create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=300,
        )
        return response.choices[0].message.content

    def generate_openai_insights(self, df_summary: Dict[str, Any],
                                 analysis: Dict[str, Any]) -> list:
        """Use OpenAI to generate insights, falling back to the templates.

        Args:
            df_summary: Mapping with ``'rows'``, ``'columns'`` (both rendered
                as counts in the prompt) and ``'column_names'``.
            analysis: Analysis sections; see ``generate_insights``.

        Returns:
            A single-element list holding the model's reply, or the
            template-based insights when OpenAI is disabled, no client was
            configured, or the request raises.

        Raises:
            KeyError: If ``df_summary`` lacks ``'rows'`` or ``'columns'``
                (or ``'column_names'`` on the OpenAI path).
        """
        if not self.use_openai or getattr(self, "openai", None) is None:
            return self._summary_fallback(df_summary, analysis)
        prompt = "\n".join([
            "",
            "You are a data analyst. Analyze this dataset and provide key business insights:",
            f"Dataset: {df_summary['rows']} rows, {df_summary['columns']} columns",
            f"Columns: {df_summary['column_names']}",
            f"Key Statistics: {analysis.get('descriptive_stats', {})}",
            f"Correlations: {analysis.get('correlations', [])}",
            f"Trends: {analysis.get('trends', [])}",
            "Provide:",
            "1. Top 3 key findings",
            "2. One actionable recommendation",
            "3. One question the user should explore further",
            "Keep it concise and business-friendly.",
            "",
        ])
        try:
            return [self._request_completion(prompt)]
        except Exception as e:
            # Deliberate best-effort boundary: any client or network failure
            # degrades to the template output instead of crashing the caller.
            print(f"OpenAI error: {e}")
            return self._summary_fallback(df_summary, analysis)