smart-analytics-copilot / app /insight_generator.py
SamadhiDBS's picture
Upload 24 files
d18f851 verified
##________generate natural language insights from analysis_________##
import os
import json
from typing import Dict, Any
##___________________________
class InsightGenerator:
    """Turn analysis results into short, human-readable insight strings.

    Insights are produced from fixed templates by default. When constructed
    with ``use_openai=True`` *and* an ``api_key``, ``generate_openai_insights``
    asks an OpenAI chat model instead and falls back to the templates on any
    failure.
    """

    def __init__(self, use_openai=False, api_key=None):
        """Configure the generator.

        Args:
            use_openai: Request OpenAI-backed insights.
            api_key: OpenAI API key. Without a key the generator stays in
                template mode even if ``use_openai`` is true.
        """
        self.openai = None
        # The flag must agree with what is actually configured: without a key
        # there is no client, so the OpenAI path must not be attempted.
        self.use_openai = bool(use_openai and api_key)
        if self.use_openai:
            import openai

            openai.api_key = api_key
            self.openai = openai
        else:
            print(" Using template-based insight generation")

    def generate_insights(self, df, schema, analysis):
        """Generate template-based insights for a dataset.

        Args:
            df: Object supporting ``len()`` and a ``columns`` attribute
                (presumably a pandas DataFrame).
            schema: Accepted for interface compatibility; not used here.
            analysis: Mapping of analysis sections (``descriptive_stats``,
                ``correlations``, ``trends``, ``group_analysis``,
                ``outliers``, ``distributions``). Missing sections are
                treated as empty.

        Returns:
            A list of insight strings, starting with the dataset overview.
        """
        return self._build_template_insights(len(df), len(df.columns), analysis)

    def _build_template_insights(self, n_rows, n_cols, analysis):
        """Assemble all template insights from row/column counts and analysis."""
        analysis = analysis or {}
        insights = []
        # 1. Dataset overview
        insights.append(
            f" **Dataset Overview**: Your dataset has {n_rows} rows and {n_cols} columns."
        )
        # 2. Key statistics
        insights.extend(
            self._generate_statistical_insights(analysis.get('descriptive_stats') or {})
        )
        # 3. Correlation insights
        insights.extend(
            self._generate_correlation_insights(analysis.get('correlations') or [])
        )
        # 4. Trend insights
        insights.extend(self._generate_trend_insights(analysis.get('trends') or []))
        # 5. Group analysis insights
        insights.extend(
            self._generate_group_insights(analysis.get('group_analysis') or {})
        )
        # 6. Outlier insights
        insights.extend(self._generate_outlier_insights(analysis.get('outliers') or {}))
        # 7. Distribution insights
        insights.extend(
            self._generate_distribution_insights(analysis.get('distributions') or {})
        )
        # 8. Actionable recommendations
        insights.extend(self._generate_recommendations(analysis))
        return insights

    def _generate_statistical_insights(self, stats):
        """Flag columns whose mean and median differ by more than 20%.

        Columns whose stats carry no ``mean`` or ``median`` are skipped.
        At most three insights are returned.
        """
        insights = []
        for col, values in stats.items():
            mean = values.get('mean')
            median = values.get('median')
            if mean is None or median is None:
                # Nothing to compare (e.g. stats for a non-numeric column).
                continue
            if mean > median * 1.2:
                insights.append(
                    f" **{col}** is right-skewed (mean {mean:.2f} > median {median:.2f}), "
                    "suggesting some high values pulling the average up."
                )
            elif median > mean * 1.2:
                insights.append(
                    f" **{col}** is left-skewed (median {median:.2f} > mean {mean:.2f})."
                )
        return insights[:3]  # limit to top 3

    def _generate_correlation_insights(self, correlations):
        """Describe the first three correlation entries.

        Each entry yields two lines: the correlation itself and a plain
        explanation of its direction. Any ``strength`` other than
        ``'positive'`` is worded as negative.
        """
        insights = []
        for corr in correlations[:3]:  # Top 3 correlations
            is_positive = corr['strength'] == 'positive'
            strength = "strong positive" if is_positive else "strong negative"
            insights.append(
                f" **{corr['col1']}** and **{corr['col2']}** show a {strength} "
                f"correlation ({corr['correlation']:.2f})."
            )
            if is_positive:
                insights.append(
                    f" → When {corr['col1']} increases, {corr['col2']} tends to increase as well."
                )
            else:
                insights.append(
                    f" → When {corr['col1']} increases, {corr['col2']} tends to decrease."
                )
        return insights

    def _generate_trend_insights(self, trends):
        """Report trends whose absolute percent change exceeds 5%.

        Changes above 20% are worded as significant; changes of 5% or less
        produce no insight.
        """
        insights = []
        for trend in trends:
            direction = "increased" if trend['direction'] == 'increasing' else "decreased"
            change_abs = abs(trend['percent_change'])
            if change_abs > 20:
                insights.append(
                    f" **{trend['column']}** has {direction} significantly by {change_abs:.1f}% over time."
                )
            elif change_abs > 5:
                insights.append(
                    f" **{trend['column']}** has {direction} by {change_abs:.1f}% over the period."
                )
        return insights

    def _generate_group_insights(self, group_analysis):
        """Name the top category for each (categorical, numeric) column pair.

        Pairs with a falsy ``top_category`` are skipped. At most three
        insights are returned.
        """
        insights = []
        for cat_col, analyses in group_analysis.items():
            for num_col, result in analyses.items():
                if result['top_category']:
                    insights.append(
                        f" **{result['top_category']}** is the top performer in {cat_col} "
                        f"for {num_col} with {result['top_value']:.2f}."
                    )
        return insights[:3]

    def _generate_outlier_insights(self, outliers):
        """Report columns that contain a small share (< 5%) of outliers.

        Columns with no outliers at all are skipped so the output never
        suggests investigating "0 outliers".
        """
        insights = []
        for col, data in outliers.items():
            if data['count'] > 0 and data['percentage'] < 5:
                insights.append(
                    f" **{col}** contains {data['count']} outliers ({data['percentage']:.1f}% of data). "
                    "These might be worth investigating."
                )
        return insights

    def _generate_distribution_insights(self, distributions):
        """Report columns whose shape is not ``'approximately normal'``.

        At most two insights are returned.
        """
        insights = []
        for col, dist in distributions.items():
            if dist['shape'] != 'approximately normal':
                insights.append(
                    f" **{col}** has a {dist['shape']} distribution (skewness: {dist['skewness']:.2f})."
                )
        return insights[:2]

    def _generate_recommendations(self, analysis):
        """Derive recommendations from correlations and declining trends.

        Returns a single status line when nothing else applies. Missing
        ``correlations`` / ``trends`` sections are treated as empty.
        """
        recommendations = []
        # Check for opportunities
        correlations = analysis.get('correlations') or []
        if correlations:
            # Only the first entry is considered — presumably the list is
            # sorted strongest-first upstream; verify against the producer.
            strong_corr = correlations[0]
            if strong_corr['strength'] == 'positive':
                recommendations.append(
                    f" **Recommendation**: Focus on increasing {strong_corr['col1']} "
                    f"to potentially boost {strong_corr['col2']}."
                )
        # Check for declining trends; only the first qualifying one is reported.
        for trend in analysis.get('trends') or []:
            if trend['direction'] == 'decreasing' and abs(trend['percent_change']) > 10:
                recommendations.append(
                    f" **Action Required**: {trend['column']} is declining. Consider investigating causes."
                )
                break
        if not recommendations:
            recommendations.append(
                " **Status**: No urgent issues detected. Continue monitoring key metrics."
            )
        return recommendations

    def generate_openai_insights(self, df_summary, analysis):
        """Generate insights with OpenAI, falling back to templates.

        Args:
            df_summary: Mapping with ``rows`` (row count), ``columns``
                (column count) and ``column_names``.
            analysis: Same mapping of analysis sections that
                ``generate_insights`` accepts.

        Returns:
            A one-element list holding the model's reply, or the
            template-based insight list when OpenAI is not configured or the
            request fails.
        """
        if not self.use_openai or getattr(self, 'openai', None) is None:
            return self._template_insights_from_summary(df_summary, analysis)
        prompt = (
            "\n"
            "You are a data analyst. Analyze this dataset and provide key business insights:\n"
            f"Dataset: {df_summary['rows']} rows, {df_summary['columns']} columns\n"
            f"Columns: {df_summary['column_names']}\n"
            f"Key Statistics: {analysis.get('descriptive_stats', {})}\n"
            f"Correlations: {analysis.get('correlations', [])}\n"
            f"Trends: {analysis.get('trends', [])}\n"
            "Provide:\n"
            "1. Top 3 key findings\n"
            "2. One actionable recommendation\n"
            "3. One question the user should explore further\n"
            "Keep it concise and business-friendly.\n"
        )
        try:
            # NOTE(review): ChatCompletion.create is the pre-1.0 openai SDK
            # interface — confirm the pinned SDK version still provides it.
            response = self.openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": prompt}],
                max_tokens=300
            )
            return [response.choices[0].message.content]
        except Exception as e:
            # Deliberate best-effort: any API failure degrades to templates.
            print(f"OpenAI error: {e}")
            return self._template_insights_from_summary(df_summary, analysis)

    def _template_insights_from_summary(self, df_summary, analysis):
        """Build template insights from a summary dict instead of a DataFrame."""
        return self._build_template_insights(
            df_summary['rows'], df_summary['columns'], analysis
        )