Spaces:

SamadhiDBS
/

smart-analytics-copilot

Sleeping

App Files Files Community

smart-analytics-copilot / app /insight_generator.py

SamadhiDBS

Upload 24 files

d18f851 verified about 1 month ago

raw

history blame contribute delete

7.44 kB

	##________generate natural language insights from analysis_________##

	import os
	import json
	from typing import Dict, Any


	##___________________________
	class InsightGenerator:
	    """Turn a precomputed analysis dict into human-readable insight strings.

	    Two modes are supported:

	    * template mode (default): insights are assembled from fixed sentence
	      templates, one group per analysis section;
	    * OpenAI mode: a single chat-completion request is sent and its reply
	      is returned; any failure falls back to template mode.

	    The ``analysis`` dict is read under these keys, each optional:
	    ``descriptive_stats``, ``correlations``, ``trends``, ``group_analysis``,
	    ``outliers`` and ``distributions``. A missing or empty section simply
	    contributes no insights.
	    """

	    def __init__(self, use_openai=False, api_key=None):
	        """Configure the generator.

	        Args:
	            use_openai: Request OpenAI-backed generation.
	            api_key: OpenAI API key. Without a key OpenAI mode cannot work,
	                so the generator silently stays in template mode.

	        Raises:
	            ImportError: If OpenAI mode is requested with a key but the
	                ``openai`` package is not installed.
	        """
	        # OpenAI mode is only real when both the flag and a key are present;
	        # otherwise later calls would touch a client that was never set up.
	        self.use_openai = bool(use_openai and api_key)
	        self.openai = None
	        if self.use_openai:
	            # Imported lazily so template mode has no dependency on openai.
	            import openai
	            openai.api_key = api_key
	            self.openai = openai
	        else:
	            print(" Using template-based insight generation")

	    def generate_insights(self, df, schema, analysis):
	        """Generate template-based insights for a dataset.

	        Args:
	            df: Object supporting ``len()`` and exposing ``.columns``; only
	                its row and column counts are used.
	            schema: Accepted for interface compatibility; not used here.
	            analysis: Analysis dict (see the class docstring for its keys).

	        Returns:
	            A list of insight strings, starting with the dataset overview.
	        """
	        return self._build_template_insights(len(df), len(df.columns), analysis)

	    def _build_template_insights(self, n_rows, n_cols, analysis):
	        """Assemble all template insights from row/column counts and analysis."""
	        insights = [
	            f" Dataset Overview: Your dataset has {n_rows} rows and {n_cols} columns."
	        ]

	        # Each section is optional: a missing key yields an empty container
	        # so the matching generator just returns no insights.
	        insights.extend(self._generate_statistical_insights(analysis.get('descriptive_stats') or {}))
	        insights.extend(self._generate_correlation_insights(analysis.get('correlations') or []))
	        insights.extend(self._generate_trend_insights(analysis.get('trends') or []))
	        insights.extend(self._generate_group_insights(analysis.get('group_analysis') or {}))
	        insights.extend(self._generate_outlier_insights(analysis.get('outliers') or {}))
	        insights.extend(self._generate_distribution_insights(analysis.get('distributions') or {}))
	        insights.extend(self._generate_recommendations(analysis))

	        return insights

	    def _generate_statistical_insights(self, stats):
	        """Report columns whose mean and median differ by more than 20%.

	        Args:
	            stats: Mapping of column name to a dict with ``mean`` and
	                ``median`` entries.

	        Returns:
	            At most three skewness insight strings.
	        """
	        insights = []

	        for col, values in stats.items():
	            mean = values['mean']
	            median = values['median']
	            # The 20% margin is taken on the absolute value so negative
	            # columns are not flagged merely because scaling a negative
	            # number by 1.2 makes it smaller.
	            if mean - median > 0.2 * abs(median):
	                insights.append(f" {col} is right-skewed (mean {mean:.2f} > median {median:.2f}), suggesting some high values pulling the average up.")
	            elif median - mean > 0.2 * abs(mean):
	                insights.append(f" {col} is left-skewed (median {median:.2f} > mean {mean:.2f}).")

	        return insights[:3]  # limit to top 3

	    def _generate_correlation_insights(self, correlations):
	        """Describe the first three correlations in the given list.

	        Args:
	            correlations: List of dicts with ``col1``, ``col2``,
	                ``strength`` and ``correlation`` entries. Any ``strength``
	                other than ``'positive'`` is worded as negative.

	        Returns:
	            Two strings per correlation: the statement and its reading.
	        """
	        insights = []

	        for corr in correlations[:3]:
	            is_positive = corr['strength'] == 'positive'
	            strength = "strong positive" if is_positive else "strong negative"
	            insights.append(f" {corr['col1']} and {corr['col2']} show a {strength} correlation ({corr['correlation']:.2f}).")

	            if is_positive:
	                insights.append(f" → When {corr['col1']} increases, {corr['col2']} tends to increase as well.")
	            else:
	                insights.append(f" → When {corr['col1']} increases, {corr['col2']} tends to decrease.")

	        return insights

	    def _generate_trend_insights(self, trends):
	        """Describe trends whose absolute percent change exceeds 5%.

	        Args:
	            trends: List of dicts with ``column``, ``direction`` and
	                ``percent_change`` entries.

	        Returns:
	            One string per reported trend; changes above 20% are worded as
	            significant, changes of 5% or less are omitted.
	        """
	        insights = []

	        for trend in trends:
	            direction = "increased" if trend['direction'] == 'increasing' else "decreased"
	            change_abs = abs(trend['percent_change'])

	            if change_abs > 20:
	                insights.append(f" {trend['column']} has {direction} significantly by {change_abs:.1f}% over time.")
	            elif change_abs > 5:
	                insights.append(f" {trend['column']} has {direction} by {change_abs:.1f}% over the period.")

	        return insights

	    def _generate_group_insights(self, group_analysis):
	        """Name the top category for each categorical/numeric column pair.

	        Args:
	            group_analysis: Nested mapping
	                ``{categorical_col: {numeric_col: {'top_category', 'top_value'}}}``.

	        Returns:
	            At most three insight strings; entries with a falsy
	            ``top_category`` are skipped.
	        """
	        insights = []

	        for cat_col, per_numeric in group_analysis.items():
	            for num_col, result in per_numeric.items():
	                if result['top_category']:
	                    insights.append(f" {result['top_category']} is the top performer in {cat_col} for {num_col} with {result['top_value']:.2f}.")

	        return insights[:3]

	    def _generate_outlier_insights(self, outliers):
	        """Report columns with a small but non-zero share of outliers.

	        Args:
	            outliers: Mapping of column name to a dict with ``count`` and
	                ``percentage`` entries.

	        Returns:
	            One string per column whose outlier percentage is above 0 and
	            below 5.
	        """
	        insights = []

	        for col, data in outliers.items():
	            # Columns without any outliers have nothing to investigate.
	            if 0 < data['percentage'] < 5:
	                insights.append(f" {col} contains {data['count']} outliers ({data['percentage']:.1f}% of data). These might be worth investigating.")

	        return insights

	    def _generate_distribution_insights(self, distributions):
	        """Report columns whose shape is not ``'approximately normal'``.

	        Args:
	            distributions: Mapping of column name to a dict with ``shape``
	                and ``skewness`` entries.

	        Returns:
	            At most two insight strings.
	        """
	        insights = []

	        for col, dist in distributions.items():
	            if dist['shape'] != 'approximately normal':
	                insights.append(f" {col} has a {dist['shape']} distribution (skewness: {dist['skewness']:.2f}).")

	        return insights[:2]

	    def _generate_recommendations(self, analysis):
	        """Derive recommendations from the first correlation and the trends.

	        Args:
	            analysis: Analysis dict; only ``correlations`` and ``trends``
	                are read, and both may be absent.

	        Returns:
	            A non-empty list: specific recommendations when any apply,
	            otherwise a single "no urgent issues" status line.
	        """
	        recommendations = []

	        # Opportunity: only the first listed correlation is considered.
	        correlations = analysis.get('correlations') or []
	        if correlations:
	            strong_corr = correlations[0]
	            if strong_corr['strength'] == 'positive':
	                recommendations.append(f" Recommendation: Focus on increasing {strong_corr['col1']} to potentially boost {strong_corr['col2']}.")

	        # Risk: flag only the first trend that declined by more than 10%.
	        for trend in analysis.get('trends') or []:
	            if trend['direction'] == 'decreasing' and abs(trend['percent_change']) > 10:
	                recommendations.append(f" Action Required: {trend['column']} is declining. Consider investigating causes.")
	                break

	        if not recommendations:
	            recommendations.append(" Status: No urgent issues detected. Continue monitoring key metrics.")

	        return recommendations

	    def generate_openai_insights(self, df_summary, analysis):
	        """Generate insights with OpenAI, falling back to templates.

	        Args:
	            df_summary: Dict with ``rows`` and ``columns`` (both used as
	                counts) and ``column_names``.
	            analysis: Analysis dict (see the class docstring for its keys).

	        Returns:
	            A one-element list holding the model reply, or the template
	            insights when OpenAI mode is off or the request fails.
	        """
	        if not self.use_openai:
	            return self._build_template_insights(
	                df_summary['rows'], df_summary['columns'], analysis
	            )

	        # The prompt body is kept flush with the margin on purpose: leading
	        # whitespace inside the literal is part of the text that is sent.
	        prompt = f"""
	You are a data analyst. Analyze this dataset and provide key business insights:

	Dataset: {df_summary['rows']} rows, {df_summary['columns']} columns
	Columns: {df_summary['column_names']}

	Key Statistics: {analysis.get('descriptive_stats', {})}
	Correlations: {analysis.get('correlations', [])}
	Trends: {analysis.get('trends', [])}

	Provide:
	1. Top 3 key findings
	2. One actionable recommendation
	3. One question the user should explore further

	Keep it concise and business-friendly.
	"""

	        try:
	            # NOTE(review): ChatCompletion.create looks like the pre-1.0
	            # openai interface - confirm against the pinned package version.
	            response = self.openai.ChatCompletion.create(
	                model="gpt-3.5-turbo",
	                messages=[{"role": "user", "content": prompt}],
	                max_tokens=300
	            )
	            return [response.choices[0].message.content]
	        except Exception as e:
	            # Best-effort boundary: any API failure degrades to templates
	            # built from the summary counts instead of propagating.
	            print(f"OpenAI error: {e}")
	            return self._build_template_insights(
	                df_summary['rows'], df_summary['columns'], analysis
	            )