Spaces:

mohsinbhatti
/

insightgenai

Sleeping

App Files Files Community

insightgenai / modules /report_generator.py

mohsinbhatti

Initial commit - InsightGenAI files

e478478 about 2 months ago

raw

history blame contribute delete

17.6 kB

	"""
	Report Generator Module - InsightGenAI
	======================================
	Professional PDF report generation with dataset overview,
	model performance metrics, SHAP summaries, and key insights.

	Author: InsightGenAI Team
	Version: 1.0.0
	"""

	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import io
	from typing import Dict, List, Tuple, Optional, Any, Union
	from datetime import datetime
	import warnings
	warnings.filterwarnings('ignore')

	# Import Streamlit for display functions
	import streamlit as st

	# ReportLab imports
	from reportlab.lib import colors
	from reportlab.lib.pagesizes import letter, A4
	from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
	from reportlab.lib.units import inch
	from reportlab.platypus import (
	SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle,
	PageBreak, Image, ListFlowable, ListItem
	)
	from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_JUSTIFY


	class ReportGenerator:
	    """
	    Professional PDF report generator for data analysis results.

	    Sections are queued with the ``add_*`` methods and rendered, in the
	    order they were added, by :meth:`generate_pdf`. Every generated
	    document starts with a title block and a fixed executive summary and
	    ends with a footer line.

	    Attributes:
	        title: Report title shown at the top of the document.
	        sections: Queued sections; each entry is a dict with the keys
	            ``'title'``, ``'content'`` and ``'type'``.
	        styles: ReportLab sample stylesheet extended with the custom
	            paragraph styles registered in ``_setup_custom_styles``.
	    """

	    def __init__(self, title: str = "InsightGenAI Analysis Report"):
	        """
	        Initialize the Report Generator.

	        Args:
	            title: Report title
	        """
	        self.title = title
	        self.sections: List[Dict] = []
	        self.styles = getSampleStyleSheet()
	        self._setup_custom_styles()

	    def _setup_custom_styles(self) -> None:
	        """Register the custom title, heading and body paragraph styles."""
	        self.styles.add(ParagraphStyle(
	            name='CustomTitle',
	            parent=self.styles['Heading1'],
	            fontSize=24,
	            textColor=colors.HexColor('#1f77b4'),
	            spaceAfter=30,
	            alignment=TA_CENTER
	        ))

	        self.styles.add(ParagraphStyle(
	            name='CustomHeading2',
	            parent=self.styles['Heading2'],
	            fontSize=16,
	            textColor=colors.HexColor('#2c3e50'),
	            spaceAfter=12,
	            spaceBefore=12
	        ))

	        self.styles.add(ParagraphStyle(
	            name='CustomHeading3',
	            parent=self.styles['Heading3'],
	            fontSize=13,
	            textColor=colors.HexColor('#34495e'),
	            spaceAfter=10,
	            spaceBefore=10
	        ))

	        self.styles.add(ParagraphStyle(
	            name='CustomBody',
	            parent=self.styles['BodyText'],
	            fontSize=10,
	            alignment=TA_JUSTIFY,
	            spaceAfter=10
	        ))

	    @staticmethod
	    def _escape(value: Any) -> str:
	        """Return ``value`` as text that is safe inside Paragraph markup.

	        Paragraph text is parsed as XML-like markup, so a raw ``&`` or
	        ``<`` coming from data (column names, class labels, ...) must be
	        escaped before it is interpolated.
	        """
	        # Imported locally so the class does not depend on a new
	        # module-level import.
	        from xml.sax.saxutils import escape
	        return escape(str(value))

	    @staticmethod
	    def _format_number(value: Any) -> str:
	        """Format ``value`` with thousands separators when it is numeric.

	        Non-numeric values (for example the ``'N/A'`` placeholder or
	        ``None``) are returned as plain text instead of raising.
	        """
	        try:
	            return format(value, ',')
	        except (TypeError, ValueError):
	            return str(value)

	    @staticmethod
	    def _frame_rows(frame: pd.DataFrame) -> List[List[str]]:
	        """Return every row of ``frame`` as a list of stringified cells."""
	        return [[str(v) for v in row.values] for _, row in frame.iterrows()]

	    def add_section(self, title: str, content: Any, section_type: str = 'text') -> None:
	        """
	        Add a section to the report.

	        Args:
	            title: Section title
	            content: Section content; its expected shape depends on
	                ``section_type`` (markup string for 'text', list of rows
	                for 'table', dict with 'buffer'/'width'/'height' for
	                'image', mapping for 'metrics')
	            section_type: Type of section ('text', 'table', 'image', 'metrics')
	        """
	        self.sections.append({
	            'title': title,
	            'content': content,
	            'type': section_type
	        })

	    def add_dataset_overview(self, df: pd.DataFrame, column_types: Dict[str, str]) -> None:
	        """
	        Add dataset overview section.

	        Queues two table sections: 'Dataset Overview' with summary
	        metrics and 'Column Details' with one row per column.

	        Args:
	            df: Dataset
	            column_types: Dictionary of column types, keyed by column name
	        """
	        detected = list(column_types.values())
	        memory_mb = df.memory_usage(deep=True).sum() / (1024 * 1024)

	        # Overview metrics
	        overview_data = [
	            ['Metric', 'Value'],
	            ['Total Rows', f"{len(df):,}"],
	            ['Total Columns', str(len(df.columns))],
	            ['Numeric Columns', str(detected.count('numeric'))],
	            ['Categorical Columns', str(detected.count('categorical'))],
	            ['Text Columns', str(detected.count('text'))],
	            # Counts use the same thousands-separated format as 'Total Rows'.
	            ['Missing Values', f"{int(df.isnull().sum().sum()):,}"],
	            ['Duplicate Rows', f"{int(df.duplicated().sum()):,}"],
	            ['Memory Usage (MB)', f"{memory_mb:.2f}"]
	        ]
	        self.add_section('Dataset Overview', overview_data, 'table')

	        # Column types table
	        column_data = [['Column Name', 'Data Type', 'Detected Type', 'Non-Null Count', 'Null Count']]
	        for col in df.columns:
	            series = df[col]
	            column_data.append([
	                str(col),
	                str(series.dtype),
	                column_types.get(col, 'unknown'),
	                f"{series.count():,}",
	                f"{series.isnull().sum():,}"
	            ])
	        self.add_section('Column Details', column_data, 'table')

	    def add_eda_summary(self, eda_stats: Dict[str, pd.DataFrame]) -> None:
	        """
	        Add EDA summary section.

	        Args:
	            eda_stats: Dictionary of EDA statistics. The 'numeric' and
	                'categorical' entries, when present, are each rendered
	                as a table; other keys are ignored.
	        """
	        if 'numeric' in eda_stats:
	            # reset_index() turns the index into the first column, which
	            # is labelled 'Statistic' in the header row.
	            numeric_df = eda_stats['numeric'].reset_index()
	            numeric_data = [['Statistic'] + numeric_df.columns[1:].tolist()]
	            numeric_data.extend(self._frame_rows(numeric_df))
	            self.add_section('Numeric Columns Statistics', numeric_data, 'table')

	        if 'categorical' in eda_stats:
	            cat_df = eda_stats['categorical'].reset_index()
	            cat_data = [['Column'] + cat_df.columns[1:].tolist()]
	            cat_data.extend(self._frame_rows(cat_df.head(20)))  # Limit to 20 rows
	            self.add_section('Categorical Columns Summary', cat_data, 'table')

	    def add_model_results(self, model_info: Dict, comparison_df: pd.DataFrame) -> None:
	        """
	        Add model results section.

	        Missing ``model_info`` keys are shown as 'N/A'. Sample counts are
	        formatted with thousands separators only when they are numeric.

	        Args:
	            model_info: Dictionary with model information
	            comparison_df: Model comparison dataframe; ``None`` or an
	                empty frame skips the comparison table
	        """
	        models_trained = ', '.join(str(m) for m in model_info.get('models_trained') or [])
	        fields = [
	            ('Task Type', str(model_info.get('task_type', 'N/A')).title()),
	            ('Best Model', model_info.get('best_model', 'N/A')),
	            ('Models Trained', models_trained),
	            ('Feature Count', model_info.get('feature_count', 'N/A')),
	            ('Training Samples', self._format_number(model_info.get('training_samples', 'N/A'))),
	            ('Test Samples', self._format_number(model_info.get('test_samples', 'N/A'))),
	        ]
	        # Values are escaped because they may contain markup characters;
	        # the <b>/<br/> tags themselves are intentional markup.
	        info_text = '<br/>'.join(
	            f"<b>{label}:</b> {self._escape(value)}" for label, value in fields
	        )
	        self.add_section('Model Training Information', info_text, 'text')

	        # Comparison table
	        if comparison_df is not None and not comparison_df.empty:
	            comp_data = [comparison_df.columns.tolist()]
	            comp_data.extend(self._frame_rows(comparison_df))
	            self.add_section('Model Comparison', comp_data, 'table')

	    def add_feature_importance(self, importance_df: pd.DataFrame,
	                               top_n: int = 15) -> None:
	        """
	        Add feature importance section.

	        Nothing is added when ``importance_df`` is ``None`` or empty.

	        Args:
	            importance_df: Feature importance dataframe; the 'feature'
	                and 'importance' columns are read
	            top_n: Number of top features to include (the first
	                ``top_n`` rows are taken as-is, no sorting is done here)
	        """
	        if importance_df is None or importance_df.empty:
	            return

	        top_features = importance_df.head(top_n)

	        imp_data = [['Rank', 'Feature', 'Importance']]
	        for rank, (_, row) in enumerate(top_features.iterrows(), 1):
	            imp_data.append([str(rank), row['feature'], f"{row['importance']:.6f}"])

	        self.add_section(f'Top {top_n} Feature Importance', imp_data, 'table')

	    def add_insights(self, insights: Dict[str, Any]) -> None:
	        """
	        Add key insights section.

	        Only the recognised keys are rendered; list-like insights are
	        limited to their first five entries. No section is added when
	        none of the recognised keys produced any text.

	        Args:
	            insights: Dictionary of insights
	        """
	        esc = self._escape
	        parts: List[str] = []

	        if 'dataset_shape' in insights:
	            rows, cols = insights['dataset_shape'][0], insights['dataset_shape'][1]
	            parts.append(f"<b>Dataset Shape:</b> {rows:,} rows × {cols} columns<br/><br/>")

	        if 'missing_percentage' in insights:
	            parts.append(f"<b>Missing Values:</b> {insights['missing_percentage']:.2f}% of all values<br/><br/>")

	        if 'duplicate_rows' in insights:
	            parts.append(f"<b>Duplicate Rows:</b> {insights['duplicate_rows']:,}<br/><br/>")

	        if insights.get('high_correlation_pairs'):
	            parts.append("<b>High Correlation Pairs (>0.8):</b><br/>")
	            for pair in insights['high_correlation_pairs'][:5]:
	                parts.append(
	                    f" • {esc(pair['feature1'])} & {esc(pair['feature2'])}: "
	                    f"{pair['correlation']:.3f}<br/>"
	                )
	            parts.append("<br/>")

	        if insights.get('highly_skewed_features'):
	            parts.append("<b>Highly Skewed Features:</b><br/>")
	            for feat in insights['highly_skewed_features'][:5]:
	                parts.append(f" • {esc(feat['feature'])}: skewness = {feat['skewness']:.2f}<br/>")
	            parts.append("<br/>")

	        if 'class_balance' in insights:
	            parts.append("<b>Class Distribution:</b><br/>")
	            for cls, prop in list(insights['class_balance'].items())[:5]:
	                parts.append(f" • {esc(cls)}: {prop*100:.1f}%<br/>")
	            if insights.get('is_imbalanced'):
	                parts.append("<br/><i>Note: Dataset appears to be imbalanced.</i>")

	        if not parts:
	            # Avoid a heading with nothing underneath it.
	            return

	        self.add_section('Key Insights', ''.join(parts), 'text')

	    def add_image(self, title: str, image_buffer: io.BytesIO,
	                  width: float = 6 * inch, height: float = 4 * inch) -> None:
	        """
	        Add an image to the report.

	        Args:
	            title: Image section title
	            image_buffer: BytesIO buffer containing the image
	            width: Image width (defaults to 6 inches)
	            height: Image height (defaults to 4 inches)
	        """
	        self.add_section(title, {'buffer': image_buffer, 'width': width, 'height': height}, 'image')

	    def _section_flowables(self, section: Dict) -> List[Any]:
	        """Return the flowables that render one queued section's content.

	        The section heading is not included. Unknown section types and
	        empty tables produce no flowables.
	        """
	        kind = section['type']
	        content = section['content']

	        if kind == 'text':
	            return [Paragraph(content, self.styles['CustomBody'])]

	        if kind == 'table':
	            if not content:
	                return []
	            # repeatRows=1 is passed so the header row is repeated when
	            # the table splits across pages.
	            table = Table(content, repeatRows=1)
	            table.setStyle(TableStyle([
	                ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2c3e50')),
	                ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
	                ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
	                ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
	                ('FONTSIZE', (0, 0), (-1, 0), 10),
	                ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
	                ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
	                ('GRID', (0, 0), (-1, -1), 1, colors.black),
	                ('FONTSIZE', (0, 1), (-1, -1), 9),
	                ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.lightgrey])
	            ]))
	            return [table]

	        if kind == 'image':
	            return [Image(content['buffer'],
	                          width=content['width'],
	                          height=content['height'])]

	        if kind == 'metrics':
	            metrics_data = [['Metric', 'Value']]
	            metrics_data.extend([key, str(value)] for key, value in content.items())
	            table = Table(metrics_data)
	            table.setStyle(TableStyle([
	                ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#3498db')),
	                ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
	                ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
	                ('GRID', (0, 0), (-1, -1), 1, colors.black)
	            ]))
	            return [table]

	        return []

	    def generate_pdf(self, output_path: Optional[str] = None) -> Union[str, io.BytesIO]:
	        """
	        Generate the PDF report.

	        Args:
	            output_path: Path to save the PDF (if None or empty, the PDF
	                is written to an in-memory buffer instead)

	        Returns:
	            ``output_path`` when one was given, otherwise a BytesIO
	            buffer rewound to position 0
	        """
	        buffer = None if output_path else io.BytesIO()
	        doc = SimpleDocTemplate(
	            output_path if output_path else buffer,
	            pagesize=A4,
	            rightMargin=72,
	            leftMargin=72,
	            topMargin=72,
	            bottomMargin=18
	        )

	        body_style = self.styles['CustomBody']
	        heading_style = self.styles['CustomHeading2']

	        # Title block and executive summary
	        story = [
	            Paragraph(self.title, self.styles['CustomTitle']),
	            Spacer(1, 0.3 * inch),
	            Paragraph(
	                f"Generated by InsightGenAI on {datetime.now().strftime('%Y-%m-%d %H:%M')}",
	                body_style
	            ),
	            Spacer(1, 0.5 * inch),
	            Paragraph("Executive Summary", heading_style),
	            Paragraph(
	                "This report provides a comprehensive analysis of your dataset, "
	                "including exploratory data analysis, automated machine learning results, "
	                "model performance metrics, and key insights derived from the data.",
	                body_style
	            ),
	            PageBreak(),
	        ]

	        # Queued sections, each followed by a spacer
	        for section in self.sections:
	            story.append(Paragraph(section['title'], heading_style))
	            story.extend(self._section_flowables(section))
	            story.append(Spacer(1, 0.3 * inch))

	        # Footer
	        story.append(Spacer(1, 0.5 * inch))
	        story.append(Paragraph(
	            "---<br/>Report generated by InsightGenAI - LLM Powered AutoML Data Analyst",
	            body_style
	        ))

	        # Build PDF
	        doc.build(story)

	        if output_path:
	            return output_path
	        buffer.seek(0)
	        return buffer

	    def save_figure_to_buffer(self, fig: plt.Figure) -> io.BytesIO:
	        """
	        Save matplotlib figure to BytesIO buffer.

	        The figure is written as PNG at 150 dpi with a tight bounding box.

	        Args:
	            fig: Matplotlib figure

	        Returns:
	            BytesIO buffer with image data, rewound to position 0
	        """
	        buffer = io.BytesIO()
	        fig.savefig(buffer, format='png', dpi=150, bbox_inches='tight')
	        buffer.seek(0)
	        return buffer


	def generate_full_report(data_loader, eda_engine, automl_engine,
	                         explainer, insights: Dict) -> io.BytesIO:
	    """
	    Build the complete analysis report and return it as an in-memory PDF.

	    Each part is optional: a part is skipped when its source object is
	    missing or falsy, so the report contains only what is available.

	    Args:
	        data_loader: DataLoader instance; its ``df`` and ``column_types``
	            attributes feed the dataset overview
	        eda_engine: EDAEngine instance providing summary statistics
	        automl_engine: AutoMLEngine instance providing model info, the
	            comparison table and feature importance
	        explainer: Explainability engine instance (accepted but not used
	            by this function)
	        insights: Dictionary of insights

	    Returns:
	        BytesIO buffer containing the PDF
	    """
	    pdf_report = ReportGenerator("InsightGenAI - Data Analysis Report")

	    # Dataset overview: only when a dataframe has been loaded.
	    if data_loader.df is not None:
	        pdf_report.add_dataset_overview(data_loader.df, data_loader.column_types)

	    # EDA summary
	    if eda_engine:
	        pdf_report.add_eda_summary(eda_engine.get_summary_statistics())

	    # Model results and feature importance both require trained models.
	    if automl_engine and automl_engine.results:
	        model_info = automl_engine.get_model_info()
	        comparison_df = automl_engine.get_comparison_table()
	        pdf_report.add_model_results(model_info, comparison_df)

	        pdf_report.add_feature_importance(automl_engine.get_feature_importance())

	    # Insights
	    if insights:
	        pdf_report.add_insights(insights)

	    # No output path is passed, so the PDF comes back as a buffer.
	    return pdf_report.generate_pdf()


	# Streamlit display functions
	def display_report_download(data_loader, eda_engine, automl_engine,
	                            explainer, insights: Dict):
	    """Render the Streamlit controls for generating and downloading the PDF.

	    A "Generate PDF Report" button is shown; when it is pressed the
	    report is built and offered through a download button. Any exception
	    raised while building the report is shown with ``st.error`` instead
	    of propagating.

	    Args:
	        data_loader: DataLoader instance
	        eda_engine: EDAEngine instance
	        automl_engine: AutoMLEngine instance
	        explainer: Explainability engine instance
	        insights: Dictionary of insights
	    """
	    st.subheader("📄 Generate Report")

	    # Nothing more to do until the user asks for a report.
	    if not st.button("Generate PDF Report", type="primary"):
	        return

	    with st.spinner("Generating report... This may take a moment."):
	        try:
	            pdf_buffer = generate_full_report(
	                data_loader, eda_engine, automl_engine,
	                explainer, insights
	            )

	            # File name carries a minute-resolution timestamp.
	            timestamp = datetime.now().strftime('%Y%m%d_%H%M')
	            st.download_button(
	                label="📥 Download PDF Report",
	                data=pdf_buffer,
	                file_name=f"insightgenai_report_{timestamp}.pdf",
	                mime="application/pdf"
	            )

	            st.success("Report generated successfully!")
	        except Exception as e:
	            # UI boundary: report the failure to the user.
	            message = str(e)
	            st.error(f"Error generating report: {message}")