# File: insightgenai/modules/report_generator.py
# Uploaded by: mohsinbhatti
# Commit e478478: "Initial commit - InsightGenAI files"
"""
Report Generator Module - InsightGenAI
======================================
Professional PDF report generation with dataset overview,
model performance metrics, SHAP summaries, and key insights.
Author: InsightGenAI Team
Version: 1.0.0
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
from typing import Dict, List, Tuple, Optional, Any, Union
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
# Import Streamlit for display functions
import streamlit as st
# ReportLab imports
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter, A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.platypus import (
SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle,
PageBreak, Image, ListFlowable, ListItem
)
from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_JUSTIFY
class ReportGenerator:
    """
    Professional PDF report generator for data analysis results.

    Sections are queued with the ``add_*`` methods and rendered, in the order
    they were added, by :meth:`generate_pdf`.

    Generates comprehensive reports including:
    - Executive summary
    - Dataset overview
    - EDA results
    - Model performance
    - Feature importance
    - Key insights
    """

    def __init__(self, title: str = "InsightGenAI Analysis Report"):
        """
        Initialize the Report Generator.

        Args:
            title: Report title
        """
        self.title = title
        # Each entry is {'title': str, 'content': Any, 'type': str}.
        self.sections: List[Dict] = []
        self.styles = getSampleStyleSheet()
        self._setup_custom_styles()

    def _setup_custom_styles(self) -> None:
        """Register the custom paragraph styles used by the report."""
        self.styles.add(ParagraphStyle(
            name='CustomTitle',
            parent=self.styles['Heading1'],
            fontSize=24,
            textColor=colors.HexColor('#1f77b4'),
            spaceAfter=30,
            alignment=TA_CENTER
        ))
        self.styles.add(ParagraphStyle(
            name='CustomHeading2',
            parent=self.styles['Heading2'],
            fontSize=16,
            textColor=colors.HexColor('#2c3e50'),
            spaceAfter=12,
            spaceBefore=12
        ))
        self.styles.add(ParagraphStyle(
            name='CustomHeading3',
            parent=self.styles['Heading3'],
            fontSize=13,
            textColor=colors.HexColor('#34495e'),
            spaceAfter=10,
            spaceBefore=10
        ))
        self.styles.add(ParagraphStyle(
            name='CustomBody',
            parent=self.styles['BodyText'],
            fontSize=10,
            alignment=TA_JUSTIFY,
            spaceAfter=10
        ))

    # ------------------------------------------------------------------
    # Small formatting helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _escape(value: Any) -> str:
        """
        Return ``value`` as text that is safe inside Paragraph markup.

        Text sections are rendered through ``Paragraph``, which parses an
        XML-like markup; data-derived values (column names, class labels, ...)
        must therefore have ``&``, ``<`` and ``>`` escaped.
        """
        from xml.sax.saxutils import escape
        return escape(str(value))

    @staticmethod
    def _format_count(value: Any) -> str:
        """
        Format a count with thousands separators.

        Numbers are rendered with the ``,`` format option; ``None`` becomes
        ``'N/A'``; anything else (e.g. an ``'N/A'`` placeholder string) is
        passed through as escaped text, because the ``,`` option is only valid
        for numeric types.
        """
        if value is None:
            return 'N/A'
        if isinstance(value, (int, float, np.integer, np.floating)):
            return f"{value:,}"
        return ReportGenerator._escape(value)

    @staticmethod
    def _frame_to_table(frame: pd.DataFrame, index_label: Optional[str] = None,
                        max_rows: Optional[int] = None) -> List[List[str]]:
        """
        Convert a dataframe into a list-of-rows table of strings.

        Args:
            frame: Dataframe to convert
            index_label: If given, the index is moved into the first column
                and that column's header is replaced by this label
            max_rows: If given, only the first ``max_rows`` rows are included

        Returns:
            Header row followed by one row of strings per dataframe row
        """
        if index_label is not None:
            frame = frame.reset_index()
            header = [index_label] + [str(c) for c in frame.columns[1:]]
        else:
            header = [str(c) for c in frame.columns]
        if max_rows is not None:
            frame = frame.head(max_rows)
        # itertuples keeps each column's own type; iterrows would upcast
        # integers to floats whenever a row mixes ints and floats.
        rows = [[str(v) for v in record]
                for record in frame.itertuples(index=False, name=None)]
        return [header] + rows

    # ------------------------------------------------------------------
    # Section builders
    # ------------------------------------------------------------------
    def add_section(self, title: str, content: Any, section_type: str = 'text') -> None:
        """
        Add a section to the report.

        Args:
            title: Section title
            content: Section content
            section_type: Type of section ('text', 'table', 'image', 'metrics')
        """
        self.sections.append({
            'title': title,
            'content': content,
            'type': section_type
        })

    def add_dataset_overview(self, df: pd.DataFrame, column_types: Dict[str, str]) -> None:
        """
        Add dataset overview section.

        Adds two table sections: 'Dataset Overview' (global metrics) and
        'Column Details' (one row per column).

        Args:
            df: Dataset
            column_types: Dictionary of column types
        """
        column_types = column_types or {}
        detected = list(column_types.values())
        # Computed once and reused for both tables.
        null_counts = df.isnull().sum()
        total_rows = len(df)
        memory_mb = df.memory_usage(deep=True).sum() / (1024 * 1024)

        # Overview metrics (every cell is a string so the table renders
        # uniformly regardless of numpy scalar types).
        overview_data = [
            ['Metric', 'Value'],
            ['Total Rows', f"{total_rows:,}"],
            ['Total Columns', str(len(df.columns))],
            ['Numeric Columns', str(detected.count('numeric'))],
            ['Categorical Columns', str(detected.count('categorical'))],
            ['Text Columns', str(detected.count('text'))],
            ['Missing Values', f"{int(null_counts.sum()):,}"],
            ['Duplicate Rows', f"{int(df.duplicated().sum()):,}"],
            ['Memory Usage (MB)', f"{memory_mb:.2f}"]
        ]
        self.add_section('Dataset Overview', overview_data, 'table')

        # Column types table. Iterating positionally (instead of df[col])
        # also works when the frame has duplicate column names.
        column_data = [['Column Name', 'Data Type', 'Detected Type', 'Non-Null Count', 'Null Count']]
        for name, dtype, n_null in zip(df.columns, df.dtypes, null_counts):
            n_null = int(n_null)
            column_data.append([
                str(name),
                str(dtype),
                column_types.get(name, 'unknown'),
                f"{total_rows - n_null:,}",
                f"{n_null:,}"
            ])
        self.add_section('Column Details', column_data, 'table')

    def add_eda_summary(self, eda_stats: Dict[str, pd.DataFrame]) -> None:
        """
        Add EDA summary section.

        Args:
            eda_stats: Dictionary of EDA statistics; the 'numeric' and
                'categorical' entries, when present, each become a table
        """
        if not eda_stats:
            return

        numeric_stats = eda_stats.get('numeric')
        if numeric_stats is not None:
            self.add_section(
                'Numeric Columns Statistics',
                self._frame_to_table(numeric_stats, index_label='Statistic'),
                'table'
            )

        categorical_stats = eda_stats.get('categorical')
        if categorical_stats is not None:
            self.add_section(
                'Categorical Columns Summary',
                # Limit to 20 rows
                self._frame_to_table(categorical_stats, index_label='Column', max_rows=20),
                'table'
            )

    def add_model_results(self, model_info: Dict, comparison_df: pd.DataFrame) -> None:
        """
        Add model results section.

        Args:
            model_info: Dictionary with model information; missing keys are
                reported as 'N/A'
            comparison_df: Model comparison dataframe (may be None or empty,
                in which case no comparison table is added)
        """
        esc = self._escape
        task_type = str(model_info.get('task_type') or 'N/A').title()
        models_trained = ', '.join(str(m) for m in (model_info.get('models_trained') or []))

        # Sample counts go through _format_count: applying the ',' format
        # option directly to the 'N/A' fallback string raises ValueError.
        info_lines = [
            f"<b>Task Type:</b> {esc(task_type)}",
            f"<b>Best Model:</b> {esc(model_info.get('best_model', 'N/A'))}",
            f"<b>Models Trained:</b> {esc(models_trained)}",
            f"<b>Feature Count:</b> {esc(model_info.get('feature_count', 'N/A'))}",
            f"<b>Training Samples:</b> {self._format_count(model_info.get('training_samples', 'N/A'))}",
            f"<b>Test Samples:</b> {self._format_count(model_info.get('test_samples', 'N/A'))}",
        ]
        self.add_section('Model Training Information', "<br/>".join(info_lines), 'text')

        # Comparison table
        if comparison_df is not None and not comparison_df.empty:
            self.add_section('Model Comparison', self._frame_to_table(comparison_df), 'table')

    def add_feature_importance(self, importance_df: pd.DataFrame,
                               top_n: int = 15) -> None:
        """
        Add feature importance section.

        Args:
            importance_df: Feature importance dataframe with 'feature' and
                'importance' columns; nothing is added if None or empty
            top_n: Number of top features to include
        """
        if importance_df is None or importance_df.empty:
            return

        top_features = importance_df.head(top_n)
        imp_data = [['Rank', 'Feature', 'Importance']]
        ranked = enumerate(zip(top_features['feature'], top_features['importance']), 1)
        for rank, (feature, importance) in ranked:
            imp_data.append([str(rank), str(feature), f"{importance:.6f}"])
        self.add_section(f'Top {top_n} Feature Importance', imp_data, 'table')

    def add_insights(self, insights: Dict[str, Any]) -> None:
        """
        Add key insights section.

        Only the keys present in ``insights`` are rendered. Recognised keys:
        'dataset_shape', 'missing_percentage', 'duplicate_rows',
        'high_correlation_pairs', 'highly_skewed_features', 'class_balance'
        and 'is_imbalanced'. List-like entries are capped at five items.

        Args:
            insights: Dictionary of insights
        """
        esc = self._escape
        parts: List[str] = []

        if 'dataset_shape' in insights:
            shape = insights['dataset_shape']
            parts.append(f"<b>Dataset Shape:</b> {shape[0]:,} rows × {shape[1]} columns<br/><br/>")
        if 'missing_percentage' in insights:
            parts.append(f"<b>Missing Values:</b> {insights['missing_percentage']:.2f}% of all values<br/><br/>")
        if 'duplicate_rows' in insights:
            parts.append(f"<b>Duplicate Rows:</b> {insights['duplicate_rows']:,}<br/><br/>")

        correlated = insights.get('high_correlation_pairs')
        if correlated:
            parts.append("<b>High Correlation Pairs (>0.8):</b><br/>")
            for pair in correlated[:5]:
                # Feature names come from the dataset, so escape them; the
                # literal ampersand must be an entity in Paragraph markup.
                parts.append(
                    f" • {esc(pair['feature1'])} &amp; {esc(pair['feature2'])}: "
                    f"{pair['correlation']:.3f}<br/>"
                )
            parts.append("<br/>")

        skewed = insights.get('highly_skewed_features')
        if skewed:
            parts.append("<b>Highly Skewed Features:</b><br/>")
            for feat in skewed[:5]:
                parts.append(f" • {esc(feat['feature'])}: skewness = {feat['skewness']:.2f}<br/>")
            parts.append("<br/>")

        class_balance = insights.get('class_balance')
        if class_balance:
            parts.append("<b>Class Distribution:</b><br/>")
            for cls, prop in list(class_balance.items())[:5]:
                parts.append(f" • {esc(cls)}: {prop*100:.1f}%<br/>")

        if insights.get('is_imbalanced'):
            parts.append("<br/><i>Note: Dataset appears to be imbalanced.</i>")

        self.add_section('Key Insights', "".join(parts), 'text')

    def add_image(self, title: str, image_buffer: io.BytesIO,
                  width: float = 6*inch, height: float = 4*inch) -> None:
        """
        Add an image to the report.

        Args:
            title: Image section title
            image_buffer: BytesIO buffer containing the image
            width: Image width
            height: Image height
        """
        self.add_section(title, {'buffer': image_buffer, 'width': width, 'height': height}, 'image')

    # ------------------------------------------------------------------
    # Rendering
    # ------------------------------------------------------------------
    @staticmethod
    def _data_table(table_data: List[List[Any]]) -> Table:
        """Build the styled table used for 'table' sections (header repeats on page breaks)."""
        table = Table(table_data, repeatRows=1)
        table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2c3e50')),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, 0), 10),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
            ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
            ('GRID', (0, 0), (-1, -1), 1, colors.black),
            ('FONTSIZE', (0, 1), (-1, -1), 9),
            ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.lightgrey])
        ]))
        return table

    @staticmethod
    def _metrics_table(metrics: Dict[str, Any]) -> Table:
        """Build the two-column Metric/Value table used for 'metrics' sections."""
        metrics_data = [['Metric', 'Value']]
        metrics_data.extend([key, str(value)] for key, value in metrics.items())
        table = Table(metrics_data)
        table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#3498db')),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
            ('GRID', (0, 0), (-1, -1), 1, colors.black)
        ]))
        return table

    def _section_flowables(self, section: Dict) -> List[Any]:
        """
        Return the flowables for one section's content (title excluded).

        Unknown section types, and 'table' sections with no rows, contribute
        nothing.
        """
        kind = section['type']
        content = section['content']

        if kind == 'text':
            return [Paragraph(content, self.styles['CustomBody'])]
        if kind == 'table':
            return [self._data_table(content)] if content else []
        if kind == 'image':
            return [Image(content['buffer'],
                          width=content['width'],
                          height=content['height'])]
        if kind == 'metrics':
            return [self._metrics_table(content)]
        return []

    def _build_story(self) -> List[Any]:
        """Assemble the full list of flowables: title page, sections, footer."""
        body_style = self.styles['CustomBody']
        heading_style = self.styles['CustomHeading2']

        # Title page and executive summary
        story: List[Any] = [
            Paragraph(self.title, self.styles['CustomTitle']),
            Spacer(1, 0.3*inch),
            Paragraph(
                f"Generated by InsightGenAI on {datetime.now().strftime('%Y-%m-%d %H:%M')}",
                body_style
            ),
            Spacer(1, 0.5*inch),
            Paragraph("Executive Summary", heading_style),
            Paragraph(
                "This report provides a comprehensive analysis of your dataset, "
                "including exploratory data analysis, automated machine learning results, "
                "model performance metrics, and key insights derived from the data.",
                body_style
            ),
            PageBreak(),
        ]

        # Queued sections, each followed by a spacer
        for section in self.sections:
            story.append(Paragraph(section['title'], heading_style))
            story.extend(self._section_flowables(section))
            story.append(Spacer(1, 0.3*inch))

        # Footer
        story.append(Spacer(1, 0.5*inch))
        story.append(Paragraph(
            "---<br/>Report generated by InsightGenAI - LLM Powered AutoML Data Analyst",
            body_style
        ))
        return story

    def generate_pdf(self, output_path: Optional[str] = None) -> Union[str, io.BytesIO]:
        """
        Generate the PDF report.

        Args:
            output_path: Path to save the PDF (if None, returns BytesIO)

        Returns:
            Path to saved PDF or BytesIO buffer (rewound to the start)
        """
        buffer = None if output_path else io.BytesIO()
        doc = SimpleDocTemplate(
            output_path if output_path else buffer,
            pagesize=A4,
            rightMargin=72,
            leftMargin=72,
            topMargin=72,
            bottomMargin=18
        )

        # Build PDF
        doc.build(self._build_story())

        if buffer is None:
            return output_path
        buffer.seek(0)
        return buffer

    def save_figure_to_buffer(self, fig: plt.Figure) -> io.BytesIO:
        """
        Save matplotlib figure to BytesIO buffer.

        Args:
            fig: Matplotlib figure

        Returns:
            BytesIO buffer with PNG image data, rewound to the start
        """
        buffer = io.BytesIO()
        fig.savefig(buffer, format='png', dpi=150, bbox_inches='tight')
        buffer.seek(0)
        return buffer
def generate_full_report(data_loader, eda_engine, automl_engine,
                         explainer, insights: Dict) -> io.BytesIO:
    """
    Assemble every available analysis section into one PDF report.

    Each part is optional: a section is only added when its source object is
    present (and, for the AutoML engine, has results).

    Args:
        data_loader: DataLoader instance (its ``df`` and ``column_types`` are read)
        eda_engine: EDAEngine instance
        automl_engine: AutoMLEngine instance
        explainer: Explainability engine instance (accepted but not used here)
        insights: Dictionary of insights

    Returns:
        BytesIO buffer containing the PDF
    """
    builder = ReportGenerator("InsightGenAI - Data Analysis Report")

    # Dataset overview
    if data_loader.df is not None:
        builder.add_dataset_overview(data_loader.df, data_loader.column_types)

    # EDA summary
    if eda_engine:
        summary_stats = eda_engine.get_summary_statistics()
        builder.add_eda_summary(summary_stats)

    # Model results followed by feature importance
    if automl_engine and automl_engine.results:
        info = automl_engine.get_model_info()
        comparison = automl_engine.get_comparison_table()
        builder.add_model_results(info, comparison)

        importances = automl_engine.get_feature_importance()
        builder.add_feature_importance(importances)

    # Insights
    if insights:
        builder.add_insights(insights)

    # No output path is given, so the PDF comes back as an in-memory buffer.
    return builder.generate_pdf()
# Streamlit display functions
def display_report_download(data_loader, eda_engine, automl_engine,
                            explainer, insights: Dict):
    """
    Display the report generation and download controls in Streamlit.

    The generated PDF is kept in ``st.session_state`` so that the download
    button is rendered on every rerun. A download button nested directly
    inside ``if st.button(...)`` disappears on the next rerun (including the
    rerun triggered by clicking the download button itself), because
    ``st.button`` only returns True for the single run right after the click.

    Args:
        data_loader: DataLoader instance
        eda_engine: EDAEngine instance
        automl_engine: AutoMLEngine instance
        explainer: Explainability engine instance
        insights: Dictionary of insights
    """
    state_key = "insightgenai_report_pdf"

    st.subheader("📄 Generate Report")

    if st.button("Generate PDF Report", type="primary"):
        with st.spinner("Generating report... This may take a moment."):
            try:
                pdf_buffer = generate_full_report(
                    data_loader, eda_engine, automl_engine,
                    explainer, insights
                )
            except Exception as e:
                # UI boundary: surface the failure to the user, and drop any
                # previously generated report so a stale file is not offered.
                if state_key in st.session_state:
                    del st.session_state[state_key]
                st.error(f"Error generating report: {str(e)}")
            else:
                # Store plain bytes plus the file name fixed at generation
                # time, so reruns serve exactly the same download.
                st.session_state[state_key] = {
                    'data': pdf_buffer.getvalue(),
                    'file_name': f"insightgenai_report_{datetime.now().strftime('%Y%m%d_%H%M')}.pdf",
                }
                st.success("Report generated successfully!")

    if state_key in st.session_state:
        report = st.session_state[state_key]
        st.download_button(
            label="📥 Download PDF Report",
            data=report['data'],
            file_name=report['file_name'],
            mime="application/pdf"
        )