Spaces:
Sleeping
Sleeping
| """ | |
| Report Generator Module - InsightGenAI | |
| ====================================== | |
| Professional PDF report generation with dataset overview, | |
| model performance metrics, SHAP summaries, and key insights. | |
| Author: InsightGenAI Team | |
| Version: 1.0.0 | |
| """ | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import io | |
| from typing import Dict, List, Tuple, Optional, Any, Union | |
| from datetime import datetime | |
| import warnings | |
| warnings.filterwarnings('ignore') | |
| # Import Streamlit for display functions | |
| import streamlit as st | |
| # ReportLab imports | |
| from reportlab.lib import colors | |
| from reportlab.lib.pagesizes import letter, A4 | |
| from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle | |
| from reportlab.lib.units import inch | |
| from reportlab.platypus import ( | |
| SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, | |
| PageBreak, Image, ListFlowable, ListItem | |
| ) | |
| from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_JUSTIFY | |
class ReportGenerator:
    """
    Professional PDF report generator for data analysis results.

    Sections are queued through the ``add_*`` methods and rendered, in the
    order they were added, by :meth:`generate_pdf`.

    Generates comprehensive reports including:
    - Executive summary
    - Dataset overview
    - EDA results
    - Model performance
    - Feature importance
    - Key insights
    """

    def __init__(self, title: str = "InsightGenAI Analysis Report"):
        """
        Initialize the Report Generator.

        Args:
            title: Report title
        """
        self.title = title
        # Each entry is a dict with the keys 'title', 'content' and 'type'.
        self.sections: List[Dict] = []
        self.styles = getSampleStyleSheet()
        self._setup_custom_styles()

    def _setup_custom_styles(self) -> None:
        """Register the custom paragraph styles used by the report."""
        self.styles.add(ParagraphStyle(
            name='CustomTitle',
            parent=self.styles['Heading1'],
            fontSize=24,
            textColor=colors.HexColor('#1f77b4'),
            spaceAfter=30,
            alignment=TA_CENTER
        ))
        self.styles.add(ParagraphStyle(
            name='CustomHeading2',
            parent=self.styles['Heading2'],
            fontSize=16,
            textColor=colors.HexColor('#2c3e50'),
            spaceAfter=12,
            spaceBefore=12
        ))
        self.styles.add(ParagraphStyle(
            name='CustomHeading3',
            parent=self.styles['Heading3'],
            fontSize=13,
            textColor=colors.HexColor('#34495e'),
            spaceAfter=10,
            spaceBefore=10
        ))
        self.styles.add(ParagraphStyle(
            name='CustomBody',
            parent=self.styles['BodyText'],
            fontSize=10,
            alignment=TA_JUSTIFY,
            spaceAfter=10
        ))

    @staticmethod
    def _format_count(value: Any) -> str:
        """
        Format a value with thousands separators when possible.

        Args:
            value: Usually a number; may be a placeholder such as 'N/A'

        Returns:
            ``f"{value:,}"`` when the value supports that format,
            otherwise ``str(value)``
        """
        try:
            return f"{value:,}"
        except (TypeError, ValueError):
            # Strings, None, etc. reject the ',' format specifier.
            return str(value)

    @staticmethod
    def _rows_as_strings(frame: pd.DataFrame) -> List[List[str]]:
        """
        Convert every cell of a dataframe to ``str``, row by row.

        Args:
            frame: Dataframe to convert

        Returns:
            One list of strings per dataframe row
        """
        return [[str(cell) for cell in row] for row in frame.values]

    def add_section(self, title: str, content: Any, section_type: str = 'text') -> None:
        """
        Add a section to the report.

        Args:
            title: Section title
            content: Section content
            section_type: Type of section ('text', 'table', 'image', 'metrics')
        """
        self.sections.append({
            'title': title,
            'content': content,
            'type': section_type
        })

    def add_dataset_overview(self, df: pd.DataFrame, column_types: Dict[str, str]) -> None:
        """
        Add dataset overview section.

        Queues two table sections: 'Dataset Overview' (summary metrics) and
        'Column Details' (one row per column).

        Args:
            df: Dataset
            column_types: Dictionary of column types
        """
        detected = list(column_types.values())

        # Overview metrics (first row is the table header)
        overview_data = [
            ['Metric', 'Value'],
            ['Total Rows', f"{len(df):,}"],
            ['Total Columns', len(df.columns)],
            ['Numeric Columns', detected.count('numeric')],
            ['Categorical Columns', detected.count('categorical')],
            ['Text Columns', detected.count('text')],
            ['Missing Values', df.isnull().sum().sum()],
            ['Duplicate Rows', df.duplicated().sum()],
            ['Memory Usage (MB)', f"{df.memory_usage(deep=True).sum() / (1024*1024):.2f}"]
        ]
        self.add_section('Dataset Overview', overview_data, 'table')

        # Column types table
        column_data = [['Column Name', 'Data Type', 'Detected Type', 'Non-Null Count', 'Null Count']]
        for col in df.columns:
            column_data.append([
                col,
                str(df[col].dtype),
                column_types.get(col, 'unknown'),
                f"{df[col].count():,}",
                f"{df[col].isnull().sum():,}"
            ])
        self.add_section('Column Details', column_data, 'table')

    def add_eda_summary(self, eda_stats: Dict[str, pd.DataFrame]) -> None:
        """
        Add EDA summary section.

        Args:
            eda_stats: Dictionary of EDA statistics; the 'numeric' and
                'categorical' entries are rendered when present
        """
        if 'numeric' in eda_stats:
            # reset_index() turns the index into the first table column.
            numeric_df = eda_stats['numeric'].reset_index()
            numeric_data = [['Statistic'] + numeric_df.columns[1:].tolist()]
            numeric_data.extend(self._rows_as_strings(numeric_df))
            self.add_section('Numeric Columns Statistics', numeric_data, 'table')

        if 'categorical' in eda_stats:
            cat_df = eda_stats['categorical'].reset_index()
            cat_data = [['Column'] + cat_df.columns[1:].tolist()]
            cat_data.extend(self._rows_as_strings(cat_df.head(20)))  # Limit to 20 rows
            self.add_section('Categorical Columns Summary', cat_data, 'table')

    def add_model_results(self, model_info: Dict, comparison_df: pd.DataFrame) -> None:
        """
        Add model results section.

        Missing or non-numeric entries in ``model_info`` are rendered as text
        (e.g. 'N/A') instead of raising while formatting.

        Args:
            model_info: Dictionary with model information
            comparison_df: Model comparison dataframe
        """
        task_type = str(model_info.get('task_type', 'N/A')).title()
        models_trained = ', '.join(str(m) for m in model_info.get('models_trained', []))
        training_samples = self._format_count(model_info.get('training_samples', 'N/A'))
        test_samples = self._format_count(model_info.get('test_samples', 'N/A'))

        # Model info
        info_text = "<br/>\n".join([
            f"<b>Task Type:</b> {task_type}",
            f"<b>Best Model:</b> {model_info.get('best_model', 'N/A')}",
            f"<b>Models Trained:</b> {models_trained}",
            f"<b>Feature Count:</b> {model_info.get('feature_count', 'N/A')}",
            f"<b>Training Samples:</b> {training_samples}",
            f"<b>Test Samples:</b> {test_samples}",
        ])
        self.add_section('Model Training Information', info_text, 'text')

        # Comparison table
        if not comparison_df.empty:
            comp_data = [comparison_df.columns.tolist()]
            comp_data.extend(self._rows_as_strings(comparison_df))
            self.add_section('Model Comparison', comp_data, 'table')

    def add_feature_importance(self, importance_df: pd.DataFrame,
                               top_n: int = 15) -> None:
        """
        Add feature importance section.

        Nothing is added when ``importance_df`` is None or empty.

        Args:
            importance_df: Feature importance dataframe; the 'feature' and
                'importance' columns are read
            top_n: Number of top features to include
        """
        if importance_df is None or importance_df.empty:
            return

        top_features = importance_df.head(top_n)
        imp_data = [['Rank', 'Feature', 'Importance']]
        for rank, (_, row) in enumerate(top_features.iterrows(), 1):
            imp_data.append([str(rank), row['feature'], f"{row['importance']:.6f}"])
        self.add_section(f'Top {top_n} Feature Importance', imp_data, 'table')

    def add_insights(self, insights: Dict[str, Any]) -> None:
        """
        Add key insights section.

        Only the keys present in ``insights`` are rendered; list-like entries
        are truncated to their first five items.

        Args:
            insights: Dictionary of insights
        """
        parts: List[str] = []

        if 'dataset_shape' in insights:
            parts.append(f"<b>Dataset Shape:</b> {insights['dataset_shape'][0]:,} rows × {insights['dataset_shape'][1]} columns<br/><br/>")

        if 'missing_percentage' in insights:
            parts.append(f"<b>Missing Values:</b> {insights['missing_percentage']:.2f}% of all values<br/><br/>")

        if 'duplicate_rows' in insights:
            parts.append(f"<b>Duplicate Rows:</b> {insights['duplicate_rows']:,}<br/><br/>")

        if 'high_correlation_pairs' in insights and insights['high_correlation_pairs']:
            parts.append("<b>High Correlation Pairs (>0.8):</b><br/>")
            for pair in insights['high_correlation_pairs'][:5]:
                parts.append(f" • {pair['feature1']} & {pair['feature2']}: {pair['correlation']:.3f}<br/>")
            parts.append("<br/>")

        if 'highly_skewed_features' in insights and insights['highly_skewed_features']:
            parts.append("<b>Highly Skewed Features:</b><br/>")
            for feat in insights['highly_skewed_features'][:5]:
                parts.append(f" • {feat['feature']}: skewness = {feat['skewness']:.2f}<br/>")
            parts.append("<br/>")

        if 'class_balance' in insights:
            parts.append("<b>Class Distribution:</b><br/>")
            for cls, prop in list(insights['class_balance'].items())[:5]:
                parts.append(f" • {cls}: {prop*100:.1f}%<br/>")
            if insights.get('is_imbalanced'):
                parts.append("<br/><i>Note: Dataset appears to be imbalanced.</i>")

        self.add_section('Key Insights', "".join(parts), 'text')

    def add_image(self, title: str, image_buffer: io.BytesIO,
                  width: float = 6*inch, height: float = 4*inch) -> None:
        """
        Add an image to the report.

        Args:
            title: Image section title
            image_buffer: BytesIO buffer containing the image
            width: Image width
            height: Image height
        """
        self.add_section(title, {'buffer': image_buffer, 'width': width, 'height': height}, 'image')

    def _build_data_table(self, table_data: List[List[Any]]) -> Table:
        """Build the styled table used for 'table' sections (first row is the header)."""
        table = Table(table_data, repeatRows=1)
        table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2c3e50')),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, 0), 10),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
            ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
            ('GRID', (0, 0), (-1, -1), 1, colors.black),
            ('FONTSIZE', (0, 1), (-1, -1), 9),
            # Listed after the beige BACKGROUND command for the same cell range.
            ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.lightgrey])
        ]))
        return table

    def _build_metrics_table(self, metrics: Dict[str, Any]) -> Table:
        """Build the two-column Metric/Value table used for 'metrics' sections."""
        metrics_data = [['Metric', 'Value']]
        metrics_data.extend([key, str(value)] for key, value in metrics.items())
        table = Table(metrics_data)
        table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#3498db')),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
            ('GRID', (0, 0), (-1, -1), 1, colors.black)
        ]))
        return table

    def _render_section(self, section: Dict) -> List[Any]:
        """
        Turn one queued section into its flowables.

        A section with an unrecognised type, or a 'table' section with empty
        content, yields only its heading and the trailing spacer.
        """
        flowables: List[Any] = [Paragraph(section['title'], self.styles['CustomHeading2'])]
        kind = section['type']
        content = section['content']

        if kind == 'text':
            flowables.append(Paragraph(content, self.styles['CustomBody']))
        elif kind == 'table':
            if content:
                flowables.append(self._build_data_table(content))
        elif kind == 'image':
            flowables.append(Image(content['buffer'],
                                   width=content['width'],
                                   height=content['height']))
        elif kind == 'metrics':
            flowables.append(self._build_metrics_table(content))

        flowables.append(Spacer(1, 0.3*inch))
        return flowables

    def generate_pdf(self, output_path: Optional[str] = None) -> Union[str, io.BytesIO]:
        """
        Generate the PDF report.

        The document consists of a title block, a fixed executive summary,
        a page break, every queued section in insertion order, and a footer.

        Args:
            output_path: Path to save the PDF (if None, returns BytesIO)

        Returns:
            Path to saved PDF or BytesIO buffer (rewound to position 0)
        """
        # Render to the given path when provided, otherwise into memory.
        buffer = None if output_path else io.BytesIO()
        doc = SimpleDocTemplate(
            output_path if output_path else buffer,
            pagesize=A4,
            rightMargin=72,
            leftMargin=72,
            topMargin=72,
            bottomMargin=18
        )

        # Build document content
        story: List[Any] = []

        # Title page
        story.append(Paragraph(self.title, self.styles['CustomTitle']))
        story.append(Spacer(1, 0.3*inch))
        story.append(Paragraph(
            f"Generated by InsightGenAI on {datetime.now().strftime('%Y-%m-%d %H:%M')}",
            self.styles['CustomBody']
        ))
        story.append(Spacer(1, 0.5*inch))

        # Executive summary
        story.append(Paragraph("Executive Summary", self.styles['CustomHeading2']))
        story.append(Paragraph(
            "This report provides a comprehensive analysis of your dataset, "
            "including exploratory data analysis, automated machine learning results, "
            "model performance metrics, and key insights derived from the data.",
            self.styles['CustomBody']
        ))
        story.append(PageBreak())

        # Add sections
        for section in self.sections:
            story.extend(self._render_section(section))

        # Footer
        story.append(Spacer(1, 0.5*inch))
        story.append(Paragraph(
            "---<br/>Report generated by InsightGenAI - LLM Powered AutoML Data Analyst",
            self.styles['CustomBody']
        ))

        # Build PDF
        doc.build(story)

        if output_path:
            return output_path
        buffer.seek(0)
        return buffer

    def save_figure_to_buffer(self, fig: plt.Figure) -> io.BytesIO:
        """
        Save matplotlib figure to BytesIO buffer.

        Args:
            fig: Matplotlib figure

        Returns:
            BytesIO buffer with PNG image data, rewound to position 0
        """
        buffer = io.BytesIO()
        fig.savefig(buffer, format='png', dpi=150, bbox_inches='tight')
        buffer.seek(0)
        return buffer
def generate_full_report(data_loader, eda_engine, automl_engine,
                         explainer, insights: Dict) -> io.BytesIO:
    """
    Generate a complete analysis report.

    Each part is added only when its source is available: the dataset
    overview when ``data_loader.df`` is not None, the EDA summary when
    ``eda_engine`` is truthy, model results and feature importance when
    ``automl_engine`` has results, and insights when ``insights`` is
    non-empty. ``explainer`` is accepted but not used here.

    Args:
        data_loader: DataLoader instance
        eda_engine: EDAEngine instance
        automl_engine: AutoMLEngine instance
        explainer: Explainability engine instance
        insights: Dictionary of insights

    Returns:
        BytesIO buffer containing the PDF
    """
    builder = ReportGenerator("InsightGenAI - Data Analysis Report")

    # Dataset overview
    if data_loader.df is not None:
        builder.add_dataset_overview(data_loader.df, data_loader.column_types)

    # EDA summary
    if eda_engine:
        builder.add_eda_summary(eda_engine.get_summary_statistics())

    # Model results, followed by feature importance from the same engine
    if automl_engine and automl_engine.results:
        builder.add_model_results(
            automl_engine.get_model_info(),
            automl_engine.get_comparison_table(),
        )
        builder.add_feature_importance(automl_engine.get_feature_importance())

    # Insights
    if insights:
        builder.add_insights(insights)

    # No output path is given, so the PDF comes back as an in-memory buffer.
    return builder.generate_pdf()
| # Streamlit display functions | |
def display_report_download(data_loader, eda_engine, automl_engine,
                            explainer, insights: Dict):
    """
    Display report download button in Streamlit.

    Shows a "Generate PDF Report" button; when it is pressed, the report is
    built with ``generate_full_report`` and offered through a download
    button. Any exception raised while building is shown via ``st.error``.

    Args:
        data_loader: Passed through to ``generate_full_report``
        eda_engine: Passed through to ``generate_full_report``
        automl_engine: Passed through to ``generate_full_report``
        explainer: Passed through to ``generate_full_report``
        insights: Dictionary of insights, passed through as well
    """
    st.subheader("📄 Generate Report")

    # Nothing else to render until the user asks for a report.
    if not st.button("Generate PDF Report", type="primary"):
        return

    with st.spinner("Generating report... This may take a moment."):
        try:
            report_pdf = generate_full_report(
                data_loader, eda_engine, automl_engine,
                explainer, insights
            )
            # Timestamped name, computed after the report has been built.
            download_name = f"insightgenai_report_{datetime.now().strftime('%Y%m%d_%H%M')}.pdf"
            st.download_button(
                label="📥 Download PDF Report",
                data=report_pdf,
                file_name=download_name,
                mime="application/pdf"
            )
            st.success("Report generated successfully!")
        except Exception as exc:
            st.error(f"Error generating report: {str(exc)}")