Spaces:

shukdevdattaEX
/

Data-Summarizer-Excel-CSV

Paused

App Files Files Community

shukdevdattaEX committed on Sep 1, 2025

Commit

939abbc

verified ·

1 Parent(s): 929709a

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -156

app.py CHANGED Viewed

@@ -3,7 +3,6 @@ import pandas as pd
 import aiohttp
 import asyncio
 import json
-import io
 import os
 import numpy as np
 import plotly.express as px
@@ -12,10 +11,7 @@ from typing import Optional, Tuple, Dict, Any
 import logging
 from datetime import datetime
 import re
-import base64
-from io import BytesIO
-import weasyprint  # For PDF generation
-from jinja2 import Template  # For HTML templating
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -58,25 +54,18 @@ class EnhancedDataAnalyzer:
         # Create context-aware prompt
         if user_question:
             prompt = f"""You are a data analyst expert. Based on this dataset:
 {data_summary}
 User's specific question: {user_question}
 Provide a detailed, actionable answer with specific data points and recommendations."""
         else:
             prompt = f"""You are a senior data analyst. Analyze this dataset thoroughly:
 {data_summary}
 Provide a comprehensive analysis including:
 1. **Key Statistical Insights**: Most important numbers and what they mean
 2. **Patterns & Trends**: Notable patterns, correlations, or anomalies
 3. **Data Quality Assessment**: Missing values, outliers, data consistency
 4. **Business Intelligence**: Actionable insights and opportunities
 5. **Recommendations**: Specific next steps or areas to investigate
 Format your response with clear sections and bullet points for readability."""
         body = {
@@ -93,12 +82,12 @@ Format your response with clear sections and bullet points for readability."""
             ],
             "stream": True,
             "max_tokens": 3000,
-            "temperature": 0.2,  # Very low for consistent analysis
             "top_p": 0.9
         }
         try:
-            timeout = aiohttp.ClientTimeout(total=30)  # 30 second timeout
             async with aiohttp.ClientSession(timeout=timeout) as session:
                 async with session.post(self.api_base_url, headers=headers, json=body) as response:
                     if response.status == 401:
@@ -138,9 +127,7 @@ Format your response with clear sections and bullet points for readability."""
         try:
             file_extension = os.path.splitext(file_path)[1].lower()
-            # Read file with better error handling
             if file_extension == '.csv':
-                # Try different encodings
                 for encoding in ['utf-8', 'latin-1', 'cp1252']:
                     try:
                         df = pd.read_csv(file_path, encoding=encoding)
@@ -154,13 +141,8 @@ Format your response with clear sections and bullet points for readability."""
             else:
                 raise ValueError("Unsupported file format. Please upload CSV or Excel files.")
-            # Clean column names
             df.columns = df.columns.str.strip().str.replace(r'\s+', ' ', regex=True)
-            # Store dataframe for visualizations
             self.current_df = df
-            # Generate enhanced summaries
             data_summary = self.generate_enhanced_summary(df)
             charts_html = self.generate_visualizations(df)
@@ -172,23 +154,17 @@ Format your response with clear sections and bullet points for readability."""
     def generate_enhanced_summary(self, df: pd.DataFrame) -> str:
         """Generate comprehensive data summary with statistical insights"""
         summary = []
-        # Header with timestamp
         summary.append(f"# 📊 Dataset Analysis Report")
         summary.append(f"**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
         summary.append(f"**File Size**: {df.shape[0]:,} rows × {df.shape[1]} columns")
-        # Memory usage
         memory_usage = df.memory_usage(deep=True).sum() / 1024**2
         summary.append(f"**Memory Usage**: {memory_usage:.2f} MB\n")
-        # Data types breakdown
         type_counts = df.dtypes.value_counts()
         summary.append("## 📋 Column Types:")
         for dtype, count in type_counts.items():
             summary.append(f"- **{dtype}**: {count} columns")
-        # Missing data analysis
         missing_data = df.isnull().sum()
         missing_pct = (missing_data / len(df) * 100).round(2)
         missing_summary = missing_data[missing_data > 0].sort_values(ascending=False)
@@ -201,26 +177,23 @@ Format your response with clear sections and bullet points for readability."""
         else:
             summary.append("\n## ✅ Data Quality: No missing values detected!")
-        # Numerical analysis
         numeric_cols = df.select_dtypes(include=[np.number]).columns
         if len(numeric_cols) > 0:
             summary.append(f"\n## 📈 Numerical Columns Analysis ({len(numeric_cols)} columns):")
-            for col in numeric_cols[:10]:  # Limit to first 10
                 stats = df[col].describe()
                 outliers = len(df[df[col] > (stats['75%'] + 1.5 * (stats['75%'] - stats['25%']))])
                 summary.append(f"- **{col}**: μ={stats['mean']:.2f}, σ={stats['std']:.2f}, outliers={outliers}")
-        # Categorical analysis
         categorical_cols = df.select_dtypes(include=['object', 'category']).columns
         if len(categorical_cols) > 0:
             summary.append(f"\n## 📝 Categorical Columns Analysis ({len(categorical_cols)} columns):")
-            for col in categorical_cols[:10]:  # Limit to first 10
                 unique_count = df[col].nunique()
                 cardinality = "High" if unique_count > len(df) * 0.9 else "Medium" if unique_count > 10 else "Low"
                 most_common = df[col].mode().iloc[0] if len(df[col].mode()) > 0 else "N/A"
                 summary.append(f"- **{col}**: {unique_count:,} unique values ({cardinality} cardinality), Top: '{most_common}'")
-        # Sample data with better formatting
         summary.append("\n## 🔍 Data Sample (First 3 Rows):")
         sample_df = df.head(3)
         for idx, row in sample_df.iterrows():
@@ -235,7 +208,6 @@ Format your response with clear sections and bullet points for readability."""
         charts_html = []
         try:
-            # Chart 1: Data completeness analysis
             missing_data = df.isnull().sum()
             if missing_data.sum() > 0:
                 fig = px.bar(
@@ -255,7 +227,6 @@ Format your response with clear sections and bullet points for readability."""
                 charts_html.append(f"<h3>📊 Data Quality Overview</h3>")
                 charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="missing_data_chart"))
-            # Chart 2: Numerical columns correlation heatmap
             numeric_cols = df.select_dtypes(include=[np.number]).columns
             if len(numeric_cols) > 1:
                 corr_matrix = df[numeric_cols].corr()
@@ -270,9 +241,8 @@ Format your response with clear sections and bullet points for readability."""
                 charts_html.append(f"<h3>📈 Correlation Analysis</h3>")
                 charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="correlation_chart"))
-            # Chart 3: Distribution plots for numerical columns
             if len(numeric_cols) > 0:
-                for i, col in enumerate(numeric_cols[:3]):  # First 3 numeric columns
                     fig = px.histogram(
                         df,
                         x=col,
@@ -285,11 +255,10 @@ Format your response with clear sections and bullet points for readability."""
                         charts_html.append(f"<h3>📈 Data Distributions</h3>")
                     charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id=f"dist_chart_{i}"))
-            # Chart 4: Categorical analysis
             categorical_cols = df.select_dtypes(include=['object', 'category']).columns
             if len(categorical_cols) > 0:
-                for i, col in enumerate(categorical_cols[:2]):  # First 2 categorical columns
-                    if df[col].nunique() <= 20:  # Only if reasonable number of categories
                         value_counts = df[col].value_counts().head(10)
                         fig = px.bar(
                             x=value_counts.values,
@@ -303,7 +272,6 @@ Format your response with clear sections and bullet points for readability."""
                             charts_html.append(f"<h3>📝 Categorical Data Analysis</h3>")
                         charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id=f"cat_chart_{i}"))
-            # Chart 5: Data overview summary
             summary_data = {
                 'Metric': ['Total Rows', 'Total Columns', 'Numeric Columns', 'Categorical Columns', 'Missing Values'],
                 'Count': [
@@ -327,9 +295,7 @@ Format your response with clear sections and bullet points for readability."""
             charts_html.append(f"<h3>📊 Dataset Overview</h3>")
             charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="overview_chart"))
-            # Store charts for export
             self.current_charts = charts_html
             return "\n".join(charts_html) if charts_html else "<p>No charts could be generated for this dataset.</p>"
         except Exception as e:
@@ -337,8 +303,7 @@ Format your response with clear sections and bullet points for readability."""
             return f"<p>❌ Chart generation failed: {str(e)}</p>"
     def generate_report_html(self, analysis_text: str, data_summary: str, file_name: str = "Unknown") -> str:
-        """Generate HTML report with embedded charts"""
         html_template = """
         <!DOCTYPE html>
         <html>
@@ -377,7 +342,11 @@ Format your response with clear sections and bullet points for readability."""
                     border-radius: 8px;
                     border-left: 4px solid #667eea;
                 }
-                h1, h2, h3 { color: #2c3e50; }
                 .metadata {
                     background: #e8f4f8;
                     padding: 15px;
@@ -398,8 +367,43 @@ Format your response with clear sections and bullet points for readability."""
                     border-radius: 5px;
                     overflow-x: auto;
                     white-space: pre-wrap;
                 }
             </style>
         </head>
         <body>
             <div class="header">
@@ -415,6 +419,7 @@ Format your response with clear sections and bullet points for readability."""
             <div class="section">
                 <h2>🎯 AI Analysis & Insights</h2>
                 <div>{{ ai_analysis }}</div>
             </div>
@@ -439,13 +444,7 @@ Format your response with clear sections and bullet points for readability."""
         """
         template = Template(html_template)
-        # Convert markdown to HTML for AI analysis
-        ai_analysis_html = analysis_text.replace('\n', '<br>')
-        ai_analysis_html = re.sub(r'\*\*(.*?)\*\*', r'<strong>\1</strong>', ai_analysis_html)
-        ai_analysis_html = re.sub(r'## (.*?)\n', r'<h3>\1</h3>', ai_analysis_html)
-        ai_analysis_html = re.sub(r'# (.*?)\n', r'<h2>\1</h2>', ai_analysis_html)
         charts_content = "\n".join(self.current_charts) if self.current_charts else "<p>No visualizations available</p>"
         return template.render(
@@ -456,52 +455,37 @@ Format your response with clear sections and bullet points for readability."""
             data_summary=data_summary
         )
-# Initialize the analyzer
 analyzer = EnhancedDataAnalyzer()
 async def analyze_data(file, api_key, user_question="", progress=gr.Progress()):
-    """Enhanced analysis function with progress tracking"""
     if not file:
         return "❌ Please upload a CSV or Excel file.", "", "", "", None
     if not analyzer.validate_api_key(api_key):
         return "❌ Please enter a valid Chutes API key (minimum 10 characters).", "", "", "", None
-    # Validate file
     is_valid, validation_msg = analyzer.validate_file(file)
     if not is_valid:
         return f"❌ {validation_msg}", "", "", "", None
     progress(0.1, desc="📁 Reading file...")
     try:
-        # Process the uploaded file
         df, data_summary, charts_html = analyzer.process_file(file.name)
         progress(0.3, desc="📊 Processing data...")
         progress(0.5, desc="🤖 Generating AI insights...")
-        # Get AI analysis
         ai_analysis = await analyzer.analyze_with_chutes(api_key, data_summary, user_question)
         progress(0.9, desc="✨ Finalizing results...")
-        # Format the complete response
         response = f"""# 🎯 Analysis Complete!
 {ai_analysis}
 ---
 *Analysis powered by OpenAI gpt-oss-20b via Chutes • Generated at {datetime.now().strftime('%H:%M:%S')}*
 """
-        # Generate data preview
         data_preview_html = df.head(15).to_html(
             classes="table table-striped table-hover",
             table_id="data-preview-table",
             escape=False
         )
-        # Add some styling to the preview
         styled_preview = f"""
         <style>
             #data-preview-table {{
@@ -536,17 +520,14 @@ async def analyze_data(file, api_key, user_question="", progress=gr.Progress()):
         return f"❌ **Error**: {str(e)}", "", "", "", None
 def sync_analyze_data(file, api_key, user_question="", progress=gr.Progress()):
-    """Synchronous wrapper for the async analyze function"""
     return asyncio.run(analyze_data(file, api_key, user_question, progress))
 def clear_all():
-    """Clear all inputs and outputs"""
     analyzer.current_df = None
     analyzer.current_charts = None
     return None, "", "", "", "", "", "", None
 def download_report(analysis_text, data_summary, file_name, format_choice):
-    """Generate downloadable report in PDF or HTML format"""
     if not analysis_text:
         return None, "❌ No analysis data available for download."
@@ -555,47 +536,30 @@ def download_report(analysis_text, data_summary, file_name, format_choice):
     try:
         if format_choice == "HTML":
-            # Generate HTML report
             html_content = analyzer.generate_report_html(analysis_text, data_summary, file_name)
             filename = f"{file_base_name}_analysis_report_{timestamp}.html"
             with open(filename, 'w', encoding='utf-8') as f:
                 f.write(html_content)
             return filename, f"✅ HTML report generated successfully! File: {filename}"
-        elif format_choice == "PDF":
-            # Generate PDF report
-            html_content = analyzer.generate_report_html(analysis_text, data_summary, file_name)
-            filename = f"{file_base_name}_analysis_report_{timestamp}.pdf"
-            # Convert HTML to PDF using weasyprint
-            weasyprint.HTML(string=html_content).write_pdf(filename)
-            return filename, f"✅ PDF report generated successfully! File: {filename}"
-        else:  # Markdown fallback
             report = f"""# Data Analysis Report
 Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
 File: {file_name}
 ## AI Analysis:
 {analysis_text}
 ## Raw Data Summary:
 {data_summary}
 """
             filename = f"{file_base_name}_analysis_report_{timestamp}.md"
             with open(filename, 'w', encoding='utf-8') as f:
                 f.write(report)
             return filename, f"✅ Markdown report generated successfully! File: {filename}"
     except Exception as e:
         logger.error(f"Report generation error: {str(e)}")
         return None, f"❌ Error generating report: {str(e)}"
-# Create enhanced Gradio interface
 with gr.Blocks(
     title="🚀 Smart Data Analyzer Pro",
     theme=gr.themes.Ocean(),
@@ -613,33 +577,20 @@ with gr.Blocks(
         text-align: center;
         background: #f8f9ff;
     }
-    .charts-container {
-        max-height: 800px;
-        overflow-y: auto;
-        padding: 10px;
-        background: #fafafa;
-        border-radius: 8px;
-    }
     """
 ) as app:
-    # Store file name for downloads
     current_file_name = gr.State("")
-    # Header
     gr.Markdown("""
     # 🚀 Smart Data Analyzer Pro
     ### AI-Powered Excel & CSV Analysis with OpenAI gpt-oss-20b
-    Upload your data files and get instant professional insights, visualizations, and downloadable reports!
     """)
-    # Main interface
     with gr.Row():
         with gr.Column(scale=1):
-            # Configuration section
             gr.Markdown("### ⚙️ Configuration")
             api_key_input = gr.Textbox(
                 label="🔑 Chutes API Key",
                 placeholder="sk-chutes-your-api-key-here...",
@@ -647,19 +598,15 @@ with gr.Blocks(
                 lines=1,
                 info="Get your free API key from chutes.ai"
             )
             file_input = gr.File(
                 label="📁 Upload Data File",
                 file_types=[".csv", ".xlsx", ".xls"],
                 file_count="single",
                 elem_classes=["upload-area"]
             )
             with gr.Row():
                 analyze_btn = gr.Button("🚀 Analyze Data", variant="primary", size="lg")
                 clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
-            # Quick stats display
             with gr.Group():
                 gr.Markdown("### 📊 Quick Stats")
                 file_stats = gr.Textbox(
@@ -670,15 +617,12 @@ with gr.Blocks(
                 )
         with gr.Column(scale=2):
-            # Results section
             gr.Markdown("### 🎯 Analysis Results")
             analysis_output = gr.Markdown(
                 value="📋 **Ready to analyze your data!**\n\nUpload a CSV or Excel file and click 'Analyze Data' to get started.",
                 show_label=False
             )
-    # Advanced features in tabs
     with gr.Tabs():
         with gr.Tab("💬 Ask Questions"):
             question_input = gr.Textbox(
@@ -695,14 +639,6 @@ with gr.Blocks(
                 value="<p>Upload a file to see data preview...</p>"
             )
-        with gr.Tab("📈 Visualizations"):
-            charts_output = gr.HTML(
-                label="Auto-Generated Charts",
-                value="<div class='charts-container'><p>📊 Interactive charts will appear here after analysis...</p></div>",
-                elem_classes=["charts-container"],
-                visible=False
-            )
         with gr.Tab("🔍 Raw Summary"):
             raw_summary = gr.Textbox(
                 label="Detailed Data Summary",
@@ -713,56 +649,47 @@ with gr.Blocks(
         with gr.Tab("💾 Export Reports"):
             gr.Markdown("### 📥 Download Your Analysis Report")
             with gr.Row():
                 format_choice = gr.Radio(
-                    choices=["HTML", "PDF", "Markdown"],
                     value="HTML",
                     label="📄 Report Format",
                     info="Choose your preferred download format"
                 )
             download_btn = gr.Button("📥 Generate & Download Report", variant="primary", size="lg")
             download_status = gr.Textbox(label="Download Status", interactive=False)
             download_file = gr.File(label="📄 Download Link", visible=True)
-    # Event handlers
     def update_file_stats(file):
         if not file:
             return "No file uploaded"
         try:
-            file_size = os.path.getsize(file.name) / (1024 * 1024)  # MB
             file_name = os.path.basename(file.name)
             return f"📄 **File**: {file_name}\n📏 **Size**: {file_size:.2f} MB\n⏰ **Uploaded**: {datetime.now().strftime('%H:%M:%S')}"
         except:
             return "File information unavailable"
     def handle_analysis(file, api_key, user_question="", progress=gr.Progress()):
-        """Handle main analysis and return all outputs including file name"""
         result = sync_analyze_data(file, api_key, user_question, progress)
-        if len(result) == 5:  # Check if file name was returned
-            return result[0], result[1], result[2], result[3], result[4]  # analysis, summary, preview, charts, filename
         else:
-            return result[0], result[1], result[2], result[3], ""  # fallback without filename
     def handle_question_analysis(file, api_key, question, progress=gr.Progress()):
-        """Handle question-specific analysis"""
         if not question.strip():
             return "❓ Please enter a specific question about your data."
         result = sync_analyze_data(file, api_key, question, progress)
-        return result[0]  # Return only the analysis output
-    # Main analysis event
     analyze_btn.click(
         fn=handle_analysis,
         inputs=[file_input, api_key_input, gr.Textbox(value="", visible=False)],
-        outputs=[analysis_output, raw_summary, data_preview, charts_output, current_file_name],
         show_progress=True
     )
-    # Follow-up questions
     ask_btn.click(
         fn=handle_question_analysis,
         inputs=[file_input, api_key_input, question_input],
@@ -770,28 +697,24 @@ with gr.Blocks(
         show_progress=True
     )
-    # File stats update
     file_input.change(
         fn=update_file_stats,
         inputs=[file_input],
         outputs=[file_stats]
     )
-    # Clear functionality
     clear_btn.click(
         fn=clear_all,
         outputs=[file_input, api_key_input, question_input, analysis_output,
-                question_output, data_preview, charts_output, raw_summary]
     )
-    # Enhanced download functionality
     download_btn.click(
         fn=download_report,
         inputs=[analysis_output, raw_summary, current_file_name, format_choice],
         outputs=[download_file, download_status]
     )
-    # Footer with usage tips
     gr.Markdown("""
     ---
     ### 💡 Pro Tips for Better Analysis:
@@ -801,16 +724,8 @@ with gr.Blocks(
     - Use descriptive column names
     - Ask specific questions like "What drives the highest profits?" instead of "Analyze this data"
-    **📊 Visualizations Include:**
-    - Missing data analysis
-    - Correlation matrices for numerical data
-    - Distribution plots and histograms
-    - Top categories for categorical data
-    - Dataset overview metrics
     **📥 Export Options:**
-    - **HTML**: Interactive report with embedded charts
-    - **PDF**: Professional report for presentations
     - **Markdown**: Simple text format for documentation
     **⚡ Speed Optimization:**
@@ -821,13 +736,6 @@ with gr.Blocks(
     **🔧 Supported Formats:** CSV, XLSX, XLS | **📏 Max Size:** 50MB | **🚀 Response Time:** ~3-5 seconds
     """)
-def sync_analyze_data(file, api_key, user_question="", progress=gr.Progress()):
-    """Synchronous wrapper for the async analyze function"""
-    return asyncio.run(analyze_data(file, api_key, user_question, progress))
-# Launch configuration
 if __name__ == "__main__":
-    app.queue(max_size=10)  # Handle multiple users
-    app.launch(
-        share=True
-    )

 import aiohttp
 import asyncio
 import json
 import os
 import numpy as np
 import plotly.express as px
 import logging
 from datetime import datetime
 import re
+from jinja2 import Template
 # Configure logging
 logging.basicConfig(level=logging.INFO)
         # Create context-aware prompt
         if user_question:
             prompt = f"""You are a data analyst expert. Based on this dataset:
 {data_summary}
 User's specific question: {user_question}
 Provide a detailed, actionable answer with specific data points and recommendations."""
         else:
             prompt = f"""You are a senior data analyst. Analyze this dataset thoroughly:
 {data_summary}
 Provide a comprehensive analysis including:
 1. **Key Statistical Insights**: Most important numbers and what they mean
 2. **Patterns & Trends**: Notable patterns, correlations, or anomalies
 3. **Data Quality Assessment**: Missing values, outliers, data consistency
 4. **Business Intelligence**: Actionable insights and opportunities
 5. **Recommendations**: Specific next steps or areas to investigate
 Format your response with clear sections and bullet points for readability."""
         body = {
             ],
             "stream": True,
             "max_tokens": 3000,
+            "temperature": 0.2,
             "top_p": 0.9
         }
         try:
+            timeout = aiohttp.ClientTimeout(total=30)
             async with aiohttp.ClientSession(timeout=timeout) as session:
                 async with session.post(self.api_base_url, headers=headers, json=body) as response:
                     if response.status == 401:
         try:
             file_extension = os.path.splitext(file_path)[1].lower()
             if file_extension == '.csv':
                 for encoding in ['utf-8', 'latin-1', 'cp1252']:
                     try:
                         df = pd.read_csv(file_path, encoding=encoding)
             else:
                 raise ValueError("Unsupported file format. Please upload CSV or Excel files.")
             df.columns = df.columns.str.strip().str.replace(r'\s+', ' ', regex=True)
             self.current_df = df
             data_summary = self.generate_enhanced_summary(df)
             charts_html = self.generate_visualizations(df)
     def generate_enhanced_summary(self, df: pd.DataFrame) -> str:
         """Generate comprehensive data summary with statistical insights"""
         summary = []
         summary.append(f"# 📊 Dataset Analysis Report")
         summary.append(f"**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
         summary.append(f"**File Size**: {df.shape[0]:,} rows × {df.shape[1]} columns")
         memory_usage = df.memory_usage(deep=True).sum() / 1024**2
         summary.append(f"**Memory Usage**: {memory_usage:.2f} MB\n")
         type_counts = df.dtypes.value_counts()
         summary.append("## 📋 Column Types:")
         for dtype, count in type_counts.items():
             summary.append(f"- **{dtype}**: {count} columns")
         missing_data = df.isnull().sum()
         missing_pct = (missing_data / len(df) * 100).round(2)
         missing_summary = missing_data[missing_data > 0].sort_values(ascending=False)
         else:
             summary.append("\n## ✅ Data Quality: No missing values detected!")
         numeric_cols = df.select_dtypes(include=[np.number]).columns
         if len(numeric_cols) > 0:
             summary.append(f"\n## 📈 Numerical Columns Analysis ({len(numeric_cols)} columns):")
+            for col in numeric_cols[:10]:
                 stats = df[col].describe()
                 outliers = len(df[df[col] > (stats['75%'] + 1.5 * (stats['75%'] - stats['25%']))])
                 summary.append(f"- **{col}**: μ={stats['mean']:.2f}, σ={stats['std']:.2f}, outliers={outliers}")
         categorical_cols = df.select_dtypes(include=['object', 'category']).columns
         if len(categorical_cols) > 0:
             summary.append(f"\n## 📝 Categorical Columns Analysis ({len(categorical_cols)} columns):")
+            for col in categorical_cols[:10]:
                 unique_count = df[col].nunique()
                 cardinality = "High" if unique_count > len(df) * 0.9 else "Medium" if unique_count > 10 else "Low"
                 most_common = df[col].mode().iloc[0] if len(df[col].mode()) > 0 else "N/A"
                 summary.append(f"- **{col}**: {unique_count:,} unique values ({cardinality} cardinality), Top: '{most_common}'")
         summary.append("\n## 🔍 Data Sample (First 3 Rows):")
         sample_df = df.head(3)
         for idx, row in sample_df.iterrows():
         charts_html = []
         try:
             missing_data = df.isnull().sum()
             if missing_data.sum() > 0:
                 fig = px.bar(
                 charts_html.append(f"<h3>📊 Data Quality Overview</h3>")
                 charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="missing_data_chart"))
             numeric_cols = df.select_dtypes(include=[np.number]).columns
             if len(numeric_cols) > 1:
                 corr_matrix = df[numeric_cols].corr()
                 charts_html.append(f"<h3>📈 Correlation Analysis</h3>")
                 charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="correlation_chart"))
             if len(numeric_cols) > 0:
+                for i, col in enumerate(numeric_cols[:3]):
                     fig = px.histogram(
                         df,
                         x=col,
                         charts_html.append(f"<h3>📈 Data Distributions</h3>")
                     charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id=f"dist_chart_{i}"))
             categorical_cols = df.select_dtypes(include=['object', 'category']).columns
             if len(categorical_cols) > 0:
+                for i, col in enumerate(categorical_cols[:2]):
+                    if df[col].nunique() <= 20:
                         value_counts = df[col].value_counts().head(10)
                         fig = px.bar(
                             x=value_counts.values,
                             charts_html.append(f"<h3>📝 Categorical Data Analysis</h3>")
                         charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id=f"cat_chart_{i}"))
             summary_data = {
                 'Metric': ['Total Rows', 'Total Columns', 'Numeric Columns', 'Categorical Columns', 'Missing Values'],
                 'Count': [
             charts_html.append(f"<h3>📊 Dataset Overview</h3>")
             charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="overview_chart"))
             self.current_charts = charts_html
             return "\n".join(charts_html) if charts_html else "<p>No charts could be generated for this dataset.</p>"
         except Exception as e:
             return f"<p>❌ Chart generation failed: {str(e)}</p>"
     def generate_report_html(self, analysis_text: str, data_summary: str, file_name: str = "Unknown") -> str:
+        """Generate HTML report with embedded charts and print button"""
         html_template = """
         <!DOCTYPE html>
         <html>
                     border-radius: 8px;
                     border-left: 4px solid #667eea;
                 }
+                h1, h2, h3 {
+                    color: #2c3e50;
+                    margin-top: 20px;
+                    margin-bottom: 15px;
+                }
                 .metadata {
                     background: #e8f4f8;
                     padding: 15px;
                     border-radius: 5px;
                     overflow-x: auto;
                     white-space: pre-wrap;
+                    font-size: 14px;
+                }
+                strong {
+                    color: #2c3e50;
+                    font-weight: 600;
+                }
+                .print-button {
+                    background: #667eea;
+                    color: white;
+                    padding: 10px 20px;
+                    border: none;
+                    border-radius: 5px;
+                    cursor: pointer;
+                    font-size: 16px;
+                    margin: 10px 0;
+                    display: inline-block;
+                }
+                .print-button:hover {
+                    background: #764ba2;
+                }
+                @media print {
+                    .print-button {
+                        display: none;
+                    }
+                    body {
+                        background: white;
+                    }
+                    .section, .metadata, .footer {
+                        box-shadow: none;
+                    }
                 }
             </style>
+            <script>
+                function printReport() {
+                    window.print();
+                }
+            </script>
         </head>
         <body>
             <div class="header">
             <div class="section">
                 <h2>🎯 AI Analysis & Insights</h2>
+                <button class="print-button" onclick="printReport()">🖨️ Print as PDF</button>
                 <div>{{ ai_analysis }}</div>
             </div>
         """
         template = Template(html_template)
+        ai_analysis_html = analysis_text
         charts_content = "\n".join(self.current_charts) if self.current_charts else "<p>No visualizations available</p>"
         return template.render(
             data_summary=data_summary
         )
 # Single module-level analyzer instance; the handler functions below call its
 # validation/processing methods and reset its cached state.
 analyzer = EnhancedDataAnalyzer()
 async def analyze_data(file, api_key, user_question="", progress=gr.Progress()):
     if not file:
         return "❌ Please upload a CSV or Excel file.", "", "", "", None
     if not analyzer.validate_api_key(api_key):
         return "❌ Please enter a valid Chutes API key (minimum 10 characters).", "", "", "", None
     is_valid, validation_msg = analyzer.validate_file(file)
     if not is_valid:
         return f"❌ {validation_msg}", "", "", "", None
     progress(0.1, desc="📁 Reading file...")
     try:
         df, data_summary, charts_html = analyzer.process_file(file.name)
         progress(0.3, desc="📊 Processing data...")
         progress(0.5, desc="🤖 Generating AI insights...")
         ai_analysis = await analyzer.analyze_with_chutes(api_key, data_summary, user_question)
         progress(0.9, desc="✨ Finalizing results...")
         response = f"""# 🎯 Analysis Complete!
 {ai_analysis}
 ---
 *Analysis powered by OpenAI gpt-oss-20b via Chutes • Generated at {datetime.now().strftime('%H:%M:%S')}*
 """
         data_preview_html = df.head(15).to_html(
             classes="table table-striped table-hover",
             table_id="data-preview-table",
             escape=False
         )
         styled_preview = f"""
         <style>
             #data-preview-table {{
         return f"❌ **Error**: {str(e)}", "", "", "", None
 def sync_analyze_data(file, api_key, user_question="", progress=gr.Progress()):
     """Run the ``analyze_data`` coroutine to completion and return its result.

     All arguments are forwarded unchanged to ``analyze_data``; this wrapper
     only exists so synchronous callers can use the async implementation.
     """
     pending_analysis = analyze_data(file, api_key, user_question, progress)
     return asyncio.run(pending_analysis)
 def clear_all():
     """Reset the shared analyzer state and return blank values for the UI.

     Returns:
         An 8-tuple: ``None``, six empty strings, then ``None`` -- one value
         per output component wired to the clear button.
     """
     for cached_attr in ("current_df", "current_charts"):
         setattr(analyzer, cached_attr, None)
     empty_text_fields = ("",) * 6
     return (None, *empty_text_fields, None)
 def download_report(analysis_text, data_summary, file_name, format_choice):
     """Write the current analysis to a report file and return its path.

     Args:
         analysis_text: Analysis text to embed in the report.
         data_summary: Raw data summary appended to the report.
         file_name: Name of the analysed file. Its base name (directory and
             extension stripped) prefixes the report file name; when it is
             empty or None the prefix falls back to "data".
         format_choice: "HTML" selects the report built by
             ``analyzer.generate_report_html``; any other value produces a
             Markdown report.

     Returns:
         A ``(path, status_message)`` tuple. ``path`` is None when there is
         no analysis text to export or when generating the report fails.
     """
     if not analysis_text:
         return None, "❌ No analysis data available for download."

     # Both branches below need these to build the output file name, so bind
     # them once up front instead of relying on names defined elsewhere.
     now = datetime.now()
     timestamp = now.strftime('%Y%m%d_%H%M%S')
     file_base_name = os.path.splitext(os.path.basename(file_name or ""))[0] or "data"

     try:
         if format_choice == "HTML":
             content = analyzer.generate_report_html(analysis_text, data_summary, file_name)
             extension, label = "html", "HTML"
         else:  # Markdown
             report_lines = [
                 "# Data Analysis Report",
                 f"Generated: {now.strftime('%Y-%m-%d %H:%M:%S')}",
                 f"File: {file_name}",
                 "## AI Analysis:",
                 f"{analysis_text}",
                 "## Raw Data Summary:",
                 f"{data_summary}",
             ]
             content = "\n".join(report_lines) + "\n"
             extension, label = "md", "Markdown"

         filename = f"{file_base_name}_analysis_report_{timestamp}.{extension}"
         with open(filename, 'w', encoding='utf-8') as f:
             f.write(content)
         return filename, f"✅ {label} report generated successfully! File: {filename}"
     except Exception as e:
         # UI boundary: report the failure in the status box instead of raising.
         logger.error(f"Report generation error: {str(e)}")
         return None, f"❌ Error generating report: {str(e)}"
 with gr.Blocks(
     title="🚀 Smart Data Analyzer Pro",
     theme=gr.themes.Ocean(),
         text-align: center;
         background: #f8f9ff;
     }
     """
 ) as app:
     current_file_name = gr.State("")
     gr.Markdown("""
     # 🚀 Smart Data Analyzer Pro
     ### AI-Powered Excel & CSV Analysis with OpenAI gpt-oss-20b
+    Upload your data files and get instant professional insights and downloadable reports!
     """)
     with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("### ⚙️ Configuration")
             api_key_input = gr.Textbox(
                 label="🔑 Chutes API Key",
                 placeholder="sk-chutes-your-api-key-here...",
                 lines=1,
                 info="Get your free API key from chutes.ai"
             )
             file_input = gr.File(
                 label="📁 Upload Data File",
                 file_types=[".csv", ".xlsx", ".xls"],
                 file_count="single",
                 elem_classes=["upload-area"]
             )
             with gr.Row():
                 analyze_btn = gr.Button("🚀 Analyze Data", variant="primary", size="lg")
                 clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
             with gr.Group():
                 gr.Markdown("### 📊 Quick Stats")
                 file_stats = gr.Textbox(
                 )
         with gr.Column(scale=2):
             gr.Markdown("### 🎯 Analysis Results")
             analysis_output = gr.Markdown(
                 value="📋 **Ready to analyze your data!**\n\nUpload a CSV or Excel file and click 'Analyze Data' to get started.",
                 show_label=False
             )
     with gr.Tabs():
         with gr.Tab("💬 Ask Questions"):
             question_input = gr.Textbox(
                 value="<p>Upload a file to see data preview...</p>"
             )
         with gr.Tab("🔍 Raw Summary"):
             raw_summary = gr.Textbox(
                 label="Detailed Data Summary",
         with gr.Tab("💾 Export Reports"):
             gr.Markdown("### 📥 Download Your Analysis Report")
             with gr.Row():
                 format_choice = gr.Radio(
+                    choices=["HTML", "Markdown"],
                     value="HTML",
                     label="📄 Report Format",
                     info="Choose your preferred download format"
                 )
             download_btn = gr.Button("📥 Generate & Download Report", variant="primary", size="lg")
             download_status = gr.Textbox(label="Download Status", interactive=False)
             download_file = gr.File(label="📄 Download Link", visible=True)
     def update_file_stats(file):
         """Describe the uploaded file: base name, size in MB and current time.

         Args:
             file: Either an object exposing the file path as ``.name`` or a
                 plain path string. Falsy values mean nothing is uploaded.

         Returns:
             A short status string. The "Uploaded" time is the moment this
             function runs, not a timestamp read from the file.
         """
         if not file:
             return "No file uploaded"
         try:
             # Accept both upload objects (path in ``.name``) and bare paths.
             path = getattr(file, "name", file)
             file_size = os.path.getsize(path) / (1024 * 1024)
             file_name = os.path.basename(path)
         except (OSError, TypeError, AttributeError):
             # Missing/unreadable file or an unexpected upload value: this is
             # a best-effort display, so degrade to a neutral message.
             return "File information unavailable"
         return f"📄 **File**: {file_name}\n📏 **Size**: {file_size:.2f} MB\n⏰ **Uploaded**: {datetime.now().strftime('%H:%M:%S')}"
     def handle_analysis(file, api_key, user_question="", progress=gr.Progress()):
         """Run the analysis and reshape its result for the analyze button.

         Returns a 4-tuple made of elements 0, 1 and 2 of the
         ``sync_analyze_data`` result followed by element 4 when the result
         has exactly five items, or an empty string otherwise. Element 3 of
         the result is not forwarded.
         """
         outcome = sync_analyze_data(file, api_key, user_question, progress)
         trailing_value = outcome[4] if len(outcome) == 5 else ""
         return outcome[0], outcome[1], outcome[2], trailing_value
     def handle_question_analysis(file, api_key, question, progress=gr.Progress()):
         """Answer a user question about the uploaded data.

         Returns the first element of the ``sync_analyze_data`` result, or a
         prompt asking for a question when ``question`` is blank.
         """
         if question.strip():
             return sync_analyze_data(file, api_key, question, progress)[0]
         return "❓ Please enter a specific question about your data."
     analyze_btn.click(
         fn=handle_analysis,
         inputs=[file_input, api_key_input, gr.Textbox(value="", visible=False)],
+        outputs=[analysis_output, raw_summary, data_preview, current_file_name],
         show_progress=True
     )
     ask_btn.click(
         fn=handle_question_analysis,
         inputs=[file_input, api_key_input, question_input],
         show_progress=True
     )
     file_input.change(
         fn=update_file_stats,
         inputs=[file_input],
         outputs=[file_stats]
     )
     clear_btn.click(
         fn=clear_all,
         outputs=[file_input, api_key_input, question_input, analysis_output,
+                question_output, data_preview, raw_summary, current_file_name]
     )
     download_btn.click(
         fn=download_report,
         inputs=[analysis_output, raw_summary, current_file_name, format_choice],
         outputs=[download_file, download_status]
     )
     gr.Markdown("""
     ---
     ### 💡 Pro Tips for Better Analysis:
     - Use descriptive column names
     - Ask specific questions like "What drives the highest profits?" instead of "Analyze this data"
     **📥 Export Options:**
+    - **HTML**: Interactive report with embedded charts and print-to-PDF option
     - **Markdown**: Simple text format for documentation
     **⚡ Speed Optimization:**
     **🔧 Supported Formats:** CSV, XLSX, XLS | **📏 Max Size:** 50MB | **🚀 Response Time:** ~3-5 seconds
     """)
 if __name__ == "__main__":
     # Blocks.queue() returns the same app object, so configuring the queue
     # (max_size=10) and launching can be written as one chained call.
     app.queue(max_size=10).launch()