shukdevdattaEX committed on
Commit
939abbc
·
verified ·
1 Parent(s): 929709a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -156
app.py CHANGED
@@ -3,7 +3,6 @@ import pandas as pd
3
  import aiohttp
4
  import asyncio
5
  import json
6
- import io
7
  import os
8
  import numpy as np
9
  import plotly.express as px
@@ -12,10 +11,7 @@ from typing import Optional, Tuple, Dict, Any
12
  import logging
13
  from datetime import datetime
14
  import re
15
- import base64
16
- from io import BytesIO
17
- import weasyprint # For PDF generation
18
- from jinja2 import Template # For HTML templating
19
 
20
  # Configure logging
21
  logging.basicConfig(level=logging.INFO)
@@ -58,25 +54,18 @@ class EnhancedDataAnalyzer:
58
  # Create context-aware prompt
59
  if user_question:
60
  prompt = f"""You are a data analyst expert. Based on this dataset:
61
-
62
  {data_summary}
63
-
64
  User's specific question: {user_question}
65
-
66
  Provide a detailed, actionable answer with specific data points and recommendations."""
67
  else:
68
  prompt = f"""You are a senior data analyst. Analyze this dataset thoroughly:
69
-
70
  {data_summary}
71
-
72
  Provide a comprehensive analysis including:
73
-
74
  1. **Key Statistical Insights**: Most important numbers and what they mean
75
  2. **Patterns & Trends**: Notable patterns, correlations, or anomalies
76
  3. **Data Quality Assessment**: Missing values, outliers, data consistency
77
  4. **Business Intelligence**: Actionable insights and opportunities
78
  5. **Recommendations**: Specific next steps or areas to investigate
79
-
80
  Format your response with clear sections and bullet points for readability."""
81
 
82
  body = {
@@ -93,12 +82,12 @@ Format your response with clear sections and bullet points for readability."""
93
  ],
94
  "stream": True,
95
  "max_tokens": 3000,
96
- "temperature": 0.2, # Very low for consistent analysis
97
  "top_p": 0.9
98
  }
99
 
100
  try:
101
- timeout = aiohttp.ClientTimeout(total=30) # 30 second timeout
102
  async with aiohttp.ClientSession(timeout=timeout) as session:
103
  async with session.post(self.api_base_url, headers=headers, json=body) as response:
104
  if response.status == 401:
@@ -138,9 +127,7 @@ Format your response with clear sections and bullet points for readability."""
138
  try:
139
  file_extension = os.path.splitext(file_path)[1].lower()
140
 
141
- # Read file with better error handling
142
  if file_extension == '.csv':
143
- # Try different encodings
144
  for encoding in ['utf-8', 'latin-1', 'cp1252']:
145
  try:
146
  df = pd.read_csv(file_path, encoding=encoding)
@@ -154,13 +141,8 @@ Format your response with clear sections and bullet points for readability."""
154
  else:
155
  raise ValueError("Unsupported file format. Please upload CSV or Excel files.")
156
 
157
- # Clean column names
158
  df.columns = df.columns.str.strip().str.replace(r'\s+', ' ', regex=True)
159
-
160
- # Store dataframe for visualizations
161
  self.current_df = df
162
-
163
- # Generate enhanced summaries
164
  data_summary = self.generate_enhanced_summary(df)
165
  charts_html = self.generate_visualizations(df)
166
 
@@ -172,23 +154,17 @@ Format your response with clear sections and bullet points for readability."""
172
  def generate_enhanced_summary(self, df: pd.DataFrame) -> str:
173
  """Generate comprehensive data summary with statistical insights"""
174
  summary = []
175
-
176
- # Header with timestamp
177
  summary.append(f"# πŸ“Š Dataset Analysis Report")
178
  summary.append(f"**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
179
  summary.append(f"**File Size**: {df.shape[0]:,} rows Γ— {df.shape[1]} columns")
180
-
181
- # Memory usage
182
  memory_usage = df.memory_usage(deep=True).sum() / 1024**2
183
  summary.append(f"**Memory Usage**: {memory_usage:.2f} MB\n")
184
 
185
- # Data types breakdown
186
  type_counts = df.dtypes.value_counts()
187
  summary.append("## πŸ“‹ Column Types:")
188
  for dtype, count in type_counts.items():
189
  summary.append(f"- **{dtype}**: {count} columns")
190
 
191
- # Missing data analysis
192
  missing_data = df.isnull().sum()
193
  missing_pct = (missing_data / len(df) * 100).round(2)
194
  missing_summary = missing_data[missing_data > 0].sort_values(ascending=False)
@@ -201,26 +177,23 @@ Format your response with clear sections and bullet points for readability."""
201
  else:
202
  summary.append("\n## βœ… Data Quality: No missing values detected!")
203
 
204
- # Numerical analysis
205
  numeric_cols = df.select_dtypes(include=[np.number]).columns
206
  if len(numeric_cols) > 0:
207
  summary.append(f"\n## πŸ“ˆ Numerical Columns Analysis ({len(numeric_cols)} columns):")
208
- for col in numeric_cols[:10]: # Limit to first 10
209
  stats = df[col].describe()
210
  outliers = len(df[df[col] > (stats['75%'] + 1.5 * (stats['75%'] - stats['25%']))])
211
  summary.append(f"- **{col}**: ΞΌ={stats['mean']:.2f}, Οƒ={stats['std']:.2f}, outliers={outliers}")
212
 
213
- # Categorical analysis
214
  categorical_cols = df.select_dtypes(include=['object', 'category']).columns
215
  if len(categorical_cols) > 0:
216
  summary.append(f"\n## πŸ“ Categorical Columns Analysis ({len(categorical_cols)} columns):")
217
- for col in categorical_cols[:10]: # Limit to first 10
218
  unique_count = df[col].nunique()
219
  cardinality = "High" if unique_count > len(df) * 0.9 else "Medium" if unique_count > 10 else "Low"
220
  most_common = df[col].mode().iloc[0] if len(df[col].mode()) > 0 else "N/A"
221
  summary.append(f"- **{col}**: {unique_count:,} unique values ({cardinality} cardinality), Top: '{most_common}'")
222
 
223
- # Sample data with better formatting
224
  summary.append("\n## πŸ” Data Sample (First 3 Rows):")
225
  sample_df = df.head(3)
226
  for idx, row in sample_df.iterrows():
@@ -235,7 +208,6 @@ Format your response with clear sections and bullet points for readability."""
235
  charts_html = []
236
 
237
  try:
238
- # Chart 1: Data completeness analysis
239
  missing_data = df.isnull().sum()
240
  if missing_data.sum() > 0:
241
  fig = px.bar(
@@ -255,7 +227,6 @@ Format your response with clear sections and bullet points for readability."""
255
  charts_html.append(f"<h3>πŸ“Š Data Quality Overview</h3>")
256
  charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="missing_data_chart"))
257
 
258
- # Chart 2: Numerical columns correlation heatmap
259
  numeric_cols = df.select_dtypes(include=[np.number]).columns
260
  if len(numeric_cols) > 1:
261
  corr_matrix = df[numeric_cols].corr()
@@ -270,9 +241,8 @@ Format your response with clear sections and bullet points for readability."""
270
  charts_html.append(f"<h3>πŸ“ˆ Correlation Analysis</h3>")
271
  charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="correlation_chart"))
272
 
273
- # Chart 3: Distribution plots for numerical columns
274
  if len(numeric_cols) > 0:
275
- for i, col in enumerate(numeric_cols[:3]): # First 3 numeric columns
276
  fig = px.histogram(
277
  df,
278
  x=col,
@@ -285,11 +255,10 @@ Format your response with clear sections and bullet points for readability."""
285
  charts_html.append(f"<h3>πŸ“ˆ Data Distributions</h3>")
286
  charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id=f"dist_chart_{i}"))
287
 
288
- # Chart 4: Categorical analysis
289
  categorical_cols = df.select_dtypes(include=['object', 'category']).columns
290
  if len(categorical_cols) > 0:
291
- for i, col in enumerate(categorical_cols[:2]): # First 2 categorical columns
292
- if df[col].nunique() <= 20: # Only if reasonable number of categories
293
  value_counts = df[col].value_counts().head(10)
294
  fig = px.bar(
295
  x=value_counts.values,
@@ -303,7 +272,6 @@ Format your response with clear sections and bullet points for readability."""
303
  charts_html.append(f"<h3>πŸ“ Categorical Data Analysis</h3>")
304
  charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id=f"cat_chart_{i}"))
305
 
306
- # Chart 5: Data overview summary
307
  summary_data = {
308
  'Metric': ['Total Rows', 'Total Columns', 'Numeric Columns', 'Categorical Columns', 'Missing Values'],
309
  'Count': [
@@ -327,9 +295,7 @@ Format your response with clear sections and bullet points for readability."""
327
  charts_html.append(f"<h3>πŸ“Š Dataset Overview</h3>")
328
  charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="overview_chart"))
329
 
330
- # Store charts for export
331
  self.current_charts = charts_html
332
-
333
  return "\n".join(charts_html) if charts_html else "<p>No charts could be generated for this dataset.</p>"
334
 
335
  except Exception as e:
@@ -337,8 +303,7 @@ Format your response with clear sections and bullet points for readability."""
337
  return f"<p>❌ Chart generation failed: {str(e)}</p>"
338
 
339
  def generate_report_html(self, analysis_text: str, data_summary: str, file_name: str = "Unknown") -> str:
340
- """Generate HTML report with embedded charts"""
341
-
342
  html_template = """
343
  <!DOCTYPE html>
344
  <html>
@@ -377,7 +342,11 @@ Format your response with clear sections and bullet points for readability."""
377
  border-radius: 8px;
378
  border-left: 4px solid #667eea;
379
  }
380
- h1, h2, h3 { color: #2c3e50; }
 
 
 
 
381
  .metadata {
382
  background: #e8f4f8;
383
  padding: 15px;
@@ -398,8 +367,43 @@ Format your response with clear sections and bullet points for readability."""
398
  border-radius: 5px;
399
  overflow-x: auto;
400
  white-space: pre-wrap;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
401
  }
402
  </style>
 
 
 
 
 
403
  </head>
404
  <body>
405
  <div class="header">
@@ -415,6 +419,7 @@ Format your response with clear sections and bullet points for readability."""
415
 
416
  <div class="section">
417
  <h2>🎯 AI Analysis & Insights</h2>
 
418
  <div>{{ ai_analysis }}</div>
419
  </div>
420
 
@@ -439,13 +444,7 @@ Format your response with clear sections and bullet points for readability."""
439
  """
440
 
441
  template = Template(html_template)
442
-
443
- # Convert markdown to HTML for AI analysis
444
- ai_analysis_html = analysis_text.replace('\n', '<br>')
445
- ai_analysis_html = re.sub(r'\*\*(.*?)\*\*', r'<strong>\1</strong>', ai_analysis_html)
446
- ai_analysis_html = re.sub(r'## (.*?)\n', r'<h3>\1</h3>', ai_analysis_html)
447
- ai_analysis_html = re.sub(r'# (.*?)\n', r'<h2>\1</h2>', ai_analysis_html)
448
-
449
  charts_content = "\n".join(self.current_charts) if self.current_charts else "<p>No visualizations available</p>"
450
 
451
  return template.render(
@@ -456,52 +455,37 @@ Format your response with clear sections and bullet points for readability."""
456
  data_summary=data_summary
457
  )
458
 
459
- # Initialize the analyzer
460
  analyzer = EnhancedDataAnalyzer()
461
 
462
  async def analyze_data(file, api_key, user_question="", progress=gr.Progress()):
463
- """Enhanced analysis function with progress tracking"""
464
  if not file:
465
  return "❌ Please upload a CSV or Excel file.", "", "", "", None
466
 
467
  if not analyzer.validate_api_key(api_key):
468
  return "❌ Please enter a valid Chutes API key (minimum 10 characters).", "", "", "", None
469
 
470
- # Validate file
471
  is_valid, validation_msg = analyzer.validate_file(file)
472
  if not is_valid:
473
  return f"❌ {validation_msg}", "", "", "", None
474
 
475
  progress(0.1, desc="πŸ“ Reading file...")
476
-
477
  try:
478
- # Process the uploaded file
479
  df, data_summary, charts_html = analyzer.process_file(file.name)
480
  progress(0.3, desc="πŸ“Š Processing data...")
481
-
482
  progress(0.5, desc="πŸ€– Generating AI insights...")
483
-
484
- # Get AI analysis
485
  ai_analysis = await analyzer.analyze_with_chutes(api_key, data_summary, user_question)
486
  progress(0.9, desc="✨ Finalizing results...")
487
 
488
- # Format the complete response
489
  response = f"""# 🎯 Analysis Complete!
490
-
491
  {ai_analysis}
492
-
493
  ---
494
  *Analysis powered by OpenAI gpt-oss-20b via Chutes β€’ Generated at {datetime.now().strftime('%H:%M:%S')}*
495
  """
496
-
497
- # Generate data preview
498
  data_preview_html = df.head(15).to_html(
499
  classes="table table-striped table-hover",
500
  table_id="data-preview-table",
501
  escape=False
502
  )
503
-
504
- # Add some styling to the preview
505
  styled_preview = f"""
506
  <style>
507
  #data-preview-table {{
@@ -536,17 +520,14 @@ async def analyze_data(file, api_key, user_question="", progress=gr.Progress()):
536
  return f"❌ **Error**: {str(e)}", "", "", "", None
537
 
538
  def sync_analyze_data(file, api_key, user_question="", progress=gr.Progress()):
539
- """Synchronous wrapper for the async analyze function"""
540
  return asyncio.run(analyze_data(file, api_key, user_question, progress))
541
 
542
  def clear_all():
543
- """Clear all inputs and outputs"""
544
  analyzer.current_df = None
545
  analyzer.current_charts = None
546
  return None, "", "", "", "", "", "", None
547
 
548
  def download_report(analysis_text, data_summary, file_name, format_choice):
549
- """Generate downloadable report in PDF or HTML format"""
550
  if not analysis_text:
551
  return None, "❌ No analysis data available for download."
552
 
@@ -555,47 +536,30 @@ def download_report(analysis_text, data_summary, file_name, format_choice):
555
 
556
  try:
557
  if format_choice == "HTML":
558
- # Generate HTML report
559
  html_content = analyzer.generate_report_html(analysis_text, data_summary, file_name)
560
  filename = f"{file_base_name}_analysis_report_{timestamp}.html"
561
-
562
  with open(filename, 'w', encoding='utf-8') as f:
563
  f.write(html_content)
564
-
565
  return filename, f"βœ… HTML report generated successfully! File: {filename}"
566
 
567
- elif format_choice == "PDF":
568
- # Generate PDF report
569
- html_content = analyzer.generate_report_html(analysis_text, data_summary, file_name)
570
- filename = f"{file_base_name}_analysis_report_{timestamp}.pdf"
571
-
572
- # Convert HTML to PDF using weasyprint
573
- weasyprint.HTML(string=html_content).write_pdf(filename)
574
-
575
- return filename, f"βœ… PDF report generated successfully! File: {filename}"
576
-
577
- else: # Markdown fallback
578
  report = f"""# Data Analysis Report
579
  Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
580
  File: {file_name}
581
-
582
  ## AI Analysis:
583
  {analysis_text}
584
-
585
  ## Raw Data Summary:
586
  {data_summary}
587
  """
588
  filename = f"{file_base_name}_analysis_report_{timestamp}.md"
589
  with open(filename, 'w', encoding='utf-8') as f:
590
  f.write(report)
591
-
592
  return filename, f"βœ… Markdown report generated successfully! File: {filename}"
593
 
594
  except Exception as e:
595
  logger.error(f"Report generation error: {str(e)}")
596
  return None, f"❌ Error generating report: {str(e)}"
597
 
598
- # Create enhanced Gradio interface
599
  with gr.Blocks(
600
  title="πŸš€ Smart Data Analyzer Pro",
601
  theme=gr.themes.Ocean(),
@@ -613,33 +577,20 @@ with gr.Blocks(
613
  text-align: center;
614
  background: #f8f9ff;
615
  }
616
- .charts-container {
617
- max-height: 800px;
618
- overflow-y: auto;
619
- padding: 10px;
620
- background: #fafafa;
621
- border-radius: 8px;
622
- }
623
  """
624
  ) as app:
625
-
626
- # Store file name for downloads
627
  current_file_name = gr.State("")
628
 
629
- # Header
630
  gr.Markdown("""
631
  # πŸš€ Smart Data Analyzer Pro
632
  ### AI-Powered Excel & CSV Analysis with OpenAI gpt-oss-20b
633
 
634
- Upload your data files and get instant professional insights, visualizations, and downloadable reports!
635
  """)
636
 
637
- # Main interface
638
  with gr.Row():
639
  with gr.Column(scale=1):
640
- # Configuration section
641
  gr.Markdown("### βš™οΈ Configuration")
642
-
643
  api_key_input = gr.Textbox(
644
  label="πŸ”‘ Chutes API Key",
645
  placeholder="sk-chutes-your-api-key-here...",
@@ -647,19 +598,15 @@ with gr.Blocks(
647
  lines=1,
648
  info="Get your free API key from chutes.ai"
649
  )
650
-
651
  file_input = gr.File(
652
  label="πŸ“ Upload Data File",
653
  file_types=[".csv", ".xlsx", ".xls"],
654
  file_count="single",
655
  elem_classes=["upload-area"]
656
  )
657
-
658
  with gr.Row():
659
  analyze_btn = gr.Button("πŸš€ Analyze Data", variant="primary", size="lg")
660
  clear_btn = gr.Button("πŸ—‘οΈ Clear All", variant="secondary")
661
-
662
- # Quick stats display
663
  with gr.Group():
664
  gr.Markdown("### πŸ“Š Quick Stats")
665
  file_stats = gr.Textbox(
@@ -670,15 +617,12 @@ with gr.Blocks(
670
  )
671
 
672
  with gr.Column(scale=2):
673
- # Results section
674
  gr.Markdown("### 🎯 Analysis Results")
675
-
676
  analysis_output = gr.Markdown(
677
  value="πŸ“‹ **Ready to analyze your data!**\n\nUpload a CSV or Excel file and click 'Analyze Data' to get started.",
678
  show_label=False
679
  )
680
 
681
- # Advanced features in tabs
682
  with gr.Tabs():
683
  with gr.Tab("πŸ’¬ Ask Questions"):
684
  question_input = gr.Textbox(
@@ -695,14 +639,6 @@ with gr.Blocks(
695
  value="<p>Upload a file to see data preview...</p>"
696
  )
697
 
698
- with gr.Tab("πŸ“ˆ Visualizations"):
699
- charts_output = gr.HTML(
700
- label="Auto-Generated Charts",
701
- value="<div class='charts-container'><p>πŸ“Š Interactive charts will appear here after analysis...</p></div>",
702
- elem_classes=["charts-container"],
703
- visible=False
704
- )
705
-
706
  with gr.Tab("πŸ” Raw Summary"):
707
  raw_summary = gr.Textbox(
708
  label="Detailed Data Summary",
@@ -713,56 +649,47 @@ with gr.Blocks(
713
 
714
  with gr.Tab("πŸ’Ύ Export Reports"):
715
  gr.Markdown("### πŸ“₯ Download Your Analysis Report")
716
-
717
  with gr.Row():
718
  format_choice = gr.Radio(
719
- choices=["HTML", "PDF", "Markdown"],
720
  value="HTML",
721
  label="πŸ“„ Report Format",
722
  info="Choose your preferred download format"
723
  )
724
-
725
  download_btn = gr.Button("πŸ“₯ Generate & Download Report", variant="primary", size="lg")
726
  download_status = gr.Textbox(label="Download Status", interactive=False)
727
  download_file = gr.File(label="πŸ“„ Download Link", visible=True)
728
 
729
- # Event handlers
730
  def update_file_stats(file):
731
  if not file:
732
  return "No file uploaded"
733
-
734
  try:
735
- file_size = os.path.getsize(file.name) / (1024 * 1024) # MB
736
  file_name = os.path.basename(file.name)
737
  return f"πŸ“„ **File**: {file_name}\nπŸ“ **Size**: {file_size:.2f} MB\n⏰ **Uploaded**: {datetime.now().strftime('%H:%M:%S')}"
738
  except:
739
  return "File information unavailable"
740
 
741
  def handle_analysis(file, api_key, user_question="", progress=gr.Progress()):
742
- """Handle main analysis and return all outputs including file name"""
743
  result = sync_analyze_data(file, api_key, user_question, progress)
744
- if len(result) == 5: # Check if file name was returned
745
- return result[0], result[1], result[2], result[3], result[4] # analysis, summary, preview, charts, filename
746
  else:
747
- return result[0], result[1], result[2], result[3], "" # fallback without filename
748
 
749
  def handle_question_analysis(file, api_key, question, progress=gr.Progress()):
750
- """Handle question-specific analysis"""
751
  if not question.strip():
752
  return "❓ Please enter a specific question about your data."
753
-
754
  result = sync_analyze_data(file, api_key, question, progress)
755
- return result[0] # Return only the analysis output
756
 
757
- # Main analysis event
758
  analyze_btn.click(
759
  fn=handle_analysis,
760
  inputs=[file_input, api_key_input, gr.Textbox(value="", visible=False)],
761
- outputs=[analysis_output, raw_summary, data_preview, charts_output, current_file_name],
762
  show_progress=True
763
  )
764
 
765
- # Follow-up questions
766
  ask_btn.click(
767
  fn=handle_question_analysis,
768
  inputs=[file_input, api_key_input, question_input],
@@ -770,28 +697,24 @@ with gr.Blocks(
770
  show_progress=True
771
  )
772
 
773
- # File stats update
774
  file_input.change(
775
  fn=update_file_stats,
776
  inputs=[file_input],
777
  outputs=[file_stats]
778
  )
779
 
780
- # Clear functionality
781
  clear_btn.click(
782
  fn=clear_all,
783
  outputs=[file_input, api_key_input, question_input, analysis_output,
784
- question_output, data_preview, charts_output, raw_summary]
785
  )
786
 
787
- # Enhanced download functionality
788
  download_btn.click(
789
  fn=download_report,
790
  inputs=[analysis_output, raw_summary, current_file_name, format_choice],
791
  outputs=[download_file, download_status]
792
  )
793
 
794
- # Footer with usage tips
795
  gr.Markdown("""
796
  ---
797
  ### πŸ’‘ Pro Tips for Better Analysis:
@@ -801,16 +724,8 @@ with gr.Blocks(
801
  - Use descriptive column names
802
  - Ask specific questions like "What drives the highest profits?" instead of "Analyze this data"
803
 
804
- **πŸ“Š Visualizations Include:**
805
- - Missing data analysis
806
- - Correlation matrices for numerical data
807
- - Distribution plots and histograms
808
- - Top categories for categorical data
809
- - Dataset overview metrics
810
-
811
  **πŸ“₯ Export Options:**
812
- - **HTML**: Interactive report with embedded charts
813
- - **PDF**: Professional report for presentations
814
  - **Markdown**: Simple text format for documentation
815
 
816
  **⚑ Speed Optimization:**
@@ -821,13 +736,6 @@ with gr.Blocks(
821
  **πŸ”§ Supported Formats:** CSV, XLSX, XLS | **πŸ“ Max Size:** 50MB | **πŸš€ Response Time:** ~3-5 seconds
822
  """)
823
 
824
- def sync_analyze_data(file, api_key, user_question="", progress=gr.Progress()):
825
- """Synchronous wrapper for the async analyze function"""
826
- return asyncio.run(analyze_data(file, api_key, user_question, progress))
827
-
828
- # Launch configuration
829
  if __name__ == "__main__":
830
- app.queue(max_size=10) # Handle multiple users
831
- app.launch(
832
- share=True
833
- )
 
3
  import aiohttp
4
  import asyncio
5
  import json
 
6
  import os
7
  import numpy as np
8
  import plotly.express as px
 
11
  import logging
12
  from datetime import datetime
13
  import re
14
+ from jinja2 import Template
 
 
 
15
 
16
  # Configure logging
17
  logging.basicConfig(level=logging.INFO)
 
54
  # Create context-aware prompt
55
  if user_question:
56
  prompt = f"""You are a data analyst expert. Based on this dataset:
 
57
  {data_summary}
 
58
  User's specific question: {user_question}
 
59
  Provide a detailed, actionable answer with specific data points and recommendations."""
60
  else:
61
  prompt = f"""You are a senior data analyst. Analyze this dataset thoroughly:
 
62
  {data_summary}
 
63
  Provide a comprehensive analysis including:
 
64
  1. **Key Statistical Insights**: Most important numbers and what they mean
65
  2. **Patterns & Trends**: Notable patterns, correlations, or anomalies
66
  3. **Data Quality Assessment**: Missing values, outliers, data consistency
67
  4. **Business Intelligence**: Actionable insights and opportunities
68
  5. **Recommendations**: Specific next steps or areas to investigate
 
69
  Format your response with clear sections and bullet points for readability."""
70
 
71
  body = {
 
82
  ],
83
  "stream": True,
84
  "max_tokens": 3000,
85
+ "temperature": 0.2,
86
  "top_p": 0.9
87
  }
88
 
89
  try:
90
+ timeout = aiohttp.ClientTimeout(total=30)
91
  async with aiohttp.ClientSession(timeout=timeout) as session:
92
  async with session.post(self.api_base_url, headers=headers, json=body) as response:
93
  if response.status == 401:
 
127
  try:
128
  file_extension = os.path.splitext(file_path)[1].lower()
129
 
 
130
  if file_extension == '.csv':
 
131
  for encoding in ['utf-8', 'latin-1', 'cp1252']:
132
  try:
133
  df = pd.read_csv(file_path, encoding=encoding)
 
141
  else:
142
  raise ValueError("Unsupported file format. Please upload CSV or Excel files.")
143
 
 
144
  df.columns = df.columns.str.strip().str.replace(r'\s+', ' ', regex=True)
 
 
145
  self.current_df = df
 
 
146
  data_summary = self.generate_enhanced_summary(df)
147
  charts_html = self.generate_visualizations(df)
148
 
 
154
  def generate_enhanced_summary(self, df: pd.DataFrame) -> str:
155
  """Generate comprehensive data summary with statistical insights"""
156
  summary = []
 
 
157
  summary.append(f"# πŸ“Š Dataset Analysis Report")
158
  summary.append(f"**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
159
  summary.append(f"**File Size**: {df.shape[0]:,} rows Γ— {df.shape[1]} columns")
 
 
160
  memory_usage = df.memory_usage(deep=True).sum() / 1024**2
161
  summary.append(f"**Memory Usage**: {memory_usage:.2f} MB\n")
162
 
 
163
  type_counts = df.dtypes.value_counts()
164
  summary.append("## πŸ“‹ Column Types:")
165
  for dtype, count in type_counts.items():
166
  summary.append(f"- **{dtype}**: {count} columns")
167
 
 
168
  missing_data = df.isnull().sum()
169
  missing_pct = (missing_data / len(df) * 100).round(2)
170
  missing_summary = missing_data[missing_data > 0].sort_values(ascending=False)
 
177
  else:
178
  summary.append("\n## βœ… Data Quality: No missing values detected!")
179
 
 
180
  numeric_cols = df.select_dtypes(include=[np.number]).columns
181
  if len(numeric_cols) > 0:
182
  summary.append(f"\n## πŸ“ˆ Numerical Columns Analysis ({len(numeric_cols)} columns):")
183
+ for col in numeric_cols[:10]:
184
  stats = df[col].describe()
185
  outliers = len(df[df[col] > (stats['75%'] + 1.5 * (stats['75%'] - stats['25%']))])
186
  summary.append(f"- **{col}**: ΞΌ={stats['mean']:.2f}, Οƒ={stats['std']:.2f}, outliers={outliers}")
187
 
 
188
  categorical_cols = df.select_dtypes(include=['object', 'category']).columns
189
  if len(categorical_cols) > 0:
190
  summary.append(f"\n## πŸ“ Categorical Columns Analysis ({len(categorical_cols)} columns):")
191
+ for col in categorical_cols[:10]:
192
  unique_count = df[col].nunique()
193
  cardinality = "High" if unique_count > len(df) * 0.9 else "Medium" if unique_count > 10 else "Low"
194
  most_common = df[col].mode().iloc[0] if len(df[col].mode()) > 0 else "N/A"
195
  summary.append(f"- **{col}**: {unique_count:,} unique values ({cardinality} cardinality), Top: '{most_common}'")
196
 
 
197
  summary.append("\n## πŸ” Data Sample (First 3 Rows):")
198
  sample_df = df.head(3)
199
  for idx, row in sample_df.iterrows():
 
208
  charts_html = []
209
 
210
  try:
 
211
  missing_data = df.isnull().sum()
212
  if missing_data.sum() > 0:
213
  fig = px.bar(
 
227
  charts_html.append(f"<h3>πŸ“Š Data Quality Overview</h3>")
228
  charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="missing_data_chart"))
229
 
 
230
  numeric_cols = df.select_dtypes(include=[np.number]).columns
231
  if len(numeric_cols) > 1:
232
  corr_matrix = df[numeric_cols].corr()
 
241
  charts_html.append(f"<h3>πŸ“ˆ Correlation Analysis</h3>")
242
  charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="correlation_chart"))
243
 
 
244
  if len(numeric_cols) > 0:
245
+ for i, col in enumerate(numeric_cols[:3]):
246
  fig = px.histogram(
247
  df,
248
  x=col,
 
255
  charts_html.append(f"<h3>πŸ“ˆ Data Distributions</h3>")
256
  charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id=f"dist_chart_{i}"))
257
 
 
258
  categorical_cols = df.select_dtypes(include=['object', 'category']).columns
259
  if len(categorical_cols) > 0:
260
+ for i, col in enumerate(categorical_cols[:2]):
261
+ if df[col].nunique() <= 20:
262
  value_counts = df[col].value_counts().head(10)
263
  fig = px.bar(
264
  x=value_counts.values,
 
272
  charts_html.append(f"<h3>πŸ“ Categorical Data Analysis</h3>")
273
  charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id=f"cat_chart_{i}"))
274
 
 
275
  summary_data = {
276
  'Metric': ['Total Rows', 'Total Columns', 'Numeric Columns', 'Categorical Columns', 'Missing Values'],
277
  'Count': [
 
295
  charts_html.append(f"<h3>πŸ“Š Dataset Overview</h3>")
296
  charts_html.append(fig.to_html(include_plotlyjs='cdn', div_id="overview_chart"))
297
 
 
298
  self.current_charts = charts_html
 
299
  return "\n".join(charts_html) if charts_html else "<p>No charts could be generated for this dataset.</p>"
300
 
301
  except Exception as e:
 
303
  return f"<p>❌ Chart generation failed: {str(e)}</p>"
304
 
305
  def generate_report_html(self, analysis_text: str, data_summary: str, file_name: str = "Unknown") -> str:
306
+ """Generate HTML report with embedded charts and print button"""
 
307
  html_template = """
308
  <!DOCTYPE html>
309
  <html>
 
342
  border-radius: 8px;
343
  border-left: 4px solid #667eea;
344
  }
345
+ h1, h2, h3 {
346
+ color: #2c3e50;
347
+ margin-top: 20px;
348
+ margin-bottom: 15px;
349
+ }
350
  .metadata {
351
  background: #e8f4f8;
352
  padding: 15px;
 
367
  border-radius: 5px;
368
  overflow-x: auto;
369
  white-space: pre-wrap;
370
+ font-size: 14px;
371
+ }
372
+ strong {
373
+ color: #2c3e50;
374
+ font-weight: 600;
375
+ }
376
+ .print-button {
377
+ background: #667eea;
378
+ color: white;
379
+ padding: 10px 20px;
380
+ border: none;
381
+ border-radius: 5px;
382
+ cursor: pointer;
383
+ font-size: 16px;
384
+ margin: 10px 0;
385
+ display: inline-block;
386
+ }
387
+ .print-button:hover {
388
+ background: #764ba2;
389
+ }
390
+ @media print {
391
+ .print-button {
392
+ display: none;
393
+ }
394
+ body {
395
+ background: white;
396
+ }
397
+ .section, .metadata, .footer {
398
+ box-shadow: none;
399
+ }
400
  }
401
  </style>
402
+ <script>
403
+ function printReport() {
404
+ window.print();
405
+ }
406
+ </script>
407
  </head>
408
  <body>
409
  <div class="header">
 
419
 
420
  <div class="section">
421
  <h2>🎯 AI Analysis & Insights</h2>
422
+ <button class="print-button" onclick="printReport()">πŸ–¨οΈ Print as PDF</button>
423
  <div>{{ ai_analysis }}</div>
424
  </div>
425
 
 
444
  """
445
 
446
  template = Template(html_template)
447
+ ai_analysis_html = analysis_text
 
 
 
 
 
 
448
  charts_content = "\n".join(self.current_charts) if self.current_charts else "<p>No visualizations available</p>"
449
 
450
  return template.render(
 
455
  data_summary=data_summary
456
  )
457
 
 
458
  analyzer = EnhancedDataAnalyzer()
459
 
460
  async def analyze_data(file, api_key, user_question="", progress=gr.Progress()):
 
461
  if not file:
462
  return "❌ Please upload a CSV or Excel file.", "", "", "", None
463
 
464
  if not analyzer.validate_api_key(api_key):
465
  return "❌ Please enter a valid Chutes API key (minimum 10 characters).", "", "", "", None
466
 
 
467
  is_valid, validation_msg = analyzer.validate_file(file)
468
  if not is_valid:
469
  return f"❌ {validation_msg}", "", "", "", None
470
 
471
  progress(0.1, desc="πŸ“ Reading file...")
 
472
  try:
 
473
  df, data_summary, charts_html = analyzer.process_file(file.name)
474
  progress(0.3, desc="πŸ“Š Processing data...")
 
475
  progress(0.5, desc="πŸ€– Generating AI insights...")
 
 
476
  ai_analysis = await analyzer.analyze_with_chutes(api_key, data_summary, user_question)
477
  progress(0.9, desc="✨ Finalizing results...")
478
 
 
479
  response = f"""# 🎯 Analysis Complete!
 
480
  {ai_analysis}
 
481
  ---
482
  *Analysis powered by OpenAI gpt-oss-20b via Chutes β€’ Generated at {datetime.now().strftime('%H:%M:%S')}*
483
  """
 
 
484
  data_preview_html = df.head(15).to_html(
485
  classes="table table-striped table-hover",
486
  table_id="data-preview-table",
487
  escape=False
488
  )
 
 
489
  styled_preview = f"""
490
  <style>
491
  #data-preview-table {{
 
520
  return f"❌ **Error**: {str(e)}", "", "", "", None
521
 
522
  def sync_analyze_data(file, api_key, user_question="", progress=gr.Progress()):
 
523
  return asyncio.run(analyze_data(file, api_key, user_question, progress))
524
 
525
  def clear_all():
 
526
  analyzer.current_df = None
527
  analyzer.current_charts = None
528
  return None, "", "", "", "", "", "", None
529
 
530
  def download_report(analysis_text, data_summary, file_name, format_choice):
 
531
  if not analysis_text:
532
  return None, "❌ No analysis data available for download."
533
 
 
536
 
537
  try:
538
  if format_choice == "HTML":
 
539
  html_content = analyzer.generate_report_html(analysis_text, data_summary, file_name)
540
  filename = f"{file_base_name}_analysis_report_{timestamp}.html"
 
541
  with open(filename, 'w', encoding='utf-8') as f:
542
  f.write(html_content)
 
543
  return filename, f"βœ… HTML report generated successfully! File: {filename}"
544
 
545
+ else: # Markdown
 
 
 
 
 
 
 
 
 
 
546
  report = f"""# Data Analysis Report
547
  Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
548
  File: {file_name}
 
549
  ## AI Analysis:
550
  {analysis_text}
 
551
  ## Raw Data Summary:
552
  {data_summary}
553
  """
554
  filename = f"{file_base_name}_analysis_report_{timestamp}.md"
555
  with open(filename, 'w', encoding='utf-8') as f:
556
  f.write(report)
 
557
  return filename, f"βœ… Markdown report generated successfully! File: {filename}"
558
 
559
  except Exception as e:
560
  logger.error(f"Report generation error: {str(e)}")
561
  return None, f"❌ Error generating report: {str(e)}"
562
 
 
563
  with gr.Blocks(
564
  title="πŸš€ Smart Data Analyzer Pro",
565
  theme=gr.themes.Ocean(),
 
577
  text-align: center;
578
  background: #f8f9ff;
579
  }
 
 
 
 
 
 
 
580
  """
581
  ) as app:
 
 
582
  current_file_name = gr.State("")
583
 
 
584
  gr.Markdown("""
585
  # πŸš€ Smart Data Analyzer Pro
586
  ### AI-Powered Excel & CSV Analysis with OpenAI gpt-oss-20b
587
 
588
+ Upload your data files and get instant professional insights and downloadable reports!
589
  """)
590
 
 
591
  with gr.Row():
592
  with gr.Column(scale=1):
 
593
  gr.Markdown("### βš™οΈ Configuration")
 
594
  api_key_input = gr.Textbox(
595
  label="πŸ”‘ Chutes API Key",
596
  placeholder="sk-chutes-your-api-key-here...",
 
598
  lines=1,
599
  info="Get your free API key from chutes.ai"
600
  )
 
601
  file_input = gr.File(
602
  label="πŸ“ Upload Data File",
603
  file_types=[".csv", ".xlsx", ".xls"],
604
  file_count="single",
605
  elem_classes=["upload-area"]
606
  )
 
607
  with gr.Row():
608
  analyze_btn = gr.Button("πŸš€ Analyze Data", variant="primary", size="lg")
609
  clear_btn = gr.Button("πŸ—‘οΈ Clear All", variant="secondary")
 
 
610
  with gr.Group():
611
  gr.Markdown("### πŸ“Š Quick Stats")
612
  file_stats = gr.Textbox(
 
617
  )
618
 
619
  with gr.Column(scale=2):
 
620
  gr.Markdown("### 🎯 Analysis Results")
 
621
  analysis_output = gr.Markdown(
622
  value="πŸ“‹ **Ready to analyze your data!**\n\nUpload a CSV or Excel file and click 'Analyze Data' to get started.",
623
  show_label=False
624
  )
625
 
 
626
  with gr.Tabs():
627
  with gr.Tab("πŸ’¬ Ask Questions"):
628
  question_input = gr.Textbox(
 
639
  value="<p>Upload a file to see data preview...</p>"
640
  )
641
 
 
 
 
 
 
 
 
 
642
  with gr.Tab("πŸ” Raw Summary"):
643
  raw_summary = gr.Textbox(
644
  label="Detailed Data Summary",
 
649
 
650
  with gr.Tab("πŸ’Ύ Export Reports"):
651
  gr.Markdown("### πŸ“₯ Download Your Analysis Report")
 
652
  with gr.Row():
653
  format_choice = gr.Radio(
654
+ choices=["HTML", "Markdown"],
655
  value="HTML",
656
  label="πŸ“„ Report Format",
657
  info="Choose your preferred download format"
658
  )
 
659
  download_btn = gr.Button("πŸ“₯ Generate & Download Report", variant="primary", size="lg")
660
  download_status = gr.Textbox(label="Download Status", interactive=False)
661
  download_file = gr.File(label="πŸ“„ Download Link", visible=True)
662
 
 
663
  def update_file_stats(file):
664
  if not file:
665
  return "No file uploaded"
 
666
  try:
667
+ file_size = os.path.getsize(file.name) / (1024 * 1024)
668
  file_name = os.path.basename(file.name)
669
  return f"πŸ“„ **File**: {file_name}\nπŸ“ **Size**: {file_size:.2f} MB\n⏰ **Uploaded**: {datetime.now().strftime('%H:%M:%S')}"
670
  except:
671
  return "File information unavailable"
672
 
673
  def handle_analysis(file, api_key, user_question="", progress=gr.Progress()):
 
674
  result = sync_analyze_data(file, api_key, user_question, progress)
675
+ if len(result) == 5:
676
+ return result[0], result[1], result[2], result[4]
677
  else:
678
+ return result[0], result[1], result[2], ""
679
 
680
  def handle_question_analysis(file, api_key, question, progress=gr.Progress()):
 
681
  if not question.strip():
682
  return "❓ Please enter a specific question about your data."
 
683
  result = sync_analyze_data(file, api_key, question, progress)
684
+ return result[0]
685
 
 
686
  analyze_btn.click(
687
  fn=handle_analysis,
688
  inputs=[file_input, api_key_input, gr.Textbox(value="", visible=False)],
689
+ outputs=[analysis_output, raw_summary, data_preview, current_file_name],
690
  show_progress=True
691
  )
692
 
 
693
  ask_btn.click(
694
  fn=handle_question_analysis,
695
  inputs=[file_input, api_key_input, question_input],
 
697
  show_progress=True
698
  )
699
 
 
700
  file_input.change(
701
  fn=update_file_stats,
702
  inputs=[file_input],
703
  outputs=[file_stats]
704
  )
705
 
 
706
  clear_btn.click(
707
  fn=clear_all,
708
  outputs=[file_input, api_key_input, question_input, analysis_output,
709
+ question_output, data_preview, raw_summary, current_file_name]
710
  )
711
 
 
712
  download_btn.click(
713
  fn=download_report,
714
  inputs=[analysis_output, raw_summary, current_file_name, format_choice],
715
  outputs=[download_file, download_status]
716
  )
717
 
 
718
  gr.Markdown("""
719
  ---
720
  ### πŸ’‘ Pro Tips for Better Analysis:
 
724
  - Use descriptive column names
725
  - Ask specific questions like "What drives the highest profits?" instead of "Analyze this data"
726
 
 
 
 
 
 
 
 
727
  **πŸ“₯ Export Options:**
728
+ - **HTML**: Interactive report with embedded charts and print-to-PDF option
 
729
  - **Markdown**: Simple text format for documentation
730
 
731
  **⚑ Speed Optimization:**
 
736
  **πŸ”§ Supported Formats:** CSV, XLSX, XLS | **πŸ“ Max Size:** 50MB | **πŸš€ Response Time:** ~3-5 seconds
737
  """)
738
 
 
 
 
 
 
739
  if __name__ == "__main__":
740
+ app.queue(max_size=10)
741
+ app.launch()