rairo commited on
Commit
86cffd3
·
verified ·
1 Parent(s): deb9a1d

Update sozo_gen.py

Browse files
Files changed (1) hide show
  1. sozo_gen.py +562 -9
sozo_gen.py CHANGED
@@ -269,11 +269,539 @@ def sanitize_for_firebase_key(text: str) -> str:
269
 
270
  # REPLACE THE OLD generate_report_draft WITH THIS CORRECTED VERSION
271
  def generate_report_draft(buf, name: str, ctx: str, uid: str, project_id: str, bucket):
272
- logging.info(f"Generating report draft for project {project_id}")
 
 
 
 
 
 
 
 
273
  df = load_dataframe_safely(buf, name)
 
 
274
  llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=API_KEY, temperature=0.1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
  ctx_dict = {"shape": df.shape, "columns": list(df.columns), "user_ctx": ctx}
276
  enhanced_ctx = enhance_data_context(df, ctx_dict)
 
277
  report_prompt = f"""
278
  You are a senior data analyst and business intelligence expert. Analyze the provided dataset and write a comprehensive executive-level Markdown report.
279
  **Dataset Analysis Context:** {json.dumps(enhanced_ctx, indent=2)}
@@ -284,37 +812,62 @@ def generate_report_draft(buf, name: str, ctx: str, uid: str, project_id: str, b
284
  Valid chart types: bar, pie, line, scatter, hist.
285
  Generate insights that would be valuable to C-level executives.
286
  """
 
287
  md = llm.invoke(report_prompt).content
288
  chart_descs = extract_chart_tags(md)[:MAX_CHARTS]
289
  chart_urls = {}
290
  chart_generator = ChartGenerator(llm, df)
291
 
292
  for desc in chart_descs:
293
- # Create a safe key for Firebase
294
  safe_desc = sanitize_for_firebase_key(desc)
295
-
296
- # Replace the original description in the markdown with the safe one
297
  md = md.replace(f'<generate_chart: "{desc}">', f'<generate_chart: "{safe_desc}">')
298
- md = md.replace(f'<generate_chart: {desc}>', f'<generate_chart: "{safe_desc}">') # Handle no quotes case
299
 
300
  with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
301
  img_path = Path(temp_file.name)
302
  try:
303
- chart_spec = chart_generator.generate_chart_spec(desc) # Still generate spec from original desc
304
  if execute_chart_spec(chart_spec, df, img_path):
305
  blob_name = f"sozo_projects/{uid}/{project_id}/charts/{uuid.uuid4().hex}.png"
306
  blob = bucket.blob(blob_name)
307
  blob.upload_from_filename(str(img_path))
308
-
309
- # Use the safe key in the dictionary
310
  chart_urls[safe_desc] = blob.public_url
311
- logging.info(f"Uploaded chart '{desc}' to {blob.public_url} with safe key '{safe_desc}'")
312
  finally:
313
  if os.path.exists(img_path):
314
  os.unlink(img_path)
315
 
316
  return {"raw_md": md, "chartUrls": chart_urls}
317
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
  def generate_single_chart(df: pd.DataFrame, description: str, uid: str, project_id: str, bucket):
319
  logging.info(f"Generating single chart '{description}' for project {project_id}")
320
  llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=API_KEY, temperature=0.1)
 
269
 
270
  # REPLACE THE OLD generate_report_draft WITH THIS CORRECTED VERSION
271
def generate_report_draft(buf, name: str, ctx: str, uid: str, project_id: str, bucket):
    """
    Analyze an uploaded dataset and produce a Markdown report plus supporting
    chart images uploaded to the given storage bucket.

    Backward compatible with the legacy implementation: identical signature
    and the same ``{"raw_md": ..., "chartUrls": ...}`` return shape. Any
    failure in the enhanced pipeline falls back to the legacy report path.
    """
    logging.info(f"Generating enhanced autonomous report draft for project {project_id}")

    # Load the uploaded buffer into a DataFrame and set up the model.
    frame = load_dataframe_safely(buf, name)
    model = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=API_KEY, temperature=0.1)

    try:
        # Three stages: profile the data, narrate it, then render charts
        # referenced by the narrative.
        context = perform_autonomous_data_analysis(frame, ctx, name)
        report_md = generate_intelligent_report(model, context)
        urls = generate_autonomous_charts(model, frame, report_md, uid, project_id, bucket)
        return {"raw_md": report_md, "chartUrls": urls}
    except Exception as e:
        logging.error(f"Enhanced analysis failed, falling back to original: {str(e)}")
        return generate_original_report(frame, model, ctx, uid, project_id, bucket)
304
+
305
+
306
def perform_autonomous_data_analysis(df: pd.DataFrame, user_ctx: str, filename: str) -> Dict[str, Any]:
    """
    Run the full autonomous profiling pipeline over *df*.

    Aggregates basic metadata, domain classification, statistics,
    relationship discovery, temporal analysis, quality assessment and
    inferred business context into a single dictionary consumed by the
    report generator.
    """
    logging.info("Performing autonomous data analysis...")

    # Domain classification is computed first because two downstream
    # sections (business context, complexity) depend on it.
    domain_profile = classify_dataset_domain(df, filename)

    return {
        "basic_info": {
            "shape": df.shape,
            "columns": list(df.columns),
            "dtypes": df.dtypes.to_dict(),
            "filename": filename,
            "user_context": user_ctx,
        },
        "domain": domain_profile,
        "statistics": generate_statistical_profile(df),
        "relationships": discover_data_relationships(df),
        "temporal": analyze_temporal_patterns(df),
        "quality": assess_data_quality(df),
        "business_context": infer_business_context(df, domain_profile),
        "analysis_complexity": determine_analysis_complexity(df, domain_profile),
    }
350
+
351
+
352
def classify_dataset_domain(df: pd.DataFrame, filename: str) -> Dict[str, Any]:
    """
    Classify the dataset's business domain from column names, the filename,
    and coarse data-type ratios.

    Fixes vs. previous version:
    - when no keyword matched at all, ``max()`` over the score dict returned
      an arbitrary zero-score domain (always "financial"); the primary
      domain now falls back to "general" when the best score is 0;
    - ratio computations guard against a zero-column frame.
    """
    domain_indicators = {
        "financial": ["amount", "price", "cost", "revenue", "profit", "transaction", "payment", "invoice"],
        "survey": ["rating", "satisfaction", "response", "score", "survey", "feedback", "opinion"],
        "scientific": ["measurement", "experiment", "test", "sample", "observation", "hypothesis", "variable"],
        "marketing": ["campaign", "click", "conversion", "customer", "lead", "acquisition", "retention"],
        "operational": ["process", "time", "duration", "status", "workflow", "performance", "efficiency"],
        "sales": ["order", "product", "quantity", "sales", "customer", "deal", "pipeline"],
        "hr": ["employee", "salary", "department", "performance", "training", "recruitment"],
        "healthcare": ["patient", "diagnosis", "treatment", "medical", "health", "symptom", "medication"]
    }

    # Score each domain by keyword hits in column names.
    columns_lower = [col.lower() for col in df.columns]
    domain_scores = {
        domain: sum(1 for col in columns_lower for keyword in keywords if keyword in col)
        for domain, keywords in domain_indicators.items()
    }

    # Filename hits are weighted more heavily than a single column hit.
    filename_lower = filename.lower()
    for domain, keywords in domain_indicators.items():
        if any(keyword in filename_lower for keyword in keywords):
            domain_scores[domain] += 2

    # Coarse structural ratios; guard an empty-column frame.
    total_cols = len(df.columns)
    numeric_ratio = len(df.select_dtypes(include=[np.number]).columns) / total_cols if total_cols else 0.0
    categorical_ratio = len(df.select_dtypes(include=['object']).columns) / total_cols if total_cols else 0.0

    # Pick the best-scoring domain, but only trust it when it scored at all.
    best_domain = max(domain_scores, key=domain_scores.get) if domain_scores else "general"
    primary_domain = best_domain if domain_scores.get(best_domain, 0) > 0 else "general"

    return {
        "primary_domain": primary_domain,
        "domain_confidence": domain_scores.get(primary_domain, 0),
        "domain_scores": domain_scores,
        "data_characteristics": {
            "numeric_ratio": numeric_ratio,
            "categorical_ratio": categorical_ratio,
            "is_time_series": detect_time_series(df),
            "is_transactional": detect_transactional_data(df),
            "is_experimental": detect_experimental_data(df)
        }
    }
401
+
402
+
403
def generate_statistical_profile(df: pd.DataFrame) -> Dict[str, Any]:
    """
    Build a statistical summary of *df*.

    The profile always carries ``summary_stats``, ``correlations``,
    ``distributions``, ``outliers`` and ``missing_data``;
    ``categorical_analysis`` is added only when object-dtype columns exist.
    ``distributions`` and ``outliers`` stay empty (placeholders kept for
    output-shape compatibility).
    """
    profile: Dict[str, Any] = {
        "summary_stats": {},
        "correlations": {},
        "distributions": {},
        "outliers": {},
        "missing_data": {},
    }

    numeric_cols = df.select_dtypes(include=[np.number]).columns

    # describe() supplies count/mean/std/min/quartiles/max per numeric column.
    if len(numeric_cols) > 0:
        profile["summary_stats"] = df[numeric_cols].describe().to_dict()

    # Record each upper-triangle pair whose |correlation| exceeds 0.7.
    if len(numeric_cols) > 1:
        corr_matrix = df[numeric_cols].corr()
        names = list(corr_matrix.columns)
        strong_corrs = []
        for left in range(len(names)):
            for right in range(left + 1, len(names)):
                value = corr_matrix.iloc[left, right]
                if abs(value) > 0.7:  # strong-correlation threshold
                    strong_corrs.append({
                        "var1": names[left],
                        "var2": names[right],
                        "correlation": value,
                    })
        profile["correlations"] = {"strong_correlations": strong_corrs}

    # Cardinality and top values for each categorical column.
    categorical_cols = df.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        profile["categorical_analysis"] = {
            col: {
                "unique_count": df[col].nunique(),
                "top_values": df[col].value_counts().head(5).to_dict(),
            }
            for col in categorical_cols
        }

    # Missing-data summary, per column and as an overall percentage.
    missing_per_column = df.isnull().sum()
    profile["missing_data"] = {
        "columns_with_missing": missing_per_column[missing_per_column > 0].to_dict(),
        "total_missing_percentage": (df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100,
    }

    return profile
454
+
455
+
456
def discover_data_relationships(df: pd.DataFrame) -> Dict[str, Any]:
    """
    Discover notable relationships and categorical patterns in *df*.

    Returns a dict with ``key_relationships`` (numeric pairs whose
    |correlation| > 0.5), ``patterns`` (dominant categories for
    low-cardinality object columns), and ``anomalies`` (reserved, always
    empty here).

    Fixes vs. previous version:
    - each numeric pair was visited twice ((a, b) and (b, a)), so every
      relationship appeared twice in the output; pairs are now visited once;
    - NaN correlations (e.g. from constant columns) are skipped instead of
      being compared against the threshold.
    """
    relationships: Dict[str, Any] = {
        "key_relationships": [],
        "patterns": [],
        "anomalies": []
    }

    # Examine each unordered pair of numeric columns exactly once.
    numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
    for i, col1 in enumerate(numeric_cols):
        for col2 in numeric_cols[i + 1:]:
            correlation = df[col1].corr(df[col2])
            if pd.notna(correlation) and abs(correlation) > 0.5:  # moderate-to-strong
                relationships["key_relationships"].append({
                    "variable1": col1,
                    "variable2": col2,
                    "relationship_strength": correlation,
                    "relationship_type": "positive" if correlation > 0 else "negative"
                })

    # Dominance patterns for categorical columns with a manageable cardinality.
    categorical_cols = df.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        if df[col].nunique() < 20:
            value_counts = df[col].value_counts()
            if len(value_counts) > 0:
                relationships["patterns"].append({
                    "column": col,
                    "pattern_type": "categorical_distribution",
                    "dominant_category": value_counts.index[0],
                    "dominance_percentage": (value_counts.iloc[0] / len(df)) * 100
                })

    return relationships
497
+
498
+
499
def analyze_temporal_patterns(df: pd.DataFrame) -> Dict[str, Any]:
    """
    Analyze temporal structure when date/time-like columns are present.

    Returns ``{"has_temporal_data": False}`` when nothing date-like is
    found; otherwise adds the detected date columns plus a date range,
    span in days, and sampling frequency for the first one.

    Fixes vs. previous version:
    - the bare ``except`` is narrowed to the conversion errors pandas
      actually raises, so unrelated failures are no longer swallowed;
    - the full-frame ``df.copy()`` is replaced by a single converted
      Series, avoiding a redundant copy and a second conversion.
    """
    temporal_insights: Dict[str, Any] = {"has_temporal_data": False}

    # Only probe columns that already look date-like by dtype or name.
    date_columns = []
    for col in df.columns:
        if df[col].dtype == 'datetime64[ns]' or 'date' in col.lower() or 'time' in col.lower():
            try:
                pd.to_datetime(df[col])
            except (ValueError, TypeError):
                continue
            date_columns.append(col)

    if date_columns:
        temporal_insights["has_temporal_data"] = True
        temporal_insights["date_columns"] = date_columns

        # Summarize the first detected date column; convert once, without
        # mutating the caller's frame.
        primary_date_col = date_columns[0]
        parsed = pd.to_datetime(df[primary_date_col])

        temporal_insights["temporal_analysis"] = {
            "date_range": {
                "start": parsed.min().strftime('%Y-%m-%d'),
                "end": parsed.max().strftime('%Y-%m-%d')
            },
            "time_span_days": (parsed.max() - parsed.min()).days,
            "frequency": detect_temporal_frequency(parsed)
        }

    return temporal_insights
534
+
535
+
536
def assess_data_quality(df: pd.DataFrame) -> Dict[str, Any]:
    """
    Assess data quality: completeness, duplicates, and mixed-type columns.

    Returns a dict with ``overall_quality_score`` (0-100),
    ``quality_issues`` (human-readable findings), ``data_completeness``
    (non-null percentage) and ``data_consistency`` (reserved, empty).

    Fixes vs. previous version:
    - an empty frame caused a ZeroDivisionError in the completeness ratio;
    - ``.str.isnumeric()`` produced NaN for non-string cells, and NaN is
      truthy under ``.any()``, yielding false "inconsistent type" flags;
      values are now NaN-dropped and stringified before the check.
    """
    quality_metrics: Dict[str, Any] = {
        "overall_quality_score": 0,
        "quality_issues": [],
        "data_completeness": 0,
        "data_consistency": {}
    }

    # Completeness: percentage of non-null cells; an empty frame counts as
    # fully complete rather than dividing by zero.
    total_cells = len(df) * len(df.columns)
    completeness = (1 - df.isnull().sum().sum() / total_cells) * 100 if total_cells else 100.0
    quality_metrics["data_completeness"] = completeness

    if completeness < 95:
        quality_metrics["quality_issues"].append("Missing data detected")

    # Exact duplicate rows.
    duplicate_rows = df.duplicated().sum()
    if duplicate_rows > 0:
        quality_metrics["quality_issues"].append(f"{duplicate_rows} duplicate rows found")

    # Mixed numeric/text content in an object column suggests a typing problem.
    for col in df.columns:
        if df[col].dtype == 'object':
            values = df[col].dropna().astype(str)
            numeric_mask = values.str.isnumeric()
            if numeric_mask.any() and not numeric_mask.all():
                quality_metrics["quality_issues"].append(f"Inconsistent data types in {col}")

    # Score: start at 100, penalize missingness and each discrete issue.
    base_score = 100
    base_score -= (100 - completeness) * 0.5
    base_score -= len(quality_metrics["quality_issues"]) * 5
    quality_metrics["overall_quality_score"] = max(0, base_score)

    return quality_metrics
573
+
574
+
575
def infer_business_context(df: pd.DataFrame, domain_analysis: Dict[str, Any]) -> Dict[str, Any]:
    """
    Map the classified domain to typical metrics, analyses and stakeholders.

    Unknown domains receive a generic business-context entry. The DataFrame
    parameter is kept for signature compatibility; only the domain label is
    consulted.
    """
    generic_context = {
        "key_metrics": ["Performance indicators", "Trends", "Patterns"],
        "typical_analyses": ["Descriptive analysis", "Trend identification", "Pattern recognition"],
        "stakeholders": ["Business stakeholders", "Decision makers"]
    }

    context_mapping = {
        "financial": {
            "key_metrics": ["Revenue", "Profit", "Cost", "ROI"],
            "typical_analyses": ["Trend analysis", "Profitability analysis", "Budget vs actual"],
            "stakeholders": ["CFO", "Finance team", "Executive leadership"]
        },
        "survey": {
            "key_metrics": ["Satisfaction scores", "Response rates", "Sentiment"],
            "typical_analyses": ["Satisfaction analysis", "Demographic breakdown", "Correlation analysis"],
            "stakeholders": ["Marketing team", "Product managers", "Customer success"]
        },
        "scientific": {
            "key_metrics": ["Statistical significance", "Effect size", "Confidence intervals"],
            "typical_analyses": ["Hypothesis testing", "Regression analysis", "Experimental validation"],
            "stakeholders": ["Researchers", "Scientists", "Academic community"]
        },
        "marketing": {
            "key_metrics": ["Conversion rates", "Customer acquisition cost", "Campaign ROI"],
            "typical_analyses": ["Campaign performance", "Customer segmentation", "Attribution analysis"],
            "stakeholders": ["Marketing team", "CMO", "Sales team"]
        }
    }

    return context_mapping.get(domain_analysis["primary_domain"], generic_context)
609
+
610
+
611
def generate_intelligent_report(llm, autonomous_context: Dict[str, Any]) -> str:
    """
    Generate a domain-appropriate Markdown report from the autonomous
    analysis context.

    Fixes vs. previous version: the prompt was built but the LLM was never
    invoked and the function implicitly returned ``None``, so every caller
    crashed downstream and fell back to the legacy report path. The prompt
    is now sent to the model and its Markdown content returned; if the call
    fails, a basic template report is produced instead.
    """
    enhanced_prompt = f"""
You are a world-class data analyst who has just been handed this dataset to analyze. Look at the data characteristics and tell me the most compelling story you can find.

**DATASET CONTEXT:**
{json.dumps(autonomous_context, indent=2)}

**YOUR MISSION:**
Analyze this data like you would if a CEO walked into your office and said "I need to understand what this data is telling us." Write a report that would make them say "This is exactly what I needed to know."

**GUIDELINES:**
- Don't follow a rigid structure - let the data guide your narrative
- Choose your own headings and sections based on what the data reveals
- Write like you're presenting findings to someone who needs to make important decisions
- Include specific numbers and insights that matter
- Insert chart recommendations like: `<generate_chart: "chart_type | description">`
- Valid chart types: bar, pie, line, scatter, hist, box, heatmap
- Only recommend charts that truly support your narrative

**FORGET TEMPLATES - TELL THE STORY:**
What's the most interesting, important, or surprising thing this data reveals? Start there and build your entire report around that central insight. Make it compelling, make it actionable, make it memorable.

Be the data analyst who gets promoted because they don't just present data - they reveal insights that drive business decisions.
"""

    try:
        # The missing call: actually ask the model and return its Markdown.
        return llm.invoke(enhanced_prompt).content
    except Exception as e:
        logging.error(f"Intelligent report generation failed: {str(e)}")
        return generate_fallback_report(autonomous_context)
642
+
643
+
644
def generate_autonomous_charts(llm, df: pd.DataFrame, report_md: str, uid: str, project_id: str, bucket) -> Dict[str, str]:
    """
    Generates charts autonomously based on the report content and data characteristics.

    For every `<generate_chart: ...>` tag found in *report_md* (capped at
    MAX_CHARTS), renders a PNG via ChartGenerator/execute_chart_spec,
    uploads it to Cloud Storage under the project's charts folder, and maps
    the Firebase-safe description key to the blob's public URL.

    Returns:
        Dict mapping sanitized chart description -> public image URL.

    NOTE(review): the `report_md = report_md.replace(...)` rewrites below
    only rebind the local parameter — the caller never receives the updated
    markdown, so the safe-key substitution is lost upstream. Confirm whether
    the caller should consume a returned markdown as well.
    """
    # Extract chart descriptions from the enhanced report
    chart_descs = extract_chart_tags(report_md)[:MAX_CHARTS]
    chart_urls = {}

    if not chart_descs:
        # If no charts specified, generate intelligent defaults
        chart_descs = generate_intelligent_chart_suggestions(df, llm)

    chart_generator = ChartGenerator(llm, df)

    for desc in chart_descs:
        try:
            # Create a safe key for Firebase
            safe_desc = sanitize_for_firebase_key(desc)

            # Replace chart tags in markdown (both quoted and unquoted forms)
            report_md = report_md.replace(f'<generate_chart: "{desc}">', f'<generate_chart: "{safe_desc}">')
            report_md = report_md.replace(f'<generate_chart: {desc}>', f'<generate_chart: "{safe_desc}">')

            # Generate chart; the temp file is always removed in the finally
            # block, even when spec generation or upload fails.
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
                img_path = Path(temp_file.name)
                try:
                    chart_spec = chart_generator.generate_chart_spec(desc)
                    if execute_chart_spec(chart_spec, df, img_path):
                        blob_name = f"sozo_projects/{uid}/{project_id}/charts/{uuid.uuid4().hex}.png"
                        blob = bucket.blob(blob_name)
                        blob.upload_from_filename(str(img_path))

                        chart_urls[safe_desc] = blob.public_url
                        logging.info(f"Generated autonomous chart: {safe_desc}")
                finally:
                    if os.path.exists(img_path):
                        os.unlink(img_path)

        except Exception as e:
            # Best-effort: a single failed chart must not abort the report.
            logging.error(f"Failed to generate chart '{desc}': {str(e)}")
            continue

    return chart_urls
688
+
689
+
690
def generate_intelligent_chart_suggestions(df: pd.DataFrame, llm) -> List[str]:
    """
    Propose default chart descriptions when a report contains no chart tags.

    Suggestions are driven purely by data shape: a line chart for temporal
    data, a histogram for the first numeric column, a scatter when several
    numeric columns exist, and a bar chart for the first categorical column.
    The *llm* parameter is kept for signature compatibility and is unused.
    """
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    categorical_columns = df.select_dtypes(include=['object']).columns

    picks: List[str] = []

    if detect_time_series(df):
        picks.append("line | Time series trend analysis | Show temporal patterns")

    if len(numeric_columns) > 0:
        picks.append(f"hist | Distribution of {numeric_columns[0]} | Understand data distribution")

    if len(numeric_columns) > 1:
        picks.append("scatter | Correlation analysis | Identify relationships between variables")

    if len(categorical_columns) > 0:
        picks.append(f"bar | {categorical_columns[0]} breakdown | Show categorical distribution")

    # Respect the global cap on charts per report.
    return picks[:MAX_CHARTS]
718
+
719
+
720
+ # Helper functions (preserve existing functionality)
721
def detect_time_series(df: pd.DataFrame) -> bool:
    """
    Heuristically detect whether *df* contains time-series data.

    A column counts as temporal when its name mentions date/time, it already
    has a datetime dtype, or its string values parse as dates.

    Fixes vs. previous version:
    - ``pd.to_datetime`` was attempted on EVERY column, and numeric columns
      always parse (interpreted as epoch offsets), so almost any dataset was
      flagged as a time series; parsing is now restricted to object
      (string-like) columns;
    - the bare ``except`` is narrowed to the errors pandas actually raises.
    """
    for col in df.columns:
        if 'date' in col.lower() or 'time' in col.lower():
            return True
        if df[col].dtype == 'datetime64[ns]':
            return True
        if df[col].dtype == 'object':
            try:
                pd.to_datetime(df[col])
                return True
            except (ValueError, TypeError):
                continue
    return False
732
+
733
+
734
def detect_transactional_data(df: pd.DataFrame) -> bool:
    """Return True when any column name contains a transaction-style keyword."""
    transaction_indicators = ['transaction', 'payment', 'order', 'invoice', 'amount', 'quantity']
    for col in df.columns:
        lowered = col.lower()
        if any(indicator in lowered for indicator in transaction_indicators):
            return True
    return False
739
+
740
+
741
def detect_experimental_data(df: pd.DataFrame) -> bool:
    """Return True when any column name contains an experiment-style keyword."""
    experimental_indicators = ['test', 'experiment', 'trial', 'group', 'treatment', 'control']
    for col in df.columns:
        lowered = col.lower()
        if any(indicator in lowered for indicator in experimental_indicators):
            return True
    return False
746
+
747
+
748
def detect_temporal_frequency(date_series: pd.Series) -> str:
    """
    Classify the sampling cadence of a datetime series.

    Uses the median gap between sorted timestamps: <=1 day -> "daily",
    <=7 days -> "weekly", <=31 days -> "monthly", otherwise "irregular".
    Fewer than two points yields "insufficient_data".
    """
    if len(date_series) < 2:
        return "insufficient_data"

    # Median inter-sample gap is robust to a few missing periods.
    gaps = date_series.sort_values().diff().dropna()
    typical_gap = gaps.median()

    thresholds = (
        ("daily", pd.Timedelta(days=1)),
        ("weekly", pd.Timedelta(days=7)),
        ("monthly", pd.Timedelta(days=31)),
    )
    for label, limit in thresholds:
        if typical_gap <= limit:
            return label
    return "irregular"
765
+
766
+
767
def determine_analysis_complexity(df: pd.DataFrame, domain_analysis: Dict[str, Any]) -> str:
    """
    Rate the analytical complexity of *df* as "low", "medium" or "high".

    One point each for: >10000 rows, >20 columns, >5 numeric columns,
    >5 categorical columns, and a scientific/financial domain.
    3+ points -> high, 2 -> medium, otherwise low.
    """
    factors = [
        len(df) > 10000,
        len(df.columns) > 20,
        len(df.select_dtypes(include=[np.number]).columns) > 5,
        len(df.select_dtypes(include=['object']).columns) > 5,
        domain_analysis["primary_domain"] in ("scientific", "financial"),
    ]
    score = sum(factors)

    if score >= 3:
        return "high"
    if score >= 2:
        return "medium"
    return "low"
793
+
794
+
795
+ def generate_original_report(df: pd.DataFrame, llm, ctx: str, uid: str, project_id: str, bucket) -> Dict[str, str]:
796
+ """
797
+ Fallback to original report generation logic if enhanced version fails.
798
+ """
799
+ logging.info("Using fallback report generation")
800
+
801
+ # Original logic preserved
802
  ctx_dict = {"shape": df.shape, "columns": list(df.columns), "user_ctx": ctx}
803
  enhanced_ctx = enhance_data_context(df, ctx_dict)
804
+
805
  report_prompt = f"""
806
  You are a senior data analyst and business intelligence expert. Analyze the provided dataset and write a comprehensive executive-level Markdown report.
807
  **Dataset Analysis Context:** {json.dumps(enhanced_ctx, indent=2)}
 
812
  Valid chart types: bar, pie, line, scatter, hist.
813
  Generate insights that would be valuable to C-level executives.
814
  """
815
+
816
  md = llm.invoke(report_prompt).content
817
  chart_descs = extract_chart_tags(md)[:MAX_CHARTS]
818
  chart_urls = {}
819
  chart_generator = ChartGenerator(llm, df)
820
 
821
  for desc in chart_descs:
 
822
  safe_desc = sanitize_for_firebase_key(desc)
 
 
823
  md = md.replace(f'<generate_chart: "{desc}">', f'<generate_chart: "{safe_desc}">')
824
+ md = md.replace(f'<generate_chart: {desc}>', f'<generate_chart: "{safe_desc}">')
825
 
826
  with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
827
  img_path = Path(temp_file.name)
828
  try:
829
+ chart_spec = chart_generator.generate_chart_spec(desc)
830
  if execute_chart_spec(chart_spec, df, img_path):
831
  blob_name = f"sozo_projects/{uid}/{project_id}/charts/{uuid.uuid4().hex}.png"
832
  blob = bucket.blob(blob_name)
833
  blob.upload_from_filename(str(img_path))
 
 
834
  chart_urls[safe_desc] = blob.public_url
 
835
  finally:
836
  if os.path.exists(img_path):
837
  os.unlink(img_path)
838
 
839
  return {"raw_md": md, "chartUrls": chart_urls}
840
 
841
+
842
def generate_fallback_report(autonomous_context: Dict[str, Any]) -> str:
    """
    Produce a minimal templated Markdown report when the intelligent
    generator is unavailable.

    Reads only the dataset shape and the classified primary domain from
    *autonomous_context*; includes one generic chart tag so downstream
    chart handling still has something to render.
    """
    shape = autonomous_context["basic_info"]["shape"]
    primary_domain = autonomous_context["domain"]["primary_domain"]

    return f"""
# What This Data Reveals

Looking at this {primary_domain} dataset with {shape[0]} records, there are several key insights worth highlighting.

## The Numbers Tell a Story

This dataset contains {shape[1]} different variables, suggesting a comprehensive view of the underlying processes or behaviors being measured.

<generate_chart: "bar | Data overview showing key metrics">

## What You Should Know

The data structure and patterns suggest this is worth deeper investigation. The variety of data types and relationships indicate multiple analytical opportunities.

## Next Steps

Based on this initial analysis, I recommend diving deeper into the specific patterns and relationships within the data to unlock more actionable insights.

*Note: This is a simplified analysis. Enhanced storytelling temporarily unavailable.*
"""
870
+
871
  def generate_single_chart(df: pd.DataFrame, description: str, uid: str, project_id: str, bucket):
872
  logging.info(f"Generating single chart '{description}' for project {project_id}")
873
  llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=API_KEY, temperature=0.1)